Repository: pingcap/tidb-ansible Branch: master Commit: 61f5ec376c82 Files: 379 Total size: 3.7 MB Directory structure: gitextract_3y37qitl/ ├── .gitignore ├── LICENSE ├── README.md ├── ansible.cfg ├── bootstrap.yml ├── callback_plugins/ │ ├── help.py │ └── yaml.py ├── clean_log_cron.yml ├── cloud/ │ └── aws-ansible/ │ ├── aws_bootstrap.yml │ ├── aws_inventory_file_generate.yml │ ├── aws_prepare.yml │ ├── aws_teardown.yml │ ├── ec2.ini │ ├── ec2.py │ ├── files/ │ │ └── sources.list │ ├── roles/ │ │ └── aws/ │ │ └── tasks/ │ │ └── main.yml │ ├── templates/ │ │ └── aws.inventory.ini.j2 │ └── vars.yml ├── collect_diagnosis.yml ├── common_tasks/ │ ├── add_evict_leader_scheduler.yml │ ├── create_grafana_api_keys.yml │ ├── get_pd_leader.yml │ ├── get_pd_leader_tls.yml │ ├── get_pd_name.yml │ ├── get_pd_name_tls.yml │ ├── get_pd_tikv_addr.yml │ ├── get_store_id.yml │ ├── get_store_id_tls.yml │ ├── remove_evict_leader_scheduler.yml │ └── transfer_pd_leader.yml ├── conf/ │ ├── alertmanager.yml │ ├── drainer.toml │ ├── pd.yml │ ├── pump.yml │ ├── spark-defaults.yml │ ├── spark-env.yml │ ├── ssl/ │ │ ├── ca-config.json │ │ └── ca-csr.json │ ├── tidb-lightning.yml │ ├── tidb.yml │ ├── tiflash-learner.yml │ ├── tiflash.yml │ ├── tikv-importer.yml │ └── tikv.yml ├── create_users.yml ├── deploy.yml ├── deploy_drainer.yml ├── deploy_ntp.yml ├── excessive_rolling_update.yml ├── filter_plugins/ │ └── tags.py ├── graceful_stop.yml ├── group_vars/ │ ├── alertmanager_servers.yml │ ├── all.yml │ ├── drainer_servers.yml │ ├── grafana_servers.yml │ ├── importer_server.yml │ ├── lightning_server.yml │ ├── monitored_servers.yml │ ├── monitoring_servers.yml │ ├── pd_servers.yml │ ├── pump_servers.yml │ ├── tidb_servers.yml │ ├── tiflash_servers.yml │ └── tikv_servers.yml ├── hosts.ini ├── inventory.ini ├── library/ │ ├── coreos_facts │ ├── docker_facts │ └── wait_for_pid.py ├── local_prepare.yml ├── log/ │ └── .gitignore ├── migrate_monitor.yml ├── requirements.txt ├── roles/ │ ├── 
alertmanager/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ └── templates/ │ │ ├── run_alertmanager_binary.sh.j2 │ │ └── run_alertmanager_docker.sh.j2 │ ├── blackbox_exporter/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ └── templates/ │ │ ├── blackbox.yml.j2 │ │ ├── run_blackbox_exporter_binary.sh.j2 │ │ └── run_blackbox_exporter_docker.sh.j2 │ ├── bootstrap/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ └── tasks/ │ │ ├── main.yml │ │ └── root_tasks.yml │ ├── check_config_dynamic/ │ │ └── tasks/ │ │ └── main.yml │ ├── check_config_pd/ │ │ └── tasks/ │ │ └── main.yml │ ├── check_config_static/ │ │ └── tasks/ │ │ └── main.yml │ ├── check_config_tidb/ │ │ └── tasks/ │ │ └── main.yml │ ├── check_config_tikv/ │ │ └── tasks/ │ │ └── main.yml │ ├── check_system_dynamic/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ └── tasks/ │ │ └── main.yml │ ├── check_system_optional/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ └── tasks/ │ │ └── main.yml │ ├── check_system_static/ │ │ └── tasks/ │ │ └── main.yml │ ├── clean_log_pd/ │ │ └── tasks/ │ │ ├── add_cron.yml │ │ ├── del_cron.yml │ │ └── main.yml │ ├── clean_log_tidb/ │ │ └── tasks/ │ │ ├── add_cron.yml │ │ ├── del_cron.yml │ │ └── main.yml │ ├── clean_log_tikv/ │ │ └── tasks/ │ │ ├── add_cron.yml │ │ ├── del_cron.yml │ │ └── main.yml │ ├── collect_diagnosis/ │ │ ├── meta/ │ │ │ └── main.yml │ │ └── tasks/ │ │ └── main.yml │ ├── collector_host/ │ │ └── tasks/ │ │ ├── collect_log.yml │ │ └── main.yml │ ├── collector_pd/ │ │ └── tasks/ │ │ ├── collect_config.yml │ │ ├── collect_log.yml │ │ └── main.yml │ ├── collector_prometheus/ │ │ └── tasks/ │ │ └── 
main.yml │ ├── collector_pump/ │ │ └── tasks/ │ │ ├── collect_log.yml │ │ └── main.yml │ ├── collector_tidb/ │ │ └── tasks/ │ │ ├── collect_config.yml │ │ ├── collect_log.yml │ │ └── main.yml │ ├── collector_tikv/ │ │ └── tasks/ │ │ ├── collect_config.yml │ │ ├── collect_log.yml │ │ └── main.yml │ ├── common_dir/ │ │ └── tasks/ │ │ └── main.yml │ ├── dashboard_topo/ │ │ ├── tasks/ │ │ │ └── main.yml │ │ └── templates/ │ │ └── init_dashboard_topo.sh.j2 │ ├── drainer/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── files/ │ │ │ └── make-ssl.sh │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── check_certs.yml │ │ │ ├── gen_certs.yml │ │ │ ├── install_certs.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ ├── templates/ │ │ │ └── run_drainer_binary.sh.j2 │ │ └── vars/ │ │ └── default.yml │ ├── firewalld/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── handlers/ │ │ │ └── main.yml │ │ └── tasks/ │ │ └── main.yml │ ├── grafana/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ ├── systemd_deployment.yml │ │ │ └── tasks.yml │ │ └── templates/ │ │ ├── data_source.json.j2 │ │ ├── grafana.ini.j2 │ │ ├── run_grafana_binary.sh.j2 │ │ └── run_grafana_docker.sh.j2 │ ├── kafka_exporter/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ └── templates/ │ │ └── run_kafka_exporter_binary.sh.j2 │ ├── local/ │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── docker_deployment.yml │ │ │ └── main.yml │ │ └── templates/ │ │ ├── binary_packages.yml.j2 │ │ ├── common_packages.yml.j2 │ │ └── docker_packages.yml.j2 │ ├── machine_benchmark/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ └── tasks/ │ │ 
├── fio_randread.yml │ │ ├── fio_randread_write.yml │ │ ├── fio_randread_write_latency.yml │ │ └── main.yml │ ├── node_exporter/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ └── templates/ │ │ ├── run_node_exporter_binary.sh.j2 │ │ └── run_node_exporter_docker.sh.j2 │ ├── ops/ │ │ ├── tasks/ │ │ │ └── main.yml │ │ └── templates/ │ │ ├── check_tikv.sh.j2 │ │ └── pd-ctl.sh.j2 │ ├── pd/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── files/ │ │ │ └── make-ssl.sh │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── check_certs.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── gen_certs.yml │ │ │ ├── install_certs.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ ├── templates/ │ │ │ ├── pd.toml.j2 │ │ │ ├── run_pd_binary.sh.j2 │ │ │ └── run_pd_docker.sh.j2 │ │ └── vars/ │ │ └── default.yml │ ├── perf_tools/ │ │ └── tasks/ │ │ └── main.yml │ ├── pre-ansible/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ └── tasks/ │ │ ├── coreos.yml │ │ ├── main.yml │ │ └── root_tasks.yml │ ├── prometheus/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── files/ │ │ │ ├── binlog.rules.yml │ │ │ ├── blacker.rules.yml │ │ │ ├── bypass.rules.yml │ │ │ ├── kafka.rules.yml │ │ │ ├── lightning.rules.yml │ │ │ ├── node.rules.yml │ │ │ ├── pd.rules.yml │ │ │ ├── tidb.rules.yml │ │ │ ├── tiflash.rules.yml │ │ │ ├── tikv.accelerate.rules.yml │ │ │ └── tikv.rules.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ └── templates/ │ │ ├── prometheus.yml.j2 │ │ ├── run_prometheus_binary.sh.j2 │ │ └── run_prometheus_docker.sh.j2 │ ├── pump/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── files/ │ 
│ │ └── make-ssl.sh │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── check_certs.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── gen_certs.yml │ │ │ ├── install_certs.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ ├── templates/ │ │ │ ├── pump.toml.j2 │ │ │ ├── run_pump_binary.sh.j2 │ │ │ └── run_pump_docker.sh.j2 │ │ └── vars/ │ │ └── default.yml │ ├── pushgateway/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ └── templates/ │ │ ├── run_pushgateway_binary.sh.j2 │ │ └── run_pushgateway_docker.sh.j2 │ ├── supervise/ │ │ ├── tasks/ │ │ │ └── main.yml │ │ └── templates/ │ │ ├── start_role.sh.j2 │ │ └── stop_role.sh.j2 │ ├── systemd/ │ │ ├── tasks/ │ │ │ └── main.yml │ │ └── templates/ │ │ ├── start_role.sh.j2 │ │ ├── stop_role.sh.j2 │ │ ├── systemd_binary.service.j2 │ │ └── systemd_docker.service.j2 │ ├── tidb/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── files/ │ │ │ └── make-ssl.sh │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── check_certs.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── gen_certs.yml │ │ │ ├── install_certs.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ ├── templates/ │ │ │ ├── run_tidb_binary.sh.j2 │ │ │ ├── run_tidb_docker.sh.j2 │ │ │ └── tidb.toml.j2 │ │ └── vars/ │ │ └── default.yml │ ├── tidb_lightning/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ └── main.yml │ │ ├── templates/ │ │ │ ├── start_lightning_binary.sh.j2 │ │ │ ├── stop_lightning_binary.sh.j2 │ │ │ ├── tidb-lightning.toml.j2 │ │ │ └── tidb_lightning_ctl_binary.sh.j2 │ │ └── vars/ │ │ └── tidb-lightning.yml │ ├── tiflash/ │ │ ├── defaults/ │ │ 
│ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ ├── templates/ │ │ │ ├── run_tiflash_binary.sh.j2 │ │ │ ├── tiflash.toml.j2 │ │ │ └── tiflash_learner.toml.j2 │ │ └── vars/ │ │ ├── tiflash-learner.yml │ │ └── tiflash.yml │ ├── tikv/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── files/ │ │ │ └── make-ssl.sh │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ ├── check_certs.yml │ │ │ ├── check_filesystem.yml │ │ │ ├── docker_deployment.yml │ │ │ ├── gen_certs.yml │ │ │ ├── install_certs.yml │ │ │ ├── main.yml │ │ │ ├── supervise_deployment.yml │ │ │ └── systemd_deployment.yml │ │ ├── templates/ │ │ │ ├── run_tikv_binary.sh.j2 │ │ │ ├── run_tikv_docker.sh.j2 │ │ │ └── tikv.toml.j2 │ │ └── vars/ │ │ └── default.yml │ ├── tikv_importer/ │ │ ├── defaults/ │ │ │ └── main.yml │ │ ├── meta/ │ │ │ └── main.yml │ │ ├── tasks/ │ │ │ ├── binary_deployment.yml │ │ │ └── main.yml │ │ ├── templates/ │ │ │ ├── start_importer_binary.sh.j2 │ │ │ ├── stop_importer_binary.sh.j2 │ │ │ └── tikv-importer.toml.j2 │ │ └── vars/ │ │ └── tikv-importer.yml │ └── tispark/ │ ├── tasks/ │ │ └── main.yml │ └── templates/ │ ├── log4j.properties.j2 │ ├── spark-defaults.conf.j2 │ ├── spark-env.sh.j2 │ └── start-slave.sh.j2 ├── rolling_update.yml ├── rolling_update_monitor.yml ├── scripts/ │ ├── binlog.json │ ├── blackbox_exporter.json │ ├── br.json │ ├── check/ │ │ ├── check_cpufreq.py │ │ ├── epoll_chk.cc │ │ ├── epollexclusive-amd64 │ │ ├── epollexclusive-arm64 │ │ └── parse_fio_output.py │ ├── clsrun.sh │ ├── dashboard_topo.py │ ├── disk_performance.json │ ├── funcslower │ ├── grafana-config-copy.py │ ├── grafana_pdf.py │ ├── inventory_check.py │ ├── iosnoop │ ├── kafka.json │ ├── lightning.json │ ├── loader.json │ ├── metrics-delete.py │ ├── montidb.sh │ ├── node.json │ ├── overview.json │ ├── pd.json │ ├── pdn.json │ ├── 
performance_read.json │ ├── performance_write.json │ ├── reparo.json │ ├── syncer.json │ ├── table-regions-statistic.py │ ├── table-regions.py │ ├── tidb.json │ ├── tidb_summary.json │ ├── tiflash_proxy_details.json │ ├── tiflash_proxy_summary.json │ ├── tiflash_summary.json │ ├── tikv_details.json │ ├── tikv_raw.json │ ├── tikv_summary.json │ └── tikv_trouble_shooting.json ├── start.yml ├── start_drainer.yml ├── start_spark.yml ├── stop.yml ├── stop_drainer.yml ├── stop_spark.yml ├── templates/ │ └── grafana.dest.json.j2 ├── unsafe_cleanup.yml ├── unsafe_cleanup_container.yml └── unsafe_cleanup_data.yml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .gitignore ================================================ retry_files/ downloads/ resources/ fact_files/ conf/keys scripts/dests.json .vagrant/ *.retry *.pyc .vscode .DS_Store ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 
"You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. 
For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. 
You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. 
In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "{}" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. 
Copyright {} Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ================================================ FILE: README.md ================================================ ## :warning: End of project :warning: [![development](https://img.shields.io/badge/development-halted-red.svg)](https://github.com/pingcap/tidb-ansible/issues/1365) **Ansible Playbook for TiDB is superseded by [TiUP](https://tiup.io/), a powerful tool to manage a TiDB cluster.** And this project [has ended](https://github.com/pingcap/tidb-ansible/issues/1365). All development/maintenance activities have halted. As it is free software, people are free and welcome to fork and develop the codebase on their own. However, to avoid any confusion, the original repository is archived and we recommend any further fork/development to proceed with an explicit rename and rebranding first. We encourage all interested parties to mirror any relevant bits as we can't actively guarantee their existence in the future. # Ansible Playbook for TiDB ## Overview Ansible is an IT automation tool. It can configure systems, deploy software, and orchestrate more advanced IT tasks such as continuous deployments or zero downtime rolling updates. TiDB-Ansible is a TiDB cluster deployment tool developed by PingCAP, based on Ansible playbook. TiDB-Ansible enables you to quickly deploy a new TiDB cluster which includes PD, TiDB, TiKV, and the cluster monitoring modules. 
You can use the TiDB-Ansible configuration file to set up the cluster topology, completing all operation tasks with one click, including: - Initializing the system, including creating the user for deployment, setting up the hostname, etc. - Deploying the components - Rolling update, including module survival detection - Cleaning data - Cleaning the environment - Configuring monitoring modules ## Tutorial - [English](https://docs.pingcap.com/tidb/v3.0/online-deployment-using-ansible) - [简体中文](https://docs.pingcap.com/zh/tidb/v3.0/online-deployment-using-ansible) ## License TiDB-Ansible is under the Apache 2.0 license. ================================================ FILE: ansible.cfg ================================================ [defaults] ## Customize this! inventory = inventory.ini transport = ssh # disable SSH key host checking host_key_checking = False # gathering = smart gathering = explicit fact_caching = jsonfile fact_caching_connection = fact_files retry_files_save_path = retry_files #remote_tmp = /tmp/ansible # for slow connections timeout = 10 gather_subset = network,hardware # if ssh port is not 22 #remote_port = 22 # for fun # cow_selection = random stdout_callback = yaml # log information about executions at the designated location log_path = log/ansible.log deprecation_warnings = False callback_whitelist = help [ssh_connection] ## AWS key connection # ssh_args = -i aws.key -C -o ControlMaster=auto -o ControlPersist=60s ## Jumper host connection # ssh_args = -C -o ControlMaster=auto -o ControlPersist=60s -o ProxyCommand="ssh user@host -p 22 nc %h %p" ## Default # ssh_args = -C -o ControlMaster=auto -o ControlPersist=60s ## Use custom ssh config file # ssh_args = -F ssh_config #scp_if_ssh = True # close when using a jumper host, or have TTY errors # Ubuntu is OK, while CentOS may cause errors # pipelining = True ================================================ FILE: bootstrap.yml ================================================ --- # Copyright 2016 
PingCAP, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # See the License for the specific language governing permissions and # limitations under the License. # This playbook is intended for one-pass execution - name: initializing deployment target hosts: localhost gather_facts: false roles: - check_config_static - name: check node config hosts: all gather_facts: false become: true roles: - pre-ansible - bootstrap - name: check system hosts: all any_errors_fatal: true roles: - check_system_static - { role: check_system_optional, when: not dev_mode|default(false) } - name: tikv_servers machine benchmark hosts: tikv_servers gather_facts: false roles: - { role: machine_benchmark, when: not dev_mode|default(false) } - name: create ops scripts hosts: localhost connection: local gather_facts: false roles: - ops ================================================ FILE: callback_plugins/help.py ================================================ # Make coding more python3-ish from __future__ import (absolute_import, division, print_function) __metaclass__ = type DOCUMENTATION = ''' callback: help type: notification short_description: print help message version_added: historical description: - This plugin will print help message when tasks fail.
''' import os import io import logging import yaml from ansible.plugins.callback import CallbackBase, strip_internal_keys from ansible.parsing.yaml.dumper import AnsibleDumper from ansible import constants as C FAIL_LOGFILE = os.path.dirname(C.DEFAULT_LOG_PATH) + "/fail.log" class CallbackModule(CallbackBase): CALLBACK_VERSION = 2.0 CALLBACK_TYPE = 'notification' CALLBACK_NAME = 'help' CALLBACK_NEEDS_WHITELIST = True def __init__(self): self._play = None self._last_task_banner = None self._last_task_name = None self._task_type_cache = {} super(CallbackModule, self).__init__() if not os.path.exists(os.path.dirname(C.DEFAULT_LOG_PATH)): os.makedirs(os.path.dirname(C.DEFAULT_LOG_PATH)) self.logger = logging.getLogger('fail') self.logger.setLevel(logging.DEBUG) self.handler = logging.FileHandler(FAIL_LOGFILE) self.logger.addHandler(self.handler) def _format_results(self, result, indent=None, sort_keys=True, keep_invocation=False): # All result keys stating with _ansible_ are internal, so remove them from the result before we output anything. 
abridged_result = strip_internal_keys(result._result) # remove invocation unless specifically wanting it if not keep_invocation and self._display.verbosity < 3 and 'invocation' in abridged_result: del abridged_result['invocation'] # remove diff information from screen output if self._display.verbosity < 3 and 'diff' in abridged_result: del abridged_result['diff'] if 'access_control_allow_headers' in abridged_result: del abridged_result['access_control_allow_headers'] if 'access_control_allow_methods' in abridged_result: del abridged_result['access_control_allow_methods'] if 'access_control_allow_origin' in abridged_result: del abridged_result['access_control_allow_origin'] if 'x_content_type_options' in abridged_result: del abridged_result['x_content_type_options'] # remove exception from screen output if 'exception' in abridged_result: del abridged_result['exception'] dumped = '' dumpd_tile = '[' + str(result._host.name) + ']: Ansible Failed! ==>\n ' # put changed and skipped into a header line if 'changed' in abridged_result: dumped += 'changed=' + str(abridged_result['changed']).lower() + ' ' del abridged_result['changed'] if 'skipped' in abridged_result: dumped += 'skipped=' + str(abridged_result['skipped']).lower() + ' ' del abridged_result['skipped'] # if we already have stdout, we don't need stdout_lines if 'stdout' in abridged_result and 'stdout_lines' in abridged_result: abridged_result['stdout_lines'] = '' if abridged_result: dumped += '\n' dumped += yaml.dump(abridged_result, width=1000, Dumper=AnsibleDumper, default_flow_style=False) # indent by a couple of spaces dumped = '\n '.join(dumped.split('\n')).rstrip() return dumpd_tile + dumped + '\n' def print_help_message(self): self._display.display("Ask TiDB User Group for help:", color=C.COLOR_WARN) self._display.display( "It seems that you have encountered some problem. 
Please describe your operation steps and provide error information as much as possible on https://asktug.com (in Chinese) or https://stackoverflow.com/questions/tagged/tidb (in English). We will do our best to help solve your problem. Thanks. :-)", color=C.COLOR_WARN) def v2_runner_on_failed(self, result, ignore_errors=False): if not ignore_errors: messages = self._format_results(result) self.logger.error(messages) def v2_runner_on_unreachable(self, result): # self.print_help_message() self.logger.error('[%s]: Ansible UNREACHABLE! => changed=%s\n playbook: %s\n %s\n stderr: %s\n', result._host.name, result._result['changed'], self.playbook, result._task, result._result['msg']) def v2_playbook_on_start(self, playbook): self.playbook = playbook._file_name open(FAIL_LOGFILE, 'w').close() def v2_playbook_on_stats(self, stats): if os.path.isfile(FAIL_LOGFILE): count = -1 with open(FAIL_LOGFILE, 'r') as f: for count, line in enumerate(f): pass count += 1 if count > 0: self._display.banner("ERROR MESSAGE SUMMARY") with io.open(FAIL_LOGFILE, 'r', encoding="utf-8") as f: for _, line in enumerate(f): self._display.display(line.strip('\n'), color=C.COLOR_ERROR) self.print_help_message() else: self._display.display("Congrats! All goes well. :-)", color=C.COLOR_OK) ================================================ FILE: callback_plugins/yaml.py ================================================ # (c) 2017 Ansible Project # GNU General Public License v3.0+ (see COPYING or https://www.gnu.org/licenses/gpl-3.0.txt) # Make coding more python3-ish from __future__ import (absolute_import, division, print_function) __metaclass__ = type DOCUMENTATION = ''' callback: yaml type: stdout short_description: yaml-ized Ansible screen output version_added: 2.5 description: - Ansible output that can be quite a bit easier to read than the default JSON formatting. 
extends_documentation_fragment: - default_callback requirements: - set as stdout in configuration ''' import yaml import json import re import string import sys from ansible.plugins.callback import CallbackBase, strip_internal_keys from ansible.plugins.callback.default import CallbackModule as Default from ansible.parsing.yaml.dumper import AnsibleDumper # from http://stackoverflow.com/a/15423007/115478 def should_use_block(value): """Returns true if string should be in block format""" for c in u"\u000a\u000d\u001c\u001d\u001e\u0085\u2028\u2029": if c in value: return True return False def my_represent_scalar(self, tag, value, style=None): """Uses block style for multi-line strings""" if style is None: if should_use_block(value): style = '|' # we care more about readable than accuracy, so... # ...no trailing space value = value.rstrip() # ...and non-printable characters value = ''.join(x for x in value if x in string.printable) # ...tabs prevent blocks from expanding value = value.expandtabs() # ...and odd bits of whitespace value = re.sub(r'[\x0b\x0c\r]', '', value) # ...as does trailing space value = re.sub(r' +\n', '\n', value) else: style = self.default_style node = yaml.representer.ScalarNode(tag, value, style=style) if self.alias_key is not None: self.represented_objects[self.alias_key] = node return node class CallbackModule(Default): """ Variation of the Default output which uses nicely readable YAML instead of JSON for printing results. 
""" CALLBACK_VERSION = 2.0 CALLBACK_TYPE = 'stdout' CALLBACK_NAME = 'yaml' def __init__(self): super(CallbackModule, self).__init__() yaml.representer.BaseRepresenter.represent_scalar = my_represent_scalar def _dump_results(self, result, indent=None, sort_keys=True, keep_invocation=False): if result.get('_ansible_no_log', False): return json.dumps(dict(censored="the output has been hidden due to the fact that 'no_log: true' was specified for this result")) # All result keys stating with _ansible_ are internal, so remove them from the result before we output anything. abridged_result = strip_internal_keys(result) # remove invocation unless specifically wanting it if not keep_invocation and self._display.verbosity < 3 and 'invocation' in result: del abridged_result['invocation'] # remove diff information from screen output if self._display.verbosity < 3 and 'diff' in result: del abridged_result['diff'] # remove exception from screen output if 'exception' in abridged_result: del abridged_result['exception'] dumped = '' # put changed and skipped into a header line if 'changed' in abridged_result: dumped += 'changed=' + str(abridged_result['changed']).lower() + ' ' del abridged_result['changed'] if 'skipped' in abridged_result: dumped += 'skipped=' + str(abridged_result['skipped']).lower() + ' ' del abridged_result['skipped'] # if we already have stdout, we don't need stdout_lines if 'stdout' in abridged_result and 'stdout_lines' in abridged_result: abridged_result['stdout_lines'] = '' if abridged_result: dumped += '\n' dumped += yaml.dump(abridged_result, width=1000, Dumper=AnsibleDumper, default_flow_style=False) # indent by a couple of spaces dumped = '\n '.join(dumped.split('\n')).rstrip() return dumped def v2_runner_on_skipped(self, result): pass def v2_runner_item_on_skipped(self, result): pass ================================================ FILE: clean_log_cron.yml ================================================ --- - hosts: pd_servers tags: - pd roles: - 
clean_log_pd

- hosts: tikv_servers
  tags:
    - tikv
  roles:
    - clean_log_tikv

- hosts: tidb_servers
  tags:
    - tidb
  roles:
    - clean_log_tidb

================================================
FILE: cloud/aws-ansible/aws_bootstrap.yml
================================================
---
# This playbook is intended for one-pass execution

- name: "Group nodes by OS distribution"
  hosts: all
  gather_facts: true
  tasks:
    - name: group hosts by distribution
      group_by:
        key: "{{ ansible_distribution }}-{{ ansible_distribution_version }}"
      changed_when: false

- name: authorized access
  hosts: all
  gather_facts: true
  tags:
    - ssh
  tasks:
    - name: generate rsa key if not exists
      shell: |
        yes n | ssh-keygen -t rsa -N "" -f ~/.ssh/id_rsa || echo ok

    - name: slurp public key
      slurp:
        src: ".ssh/id_rsa.pub"
      register: rsa_pub_result

    - set_fact:
        rsa_pub_key: "{{ rsa_pub_result.content | b64decode | trim }}"

    - name: add to authorized_keys
      authorized_key:
        user: "{{ ansible_user }}"
        key: "{{ hostvars[item].rsa_pub_key }}"
      with_items: "{{ groups.all }}"

    # ssh-keygen -R {{ item }};
    - name: add to known_hosts
      shell: |
        ssh-keygen -R {{ hostvars[item].ansible_hostname }};
        ssh-keygen -R {{ hostvars[item].ansible_default_ipv4.address }};
        ssh-keygen -R {{ hostvars[item].ansible_hostname }},{{ hostvars[item].ansible_default_ipv4.address }};
        (ssh-keyscan -H {{ hostvars[item].ansible_hostname }};
        ssh-keyscan -H {{ hostvars[item].ansible_default_ipv4.address }};
        ssh-keyscan -H {{ hostvars[item].ansible_hostname }},{{ hostvars[item].ansible_default_ipv4.address }}) | uniq >> ~/.ssh/known_hosts
      with_items: "{{ groups.all }}"

- name: do AWS host preparation
  hosts: Ubuntu-14.04
  gather_facts: false
  tasks:
    # Native YAML args: the former key=value form nested double quotes
    # inside line="...", which the argument parser cannot keep intact.
    - name: disable apt key check
      lineinfile:
        dest: /etc/apt/apt.conf.d/99skipkeycheck
        line: 'APT::Get::AllowUnauthenticated "true";'
        create: true
      become: true

    - name: change apt mirror.list
      copy:
        src: sources.list
        dest: /etc/apt/sources.list
        mode: "0644"
      become: true

    - name: add apt-fast to apt sources.list.d
      lineinfile:
        dest: /etc/apt/sources.list.d/saiarcot895-myppa-trusty.list
        line: "deb http://ppa.launchpad.net/saiarcot895/myppa/ubuntu trusty main"
        create: true
      become: true

    - name: install apt-fast
      apt:
        name: apt-fast
        update_cache: true
      become: true

    - name: add docker to apt sources.list.d
      lineinfile:
        dest: /etc/apt/sources.list.d/docker.list
        line: "deb https://mirrors.tuna.tsinghua.edu.cn/docker/apt/repo ubuntu-trusty main"
        create: true
      become: true

    # apt operations need root, like every other apt task in this play
    - name: update apt cache
      shell: apt-fast -y update
      become: true

    # --skip-tags docker
    - name: install docker
      tags:
        - docker
      shell: apt-fast -y install docker-engine
      args:
        creates: /usr/bin/docker
      become: true

    - name: add user to docker group
      tags:
        - docker
      user:
        name: ubuntu
        groups: docker
        append: true
      become: true

    - name: install perf/systemtap/unzip/ntp/zip
      shell: >-
        apt-fast -y install linux-tools-$(uname -r) systemtap unzip ntp zip iotop htop sysstat
      become: true

    - name: add user to stapusr group
      user:
        name: ubuntu
        groups: stapusr
        append: true
      become: true

    - name: add user to stapdev group
      user:
        name: ubuntu
        groups: stapdev
        append: true
      become: true

================================================
FILE: cloud/aws-ansible/aws_inventory_file_generate.yml
================================================
---
# Copyright 2016 PingCAP, Inc.

# The Playbook of TiDB

# Generates
- name: prepare inventory config
  hosts: localhost
  gather_facts: false
  tasks:
    # `when` is already a Jinja2 expression; it must not be wrapped in {{ }}
    - fail:
        msg: "inventory is not empty!"
      when: groups.all | length > 0

    - include_vars:
        file: "{{ playbook_dir }}/vars.yml"

    - name: Gather EC2 facts.
      ec2_remote_facts:
        region: cn-north-1
        filters:
          instance-state-name: running
          "tag:ManagedBy": "{{ managed_by }}"
          "tag:Creator": "{{ creator }}"
      register: aws_ec2_facts

    - name: set up deploy servers
      add_host:
        groups: "{{ item.tags.Type | default('unused') }}_servers"
        hostname: "{{ item.public_ip_address }}"
      when: item.tags.Type is defined and item.tags.ManagedBy == managed_by
      with_items: "{{ aws_ec2_facts.instances | selectattr('state', 'equalto', 'running') | list }}"

    - name: set up monitoring server
      add_host:
        groups: monitoring_servers
        hostname: "{{ groups.tidb_servers[0] }}"
      when:
        - not (groups.monitoring_servers is defined and groups.monitoring_servers)
        - groups.tidb_servers is defined and groups.tidb_servers

    - name: set up monitored servers
      add_host:
        groups: monitored_servers
        hostname: "{{ item.public_ip_address }}"
      when: item.tags.ManagedBy is defined and item.tags.ManagedBy == managed_by
      with_items: "{{ aws_ec2_facts.instances | selectattr('state', 'equalto', 'running') | list }}"

    - name: write local inventory file to aws.ini.new
      template:
        src: aws.inventory.ini.j2
        dest: "{{ playbook_dir }}/aws.ini.new"

    - name: final message
      debug:
        msg: "now copy aws.ini.new to your tidb-ansible project and enjoy deployment!"

================================================
FILE: cloud/aws-ansible/aws_prepare.yml
================================================
---

- name: do AWS preparation
  hosts: localhost
  gather_facts: false
  pre_tasks:
    - include_vars:
        file: "{{ playbook_dir }}/vars.yml"
  roles:
    - aws
  post_tasks:
    - name: display hosts
      debug:
        msg: "run `ansible-playbook aws_inventory_file_generate.yml` to get your aws.ini!"
================================================
FILE: cloud/aws-ansible/aws_teardown.yml
================================================
---

- name: test
  hosts: localhost
  connection: local
  gather_facts: false
  pre_tasks:
    - include_vars:
        file: "{{ playbook_dir }}/vars.yml"
  tasks:
    - name: host facts
      ec2_remote_facts:
        filters:
          "tag:ManagedBy": "{{ managed_by }}"
        region: cn-north-1
      register: ec2_instances

    - name: add hosts
      add_host:
        name: "{{ item.public_ip_address }}"
        groups: sre-to-be-teardown
      with_items: "{{ ec2_instances.instances }}"
      # when: ec2_instances.skipped | defined and not ec2_instances.skipped

    - debug:
        var: groups['sre-to-be-teardown']

    - pause:
        prompt: "Are you sure to tear these down(C to continue, A to Abort)?"

- name: EC2 Instances
  hosts: sre-to-be-teardown
  gather_facts: false
  tasks:
    - name: gather facts
      ec2_facts:

    - name: Terminate instances that were previously launched
      delegate_to: localhost
      ec2:
        state: absent
        instance_ids: "{{ ansible_ec2_instance_id }}"
        region: cn-north-1
        wait: true
        wait_timeout: 500

- name: AWS
  hosts: localhost
  connection: local
  become: false
  gather_facts: false
  tasks:
    - name: terminate security group
      ec2_group:
        name: "ansible-sg-by-{{ managed_by }}"
        description: vpc security group by {{ creator }}
        region: cn-north-1
        state: absent

    - debug:
        msg: "we do not terminate vpc :)"

================================================
FILE: cloud/aws-ansible/ec2.ini
================================================
# Ansible EC2 external inventory script settings
#

[ec2]

# to talk to a private eucalyptus instance uncomment these lines
# and edit eucalyptus_host to be the host name of your cloud controller
#eucalyptus = True
#eucalyptus_host = clc.cloud.domain.org

# AWS regions to make calls to. Set this to 'all' to make request to all regions
# in AWS and merge the results together. Alternatively, set this to a comma
# separated list of regions. E.g.
'us-east-1,us-west-1,us-west-2' regions = cn-north-1 regions_exclude = us-gov-west-1 #,cn-north-1 # When generating inventory, Ansible needs to know how to address a server. # Each EC2 instance has a lot of variables associated with it. Here is the list: # http://docs.pythonboto.org/en/latest/ref/ec2.html#module-boto.ec2.instance # Below are 2 variables that are used as the address of a server: # - destination_variable # - vpc_destination_variable # This is the normal destination variable to use. If you are running Ansible # from outside EC2, then 'public_dns_name' makes the most sense. If you are # running Ansible from within EC2, then perhaps you want to use the internal # address, and should set this to 'private_dns_name'. The key of an EC2 tag # may optionally be used; however the boto instance variables hold precedence # in the event of a collision. destination_variable = public_dns_name # This allows you to override the inventory_name with an ec2 variable, instead # of using the destination_variable above. Addressing (aka ansible_ssh_host) # will still use destination_variable. Tags should be written as 'tag_TAGNAME'. #hostname_variable = tag_Name # For server inside a VPC, using DNS names may not make sense. When an instance # has 'subnet_id' set, this variable is used. If the subnet is public, setting # this to 'ip_address' will return the public IP address. For instances in a # private subnet, this should be set to 'private_ip_address', and Ansible must # be run from within EC2. The key of an EC2 tag may optionally be used; however # the boto instance variables hold precedence in the event of a collision. # WARNING: - instances that are in the private vpc, _without_ public ip address # will not be listed in the inventory until You set: # vpc_destination_variable = private_ip_address vpc_destination_variable = ip_address # The following two settings allow flexible ansible host naming based on a # python format string and a comma-separated list of ec2 tags. 
Note that: # # 1) If the tags referenced are not present for some instances, empty strings # will be substituted in the format string. # 2) This overrides both destination_variable and vpc_destination_variable. # #destination_format = {0}.{1}.example.com #destination_format_tags = Name,environment # To tag instances on EC2 with the resource records that point to them from # Route53, uncomment and set 'route53' to True. route53 = False # To exclude RDS instances from the inventory, uncomment and set to False. #rds = False # To exclude ElastiCache instances from the inventory, uncomment and set to False. #elasticache = False # Additionally, you can specify the list of zones to exclude looking up in # 'route53_excluded_zones' as a comma-separated list. # route53_excluded_zones = samplezone1.com, samplezone2.com # By default, only EC2 instances in the 'running' state are returned. Set # 'all_instances' to True to return all instances regardless of state. all_instances = False # By default, only EC2 instances in the 'running' state are returned. Specify # EC2 instance states to return as a comma-separated list. This # option is overridden when 'all_instances' is True. # instance_states = pending, running, shutting-down, terminated, stopping, stopped # By default, only RDS instances in the 'available' state are returned. Set # 'all_rds_instances' to True to return all RDS instances regardless of state. all_rds_instances = False # Include RDS cluster information (Aurora etc.) include_rds_clusters = False # By default, only ElastiCache clusters and nodes in the 'available' state # are returned. Set 'all_elasticache_clusters' and/or 'all_elasticache_nodes' # to True to return all ElastiCache clusters and nodes, regardless of state. # # Note that all_elasticache_nodes only applies to listed clusters. That means # if you set all_elasticache_clusters to false, no node will be returned from # unavailable clusters, regardless of the state and to what you set for # all_elasticache_nodes.
all_elasticache_replication_groups = False all_elasticache_clusters = False all_elasticache_nodes = False # API calls to EC2 are slow. For this reason, we cache the results of an API # call. Set this to the path you want cache files to be written to. Two files # will be written to this directory: # - ansible-ec2.cache # - ansible-ec2.index cache_path = ~/.ansible/tmp # The number of seconds a cache file is considered valid. After this many # seconds, a new API call will be made, and the cache file will be updated. # To disable the cache, set this value to 0 cache_max_age = 300 # Organize groups into a nested/hierarchy instead of a flat namespace. nested_groups = False # Replace - tags when creating groups to avoid issues with ansible replace_dash_in_groups = True # If set to true, any tag of the form "a,b,c" is expanded into a list # and the results are used to create additional tag_* inventory groups. expand_csv_tags = False # The EC2 inventory output can become very large. To manage its size, # configure which groups should be created. group_by_instance_id = True group_by_region = True group_by_availability_zone = True group_by_ami_id = True group_by_instance_type = True group_by_key_pair = True group_by_vpc_id = True group_by_security_group = True group_by_tag_keys = True group_by_tag_none = True group_by_route53_names = True group_by_rds_engine = True group_by_rds_parameter_group = True group_by_elasticache_engine = True group_by_elasticache_cluster = True group_by_elasticache_parameter_group = True group_by_elasticache_replication_group = True # If you only want to include hosts that match a certain regular expression # pattern_include = staging-* # If you want to exclude any hosts that match a certain regular expression # pattern_exclude = staging-* # Instance filters can be used to control which instances are retrieved for # inventory. 
For the full list of possible filters, please read the EC2 API # docs: http://docs.aws.amazon.com/AWSEC2/latest/APIReference/ApiReference-query-DescribeInstances.html#query-DescribeInstances-filters # Filters are key/value pairs separated by '=', to list multiple filters use # a list separated by commas. See examples below. # Retrieve only instances with (key=value) env=staging tag # instance_filters = tag:env=staging # Retrieve only instances with role=webservers OR role=dbservers tag # instance_filters = tag:role=webservers,tag:role=dbservers # Retrieve only t1.micro instances OR instances with tag env=staging # instance_filters = instance-type=t1.micro,tag:env=staging # You can use wildcards in filter values also. Below will list instances which # tag Name value matches webservers1* # (ex. webservers15, webservers1a, webservers123 etc) # instance_filters = tag:Name=webservers1* # A boto configuration profile may be used to separate out credentials # see http://boto.readthedocs.org/en/latest/boto_config_tut.html # boto_profile = some-boto-profile-name [credentials] # The AWS credentials can optionally be specified here. Credentials specified # here are ignored if the environment variable AWS_ACCESS_KEY_ID or # AWS_PROFILE is set, or if the boto_profile property above is set. # # Supplying AWS credentials here is not recommended, as it introduces # non-trivial security concerns. When going down this route, please make sure # to set access permissions for this file correctly, e.g. handle it the same # way as you would a private SSH key. # # Unlike the boto and AWS configure files, this section does not support # profiles. 
# # aws_access_key_id = AXXXXXXXXXXXXXX # aws_secret_access_key = XXXXXXXXXXXXXXXXXXX # aws_security_token = XXXXXXXXXXXXXXXXXXXXXXXXXXXX ================================================ FILE: cloud/aws-ansible/ec2.py ================================================ #!/usr/bin/env python ''' EC2 external inventory script ================================= Generates inventory that Ansible can understand by making API request to AWS EC2 using the Boto library. NOTE: This script assumes Ansible is being executed where the environment variables needed for Boto have already been set: export AWS_ACCESS_KEY_ID='AK123' export AWS_SECRET_ACCESS_KEY='abc123' This script also assumes there is an ec2.ini file alongside it. To specify a different path to ec2.ini, define the EC2_INI_PATH environment variable: export EC2_INI_PATH=/path/to/my_ec2.ini If you're using eucalyptus you need to set the above variables and you need to define: export EC2_URL=http://hostname_of_your_cc:port/services/Eucalyptus If you're using boto profiles (requires boto>=2.24.0) you can choose a profile using the --boto-profile command line argument (e.g. 
ec2.py --boto-profile prod) or using the AWS_PROFILE variable: AWS_PROFILE=prod ansible-playbook -i ec2.py myplaybook.yml For more details, see: http://docs.pythonboto.org/en/latest/boto_config_tut.html When run against a specific host, this script returns the following variables: - ec2_ami_launch_index - ec2_architecture - ec2_association - ec2_attachTime - ec2_attachment - ec2_attachmentId - ec2_block_devices - ec2_client_token - ec2_deleteOnTermination - ec2_description - ec2_deviceIndex - ec2_dns_name - ec2_eventsSet - ec2_group_name - ec2_hypervisor - ec2_id - ec2_image_id - ec2_instanceState - ec2_instance_type - ec2_ipOwnerId - ec2_ip_address - ec2_item - ec2_kernel - ec2_key_name - ec2_launch_time - ec2_monitored - ec2_monitoring - ec2_networkInterfaceId - ec2_ownerId - ec2_persistent - ec2_placement - ec2_platform - ec2_previous_state - ec2_private_dns_name - ec2_private_ip_address - ec2_publicIp - ec2_public_dns_name - ec2_ramdisk - ec2_reason - ec2_region - ec2_requester_id - ec2_root_device_name - ec2_root_device_type - ec2_security_group_ids - ec2_security_group_names - ec2_shutdown_state - ec2_sourceDestCheck - ec2_spot_instance_request_id - ec2_state - ec2_state_code - ec2_state_reason - ec2_status - ec2_subnet_id - ec2_tenancy - ec2_virtualization_type - ec2_vpc_id These variables are pulled out of a boto.ec2.instance object. There is a lack of consistency with variable spellings (camelCase and underscores) since this just loops through all variables the object exposes. It is preferred to use the ones with underscores when multiple exist. In addition, if an instance has AWS Tags associated with it, each tag is a new variable named: - ec2_tag_[Key] = [Value] Security groups are comma-separated in 'ec2_security_group_ids' and 'ec2_security_group_names'. 
''' # (c) 2012, Peter Sankauskas # # This file is part of Ansible, # # Ansible is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Ansible is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Ansible. If not, see . ###################################################################### import sys import os import argparse import re from time import time import boto from boto import ec2 from boto import rds from boto import elasticache from boto import route53 import six from ansible.module_utils import ec2 as ec2_utils HAS_BOTO3 = False try: import boto3 HAS_BOTO3 = True except ImportError: pass from six.moves import configparser from collections import defaultdict try: import json except ImportError: import simplejson as json class Ec2Inventory(object): def _empty_inventory(self): return {"_meta" : {"hostvars" : {}}} def __init__(self): ''' Main execution path ''' # Inventory grouped by instance IDs, tags, security groups, regions, # and availability zones self.inventory = self._empty_inventory() # Index of hostname (address) to instance ID self.index = {} # Boto profile to use (if any) self.boto_profile = None # AWS credentials. 
self.credentials = {} # Read settings and parse CLI arguments self.parse_cli_args() self.read_settings() # Make sure that profile_name is not passed at all if not set # as pre 2.24 boto will fall over otherwise if self.boto_profile: if not hasattr(boto.ec2.EC2Connection, 'profile_name'): self.fail_with_error("boto version must be >= 2.24 to use profile") # Cache if self.args.refresh_cache: self.do_api_calls_update_cache() elif not self.is_cache_valid(): self.do_api_calls_update_cache() # Data to print if self.args.host: data_to_print = self.get_host_info() elif self.args.list: # Display list of instances for inventory if self.inventory == self._empty_inventory(): data_to_print = self.get_inventory_from_cache() else: data_to_print = self.json_format_dict(self.inventory, True) print(data_to_print) def is_cache_valid(self): ''' Determines if the cache files have expired, or if it is still valid ''' if os.path.isfile(self.cache_path_cache): mod_time = os.path.getmtime(self.cache_path_cache) current_time = time() if (mod_time + self.cache_max_age) > current_time: if os.path.isfile(self.cache_path_index): return True return False def read_settings(self): ''' Reads the settings from the ec2.ini file ''' if six.PY3: config = configparser.ConfigParser() else: config = configparser.SafeConfigParser() ec2_default_ini_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'ec2.ini') ec2_ini_path = os.path.expanduser(os.path.expandvars(os.environ.get('EC2_INI_PATH', ec2_default_ini_path))) config.read(ec2_ini_path) # is eucalyptus? 
self.eucalyptus_host = None self.eucalyptus = False if config.has_option('ec2', 'eucalyptus'): self.eucalyptus = config.getboolean('ec2', 'eucalyptus') if self.eucalyptus and config.has_option('ec2', 'eucalyptus_host'): self.eucalyptus_host = config.get('ec2', 'eucalyptus_host') # Regions self.regions = [] configRegions = config.get('ec2', 'regions') configRegions_exclude = config.get('ec2', 'regions_exclude') if (configRegions == 'all'): if self.eucalyptus_host: self.regions.append(boto.connect_euca(host=self.eucalyptus_host).region.name, **self.credentials) else: for regionInfo in ec2.regions(): if regionInfo.name not in configRegions_exclude: self.regions.append(regionInfo.name) else: self.regions = configRegions.split(",") # Destination addresses self.destination_variable = config.get('ec2', 'destination_variable') self.vpc_destination_variable = config.get('ec2', 'vpc_destination_variable') if config.has_option('ec2', 'hostname_variable'): self.hostname_variable = config.get('ec2', 'hostname_variable') else: self.hostname_variable = None if config.has_option('ec2', 'destination_format') and \ config.has_option('ec2', 'destination_format_tags'): self.destination_format = config.get('ec2', 'destination_format') self.destination_format_tags = config.get('ec2', 'destination_format_tags').split(',') else: self.destination_format = None self.destination_format_tags = None # Route53 self.route53_enabled = config.getboolean('ec2', 'route53') self.route53_excluded_zones = [] if config.has_option('ec2', 'route53_excluded_zones'): self.route53_excluded_zones.extend( config.get('ec2', 'route53_excluded_zones', '').split(',')) # Include RDS instances? self.rds_enabled = True if config.has_option('ec2', 'rds'): self.rds_enabled = config.getboolean('ec2', 'rds') # Include RDS cluster instances? 
if config.has_option('ec2', 'include_rds_clusters'): self.include_rds_clusters = config.getboolean('ec2', 'include_rds_clusters') else: self.include_rds_clusters = False # Include ElastiCache instances? self.elasticache_enabled = True if config.has_option('ec2', 'elasticache'): self.elasticache_enabled = config.getboolean('ec2', 'elasticache') # Return all EC2 instances? if config.has_option('ec2', 'all_instances'): self.all_instances = config.getboolean('ec2', 'all_instances') else: self.all_instances = False # Instance states to be gathered in inventory. Default is 'running'. # Setting 'all_instances' to 'yes' overrides this option. ec2_valid_instance_states = [ 'pending', 'running', 'shutting-down', 'terminated', 'stopping', 'stopped' ] self.ec2_instance_states = [] if self.all_instances: self.ec2_instance_states = ec2_valid_instance_states elif config.has_option('ec2', 'instance_states'): for instance_state in config.get('ec2', 'instance_states').split(','): instance_state = instance_state.strip() if instance_state not in ec2_valid_instance_states: continue self.ec2_instance_states.append(instance_state) else: self.ec2_instance_states = ['running'] # Return all RDS instances? (if RDS is enabled) if config.has_option('ec2', 'all_rds_instances') and self.rds_enabled: self.all_rds_instances = config.getboolean('ec2', 'all_rds_instances') else: self.all_rds_instances = False # Return all ElastiCache replication groups? (if ElastiCache is enabled) if config.has_option('ec2', 'all_elasticache_replication_groups') and self.elasticache_enabled: self.all_elasticache_replication_groups = config.getboolean('ec2', 'all_elasticache_replication_groups') else: self.all_elasticache_replication_groups = False # Return all ElastiCache clusters? 
(if ElastiCache is enabled) if config.has_option('ec2', 'all_elasticache_clusters') and self.elasticache_enabled: self.all_elasticache_clusters = config.getboolean('ec2', 'all_elasticache_clusters') else: self.all_elasticache_clusters = False # Return all ElastiCache nodes? (if ElastiCache is enabled) if config.has_option('ec2', 'all_elasticache_nodes') and self.elasticache_enabled: self.all_elasticache_nodes = config.getboolean('ec2', 'all_elasticache_nodes') else: self.all_elasticache_nodes = False # boto configuration profile (prefer CLI argument) self.boto_profile = self.args.boto_profile if config.has_option('ec2', 'boto_profile') and not self.boto_profile: self.boto_profile = config.get('ec2', 'boto_profile') # AWS credentials (prefer environment variables) if not (self.boto_profile or os.environ.get('AWS_ACCESS_KEY_ID') or os.environ.get('AWS_PROFILE')): if config.has_option('credentials', 'aws_access_key_id'): aws_access_key_id = config.get('credentials', 'aws_access_key_id') else: aws_access_key_id = None if config.has_option('credentials', 'aws_secret_access_key'): aws_secret_access_key = config.get('credentials', 'aws_secret_access_key') else: aws_secret_access_key = None if config.has_option('credentials', 'aws_security_token'): aws_security_token = config.get('credentials', 'aws_security_token') else: aws_security_token = None if aws_access_key_id: self.credentials = { 'aws_access_key_id': aws_access_key_id, 'aws_secret_access_key': aws_secret_access_key } if aws_security_token: self.credentials['security_token'] = aws_security_token # Cache related cache_dir = os.path.expanduser(config.get('ec2', 'cache_path')) if self.boto_profile: cache_dir = os.path.join(cache_dir, 'profile_' + self.boto_profile) if not os.path.exists(cache_dir): os.makedirs(cache_dir) cache_name = 'ansible-ec2' aws_profile = lambda: (self.boto_profile or os.environ.get('AWS_PROFILE') or os.environ.get('AWS_ACCESS_KEY_ID') or self.credentials.get('aws_access_key_id', None)) if 
aws_profile(): cache_name = '%s-%s' % (cache_name, aws_profile()) self.cache_path_cache = cache_dir + "/%s.cache" % cache_name self.cache_path_index = cache_dir + "/%s.index" % cache_name self.cache_max_age = config.getint('ec2', 'cache_max_age') if config.has_option('ec2', 'expand_csv_tags'): self.expand_csv_tags = config.getboolean('ec2', 'expand_csv_tags') else: self.expand_csv_tags = False # Configure nested groups instead of flat namespace. if config.has_option('ec2', 'nested_groups'): self.nested_groups = config.getboolean('ec2', 'nested_groups') else: self.nested_groups = False # Replace dash or not in group names if config.has_option('ec2', 'replace_dash_in_groups'): self.replace_dash_in_groups = config.getboolean('ec2', 'replace_dash_in_groups') else: self.replace_dash_in_groups = True # Configure which groups should be created. group_by_options = [ 'group_by_instance_id', 'group_by_region', 'group_by_availability_zone', 'group_by_ami_id', 'group_by_instance_type', 'group_by_key_pair', 'group_by_vpc_id', 'group_by_security_group', 'group_by_tag_keys', 'group_by_tag_none', 'group_by_route53_names', 'group_by_rds_engine', 'group_by_rds_parameter_group', 'group_by_elasticache_engine', 'group_by_elasticache_cluster', 'group_by_elasticache_parameter_group', 'group_by_elasticache_replication_group', ] for option in group_by_options: if config.has_option('ec2', option): setattr(self, option, config.getboolean('ec2', option)) else: setattr(self, option, True) # Do we need to just include hosts that match a pattern? try: pattern_include = config.get('ec2', 'pattern_include') if pattern_include and len(pattern_include) > 0: self.pattern_include = re.compile(pattern_include) else: self.pattern_include = None except configparser.NoOptionError: self.pattern_include = None # Do we need to exclude hosts that match a pattern? 
try: pattern_exclude = config.get('ec2', 'pattern_exclude'); if pattern_exclude and len(pattern_exclude) > 0: self.pattern_exclude = re.compile(pattern_exclude) else: self.pattern_exclude = None except configparser.NoOptionError: self.pattern_exclude = None # Instance filters (see boto and EC2 API docs). Ignore invalid filters. self.ec2_instance_filters = defaultdict(list) if config.has_option('ec2', 'instance_filters'): filters = [f for f in config.get('ec2', 'instance_filters').split(',') if f] for instance_filter in filters: instance_filter = instance_filter.strip() if not instance_filter or '=' not in instance_filter: continue filter_key, filter_value = [x.strip() for x in instance_filter.split('=', 1)] if not filter_key: continue self.ec2_instance_filters[filter_key].append(filter_value) def parse_cli_args(self): ''' Command line argument processing ''' parser = argparse.ArgumentParser(description='Produce an Ansible Inventory file based on EC2') parser.add_argument('--list', action='store_true', default=True, help='List instances (default: True)') parser.add_argument('--host', action='store', help='Get all the variables about a specific instance') parser.add_argument('--refresh-cache', action='store_true', default=False, help='Force refresh of cache by making API requests to EC2 (default: False - use cache files)') parser.add_argument('--profile', '--boto-profile', action='store', dest='boto_profile', help='Use boto profile for connections to EC2') self.args = parser.parse_args() def do_api_calls_update_cache(self): ''' Do API calls to each region, and save data in cache files ''' if self.route53_enabled: self.get_route53_records() for region in self.regions: self.get_instances_by_region(region) if self.rds_enabled: self.get_rds_instances_by_region(region) if self.elasticache_enabled: self.get_elasticache_clusters_by_region(region) self.get_elasticache_replication_groups_by_region(region) if self.include_rds_clusters: 
self.include_rds_clusters_by_region(region) self.write_to_cache(self.inventory, self.cache_path_cache) self.write_to_cache(self.index, self.cache_path_index) def connect(self, region): ''' create connection to api server''' if self.eucalyptus: conn = boto.connect_euca(host=self.eucalyptus_host, **self.credentials) conn.APIVersion = '2010-08-31' else: conn = self.connect_to_aws(ec2, region) return conn def boto_fix_security_token_in_profile(self, connect_args): ''' monkey patch for boto issue boto/boto#2100 ''' profile = 'profile ' + self.boto_profile if boto.config.has_option(profile, 'aws_security_token'): connect_args['security_token'] = boto.config.get(profile, 'aws_security_token') return connect_args def connect_to_aws(self, module, region): connect_args = self.credentials # only pass the profile name if it's set (as it is not supported by older boto versions) if self.boto_profile: connect_args['profile_name'] = self.boto_profile self.boto_fix_security_token_in_profile(connect_args) conn = module.connect_to_region(region, **connect_args) # connect_to_region will fail "silently" by returning None if the region name is wrong or not supported if conn is None: self.fail_with_error("region name: %s likely not supported, or AWS is down. connection to region failed." 
% region) return conn def get_instances_by_region(self, region): ''' Makes an AWS EC2 API call to the list of instances in a particular region ''' try: conn = self.connect(region) reservations = [] if self.ec2_instance_filters: for filter_key, filter_values in self.ec2_instance_filters.items(): reservations.extend(conn.get_all_instances(filters = { filter_key : filter_values })) else: reservations = conn.get_all_instances() # Pull the tags back in a second step # AWS are on record as saying that the tags fetched in the first `get_all_instances` request are not # reliable and may be missing, and the only way to guarantee they are there is by calling `get_all_tags` instance_ids = [] for reservation in reservations: instance_ids.extend([instance.id for instance in reservation.instances]) max_filter_value = 199 tags = [] for i in range(0, len(instance_ids), max_filter_value): tags.extend(conn.get_all_tags(filters={'resource-type': 'instance', 'resource-id': instance_ids[i:i+max_filter_value]})) tags_by_instance_id = defaultdict(dict) for tag in tags: tags_by_instance_id[tag.res_id][tag.name] = tag.value for reservation in reservations: for instance in reservation.instances: instance.tags = tags_by_instance_id[instance.id] self.add_instance(instance, region) except boto.exception.BotoServerError as e: if e.error_code == 'AuthFailure': error = self.get_auth_error_message() else: backend = 'Eucalyptus' if self.eucalyptus else 'AWS' error = "Error connecting to %s backend.\n%s" % (backend, e.message) self.fail_with_error(error, 'getting EC2 instances') def get_rds_instances_by_region(self, region): ''' Makes an AWS API call to the list of RDS instances in a particular region ''' try: conn = self.connect_to_aws(rds, region) if conn: marker = None while True: instances = conn.get_all_dbinstances(marker=marker) marker = instances.marker for instance in instances: self.add_rds_instance(instance, region) if not marker: break except boto.exception.BotoServerError as e: error = 
e.reason if e.error_code == 'AuthFailure': error = self.get_auth_error_message() if not e.reason == "Forbidden": error = "Looks like AWS RDS is down:\n%s" % e.message self.fail_with_error(error, 'getting RDS instances') def include_rds_clusters_by_region(self, region): if not HAS_BOTO3: self.fail_with_error("Working with RDS clusters requires boto3 - please install boto3 and try again", "getting RDS clusters") client = ec2_utils.boto3_inventory_conn('client', 'rds', region, **self.credentials) marker, clusters = '', [] while marker is not None: resp = client.describe_db_clusters(Marker=marker) clusters.extend(resp["DBClusters"]) marker = resp.get('Marker', None) account_id = boto.connect_iam().get_user().arn.split(':')[4] c_dict = {} for c in clusters: # remove these datetime objects as there is no serialisation to json # currently in place and we don't need the data yet if 'EarliestRestorableTime' in c: del c['EarliestRestorableTime'] if 'LatestRestorableTime' in c: del c['LatestRestorableTime'] if self.ec2_instance_filters == {}: matches_filter = True else: matches_filter = False try: # arn:aws:rds:::: tags = client.list_tags_for_resource( ResourceName='arn:aws:rds:' + region + ':' + account_id + ':cluster:' + c['DBClusterIdentifier']) c['Tags'] = tags['TagList'] if self.ec2_instance_filters: for filter_key, filter_values in self.ec2_instance_filters.items(): # get AWS tag key e.g. tag:env will be 'env' tag_name = filter_key.split(":", 1)[1] # Filter values is a list (if you put multiple values for the same tag name) matches_filter = any(d['Key'] == tag_name and d['Value'] in filter_values for d in c['Tags']) if matches_filter: # it matches a filter, so stop looking for further matches break except Exception as e: if e.message.find('DBInstanceNotFound') >= 0: # AWS RDS bug (2016-01-06) means deletion does not fully complete and leave an 'empty' cluster. 
# Ignore errors when trying to find tags for these pass # ignore empty clusters caused by AWS bug if len(c['DBClusterMembers']) == 0: continue elif matches_filter: c_dict[c['DBClusterIdentifier']] = c self.inventory['db_clusters'] = c_dict def get_elasticache_clusters_by_region(self, region): ''' Makes an AWS API call to the list of ElastiCache clusters (with nodes' info) in a particular region.''' # ElastiCache boto module doesn't provide a get_all_intances method, # that's why we need to call describe directly (it would be called by # the shorthand method anyway...) try: conn = self.connect_to_aws(elasticache, region) if conn: # show_cache_node_info = True # because we also want nodes' information response = conn.describe_cache_clusters(None, None, None, True) except boto.exception.BotoServerError as e: error = e.reason if e.error_code == 'AuthFailure': error = self.get_auth_error_message() if not e.reason == "Forbidden": error = "Looks like AWS ElastiCache is down:\n%s" % e.message self.fail_with_error(error, 'getting ElastiCache clusters') try: # Boto also doesn't provide wrapper classes to CacheClusters or # CacheNodes. Because of that wo can't make use of the get_list # method in the AWSQueryConnection. Let's do the work manually clusters = response['DescribeCacheClustersResponse']['DescribeCacheClustersResult']['CacheClusters'] except KeyError as e: error = "ElastiCache query to AWS failed (unexpected format)." self.fail_with_error(error, 'getting ElastiCache clusters') for cluster in clusters: self.add_elasticache_cluster(cluster, region) def get_elasticache_replication_groups_by_region(self, region): ''' Makes an AWS API call to the list of ElastiCache replication groups in a particular region.''' # ElastiCache boto module doesn't provide a get_all_intances method, # that's why we need to call describe directly (it would be called by # the shorthand method anyway...) 
try: conn = self.connect_to_aws(elasticache, region) if conn: response = conn.describe_replication_groups() except boto.exception.BotoServerError as e: error = e.reason if e.error_code == 'AuthFailure': error = self.get_auth_error_message() if not e.reason == "Forbidden": error = "Looks like AWS ElastiCache [Replication Groups] is down:\n%s" % e.message self.fail_with_error(error, 'getting ElastiCache clusters') try: # Boto also doesn't provide wrapper classes to ReplicationGroups # Because of that wo can't make use of the get_list method in the # AWSQueryConnection. Let's do the work manually replication_groups = response['DescribeReplicationGroupsResponse']['DescribeReplicationGroupsResult']['ReplicationGroups'] except KeyError as e: error = "ElastiCache [Replication Groups] query to AWS failed (unexpected format)." self.fail_with_error(error, 'getting ElastiCache clusters') for replication_group in replication_groups: self.add_elasticache_replication_group(replication_group, region) def get_auth_error_message(self): ''' create an informative error message if there is an issue authenticating''' errors = ["Authentication error retrieving ec2 inventory."] if None in [os.environ.get('AWS_ACCESS_KEY_ID'), os.environ.get('AWS_SECRET_ACCESS_KEY')]: errors.append(' - No AWS_ACCESS_KEY_ID or AWS_SECRET_ACCESS_KEY environment vars found') else: errors.append(' - AWS_ACCESS_KEY_ID and AWS_SECRET_ACCESS_KEY environment vars found but may not be correct') boto_paths = ['/etc/boto.cfg', '~/.boto', '~/.aws/credentials'] boto_config_found = list(p for p in boto_paths if os.path.isfile(os.path.expanduser(p))) if len(boto_config_found) > 0: errors.append(" - Boto configs found at '%s', but the credentials contained may not be correct" % ', '.join(boto_config_found)) else: errors.append(" - No Boto config found at any expected location '%s'" % ', '.join(boto_paths)) return '\n'.join(errors) def fail_with_error(self, err_msg, err_operation=None): '''log an error to std err for 
ansible-playbook to consume and exit''' if err_operation: err_msg = 'ERROR: "{err_msg}", while: {err_operation}'.format( err_msg=err_msg, err_operation=err_operation) sys.stderr.write(err_msg) sys.exit(1) def get_instance(self, region, instance_id): conn = self.connect(region) reservations = conn.get_all_instances([instance_id]) for reservation in reservations: for instance in reservation.instances: return instance def add_instance(self, instance, region): ''' Adds an instance to the inventory and index, as long as it is addressable ''' # Only return instances with desired instance states if instance.state not in self.ec2_instance_states: return # Select the best destination address if self.destination_format and self.destination_format_tags: dest = self.destination_format.format(*[ getattr(instance, 'tags').get(tag, '') for tag in self.destination_format_tags ]) elif instance.subnet_id: dest = getattr(instance, self.vpc_destination_variable, None) if dest is None: dest = getattr(instance, 'tags').get(self.vpc_destination_variable, None) else: dest = getattr(instance, self.destination_variable, None) if dest is None: dest = getattr(instance, 'tags').get(self.destination_variable, None) if not dest: # Skip instances we cannot address (e.g. 
private VPC subnet) return # Set the inventory name hostname = None if self.hostname_variable: if self.hostname_variable.startswith('tag_'): hostname = instance.tags.get(self.hostname_variable[4:], None) else: hostname = getattr(instance, self.hostname_variable) # If we can't get a nice hostname, use the destination address if not hostname: hostname = dest else: hostname = self.to_safe(hostname).lower() # if we only want to include hosts that match a pattern, skip those that don't if self.pattern_include and not self.pattern_include.match(hostname): return # if we need to exclude hosts that match a pattern, skip those if self.pattern_exclude and self.pattern_exclude.match(hostname): return # Add to index self.index[hostname] = [region, instance.id] # Inventory: Group by instance ID (always a group of 1) if self.group_by_instance_id: self.inventory[instance.id] = [hostname] if self.nested_groups: self.push_group(self.inventory, 'instances', instance.id) # Inventory: Group by region if self.group_by_region: self.push(self.inventory, region, hostname) if self.nested_groups: self.push_group(self.inventory, 'regions', region) # Inventory: Group by availability zone if self.group_by_availability_zone: self.push(self.inventory, instance.placement, hostname) if self.nested_groups: if self.group_by_region: self.push_group(self.inventory, region, instance.placement) self.push_group(self.inventory, 'zones', instance.placement) # Inventory: Group by Amazon Machine Image (AMI) ID if self.group_by_ami_id: ami_id = self.to_safe(instance.image_id) self.push(self.inventory, ami_id, hostname) if self.nested_groups: self.push_group(self.inventory, 'images', ami_id) # Inventory: Group by instance type if self.group_by_instance_type: type_name = self.to_safe('type_' + instance.instance_type) self.push(self.inventory, type_name, hostname) if self.nested_groups: self.push_group(self.inventory, 'types', type_name) # Inventory: Group by key pair if self.group_by_key_pair and 
instance.key_name: key_name = self.to_safe('key_' + instance.key_name) self.push(self.inventory, key_name, hostname) if self.nested_groups: self.push_group(self.inventory, 'keys', key_name) # Inventory: Group by VPC if self.group_by_vpc_id and instance.vpc_id: vpc_id_name = self.to_safe('vpc_id_' + instance.vpc_id) self.push(self.inventory, vpc_id_name, hostname) if self.nested_groups: self.push_group(self.inventory, 'vpcs', vpc_id_name) # Inventory: Group by security group if self.group_by_security_group: try: for group in instance.groups: key = self.to_safe("security_group_" + group.name) self.push(self.inventory, key, hostname) if self.nested_groups: self.push_group(self.inventory, 'security_groups', key) except AttributeError: self.fail_with_error('\n'.join(['Package boto seems a bit older.', 'Please upgrade boto >= 2.3.0.'])) # Inventory: Group by tag keys if self.group_by_tag_keys: for k, v in instance.tags.items(): if self.expand_csv_tags and v and ',' in v: values = map(lambda x: x.strip(), v.split(',')) else: values = [v] for v in values: if v: key = self.to_safe("tag_" + k + "=" + v) else: key = self.to_safe("tag_" + k) self.push(self.inventory, key, hostname) if self.nested_groups: self.push_group(self.inventory, 'tags', self.to_safe("tag_" + k)) if v: self.push_group(self.inventory, self.to_safe("tag_" + k), key) # Inventory: Group by Route53 domain names if enabled if self.route53_enabled and self.group_by_route53_names: route53_names = self.get_instance_route53_names(instance) for name in route53_names: self.push(self.inventory, name, hostname) if self.nested_groups: self.push_group(self.inventory, 'route53', name) # Global Tag: instances without tags if self.group_by_tag_none and len(instance.tags) == 0: self.push(self.inventory, 'tag_none', hostname) if self.nested_groups: self.push_group(self.inventory, 'tags', 'tag_none') # Global Tag: tag all EC2 instances self.push(self.inventory, 'ec2', hostname) self.inventory["_meta"]["hostvars"][hostname] = 
self.get_host_info_dict_from_instance(instance) self.inventory["_meta"]["hostvars"][hostname]['ansible_ssh_host'] = dest def add_rds_instance(self, instance, region): ''' Adds an RDS instance to the inventory and index, as long as it is addressable ''' # Only want available instances unless all_rds_instances is True if not self.all_rds_instances and instance.status != 'available': return # Select the best destination address dest = instance.endpoint[0] if not dest: # Skip instances we cannot address (e.g. private VPC subnet) return # Set the inventory name hostname = None if self.hostname_variable: if self.hostname_variable.startswith('tag_'): hostname = instance.tags.get(self.hostname_variable[4:], None) else: hostname = getattr(instance, self.hostname_variable) # If we can't get a nice hostname, use the destination address if not hostname: hostname = dest hostname = self.to_safe(hostname).lower() # Add to index self.index[hostname] = [region, instance.id] # Inventory: Group by instance ID (always a group of 1) if self.group_by_instance_id: self.inventory[instance.id] = [hostname] if self.nested_groups: self.push_group(self.inventory, 'instances', instance.id) # Inventory: Group by region if self.group_by_region: self.push(self.inventory, region, hostname) if self.nested_groups: self.push_group(self.inventory, 'regions', region) # Inventory: Group by availability zone if self.group_by_availability_zone: self.push(self.inventory, instance.availability_zone, hostname) if self.nested_groups: if self.group_by_region: self.push_group(self.inventory, region, instance.availability_zone) self.push_group(self.inventory, 'zones', instance.availability_zone) # Inventory: Group by instance type if self.group_by_instance_type: type_name = self.to_safe('type_' + instance.instance_class) self.push(self.inventory, type_name, hostname) if self.nested_groups: self.push_group(self.inventory, 'types', type_name) # Inventory: Group by VPC if self.group_by_vpc_id and 
instance.subnet_group and instance.subnet_group.vpc_id: vpc_id_name = self.to_safe('vpc_id_' + instance.subnet_group.vpc_id) self.push(self.inventory, vpc_id_name, hostname) if self.nested_groups: self.push_group(self.inventory, 'vpcs', vpc_id_name) # Inventory: Group by security group if self.group_by_security_group: try: if instance.security_group: key = self.to_safe("security_group_" + instance.security_group.name) self.push(self.inventory, key, hostname) if self.nested_groups: self.push_group(self.inventory, 'security_groups', key) except AttributeError: self.fail_with_error('\n'.join(['Package boto seems a bit older.', 'Please upgrade boto >= 2.3.0.'])) # Inventory: Group by engine if self.group_by_rds_engine: self.push(self.inventory, self.to_safe("rds_" + instance.engine), hostname) if self.nested_groups: self.push_group(self.inventory, 'rds_engines', self.to_safe("rds_" + instance.engine)) # Inventory: Group by parameter group if self.group_by_rds_parameter_group: self.push(self.inventory, self.to_safe("rds_parameter_group_" + instance.parameter_group.name), hostname) if self.nested_groups: self.push_group(self.inventory, 'rds_parameter_groups', self.to_safe("rds_parameter_group_" + instance.parameter_group.name)) # Global Tag: all RDS instances self.push(self.inventory, 'rds', hostname) self.inventory["_meta"]["hostvars"][hostname] = self.get_host_info_dict_from_instance(instance) self.inventory["_meta"]["hostvars"][hostname]['ansible_ssh_host'] = dest def add_elasticache_cluster(self, cluster, region): ''' Adds an ElastiCache cluster to the inventory and index, as long as it's nodes are addressable ''' # Only want available clusters unless all_elasticache_clusters is True if not self.all_elasticache_clusters and cluster['CacheClusterStatus'] != 'available': return # Select the best destination address if 'ConfigurationEndpoint' in cluster and cluster['ConfigurationEndpoint']: # Memcached cluster dest = cluster['ConfigurationEndpoint']['Address'] is_redis 
= False else: # Redis sigle node cluster # Because all Redis clusters are single nodes, we'll merge the # info from the cluster with info about the node dest = cluster['CacheNodes'][0]['Endpoint']['Address'] is_redis = True if not dest: # Skip clusters we cannot address (e.g. private VPC subnet) return # Add to index self.index[dest] = [region, cluster['CacheClusterId']] # Inventory: Group by instance ID (always a group of 1) if self.group_by_instance_id: self.inventory[cluster['CacheClusterId']] = [dest] if self.nested_groups: self.push_group(self.inventory, 'instances', cluster['CacheClusterId']) # Inventory: Group by region if self.group_by_region and not is_redis: self.push(self.inventory, region, dest) if self.nested_groups: self.push_group(self.inventory, 'regions', region) # Inventory: Group by availability zone if self.group_by_availability_zone and not is_redis: self.push(self.inventory, cluster['PreferredAvailabilityZone'], dest) if self.nested_groups: if self.group_by_region: self.push_group(self.inventory, region, cluster['PreferredAvailabilityZone']) self.push_group(self.inventory, 'zones', cluster['PreferredAvailabilityZone']) # Inventory: Group by node type if self.group_by_instance_type and not is_redis: type_name = self.to_safe('type_' + cluster['CacheNodeType']) self.push(self.inventory, type_name, dest) if self.nested_groups: self.push_group(self.inventory, 'types', type_name) # Inventory: Group by VPC (information not available in the current # AWS API version for ElastiCache) # Inventory: Group by security group if self.group_by_security_group and not is_redis: # Check for the existence of the 'SecurityGroups' key and also if # this key has some value. When the cluster is not placed in a SG # the query can return None here and cause an error. 
if 'SecurityGroups' in cluster and cluster['SecurityGroups'] is not None: for security_group in cluster['SecurityGroups']: key = self.to_safe("security_group_" + security_group['SecurityGroupId']) self.push(self.inventory, key, dest) if self.nested_groups: self.push_group(self.inventory, 'security_groups', key) # Inventory: Group by engine if self.group_by_elasticache_engine and not is_redis: self.push(self.inventory, self.to_safe("elasticache_" + cluster['Engine']), dest) if self.nested_groups: self.push_group(self.inventory, 'elasticache_engines', self.to_safe(cluster['Engine'])) # Inventory: Group by parameter group if self.group_by_elasticache_parameter_group: self.push(self.inventory, self.to_safe("elasticache_parameter_group_" + cluster['CacheParameterGroup']['CacheParameterGroupName']), dest) if self.nested_groups: self.push_group(self.inventory, 'elasticache_parameter_groups', self.to_safe(cluster['CacheParameterGroup']['CacheParameterGroupName'])) # Inventory: Group by replication group if self.group_by_elasticache_replication_group and 'ReplicationGroupId' in cluster and cluster['ReplicationGroupId']: self.push(self.inventory, self.to_safe("elasticache_replication_group_" + cluster['ReplicationGroupId']), dest) if self.nested_groups: self.push_group(self.inventory, 'elasticache_replication_groups', self.to_safe(cluster['ReplicationGroupId'])) # Global Tag: all ElastiCache clusters self.push(self.inventory, 'elasticache_clusters', cluster['CacheClusterId']) host_info = self.get_host_info_dict_from_describe_dict(cluster) self.inventory["_meta"]["hostvars"][dest] = host_info # Add the nodes for node in cluster['CacheNodes']: self.add_elasticache_node(node, cluster, region) def add_elasticache_node(self, node, cluster, region): ''' Adds an ElastiCache node to the inventory and index, as long as it is addressable ''' # Only want available nodes unless all_elasticache_nodes is True if not self.all_elasticache_nodes and node['CacheNodeStatus'] != 'available': 
return # Select the best destination address dest = node['Endpoint']['Address'] if not dest: # Skip nodes we cannot address (e.g. private VPC subnet) return node_id = self.to_safe(cluster['CacheClusterId'] + '_' + node['CacheNodeId']) # Add to index self.index[dest] = [region, node_id] # Inventory: Group by node ID (always a group of 1) if self.group_by_instance_id: self.inventory[node_id] = [dest] if self.nested_groups: self.push_group(self.inventory, 'instances', node_id) # Inventory: Group by region if self.group_by_region: self.push(self.inventory, region, dest) if self.nested_groups: self.push_group(self.inventory, 'regions', region) # Inventory: Group by availability zone if self.group_by_availability_zone: self.push(self.inventory, cluster['PreferredAvailabilityZone'], dest) if self.nested_groups: if self.group_by_region: self.push_group(self.inventory, region, cluster['PreferredAvailabilityZone']) self.push_group(self.inventory, 'zones', cluster['PreferredAvailabilityZone']) # Inventory: Group by node type if self.group_by_instance_type: type_name = self.to_safe('type_' + cluster['CacheNodeType']) self.push(self.inventory, type_name, dest) if self.nested_groups: self.push_group(self.inventory, 'types', type_name) # Inventory: Group by VPC (information not available in the current # AWS API version for ElastiCache) # Inventory: Group by security group if self.group_by_security_group: # Check for the existence of the 'SecurityGroups' key and also if # this key has some value. When the cluster is not placed in a SG # the query can return None here and cause an error. 
if 'SecurityGroups' in cluster and cluster['SecurityGroups'] is not None: for security_group in cluster['SecurityGroups']: key = self.to_safe("security_group_" + security_group['SecurityGroupId']) self.push(self.inventory, key, dest) if self.nested_groups: self.push_group(self.inventory, 'security_groups', key) # Inventory: Group by engine if self.group_by_elasticache_engine: self.push(self.inventory, self.to_safe("elasticache_" + cluster['Engine']), dest) if self.nested_groups: self.push_group(self.inventory, 'elasticache_engines', self.to_safe("elasticache_" + cluster['Engine'])) # Inventory: Group by parameter group (done at cluster level) # Inventory: Group by replication group (done at cluster level) # Inventory: Group by ElastiCache Cluster if self.group_by_elasticache_cluster: self.push(self.inventory, self.to_safe("elasticache_cluster_" + cluster['CacheClusterId']), dest) # Global Tag: all ElastiCache nodes self.push(self.inventory, 'elasticache_nodes', dest) host_info = self.get_host_info_dict_from_describe_dict(node) if dest in self.inventory["_meta"]["hostvars"]: self.inventory["_meta"]["hostvars"][dest].update(host_info) else: self.inventory["_meta"]["hostvars"][dest] = host_info def add_elasticache_replication_group(self, replication_group, region): ''' Adds an ElastiCache replication group to the inventory and index ''' # Only want available clusters unless all_elasticache_replication_groups is True if not self.all_elasticache_replication_groups and replication_group['Status'] != 'available': return # Select the best destination address (PrimaryEndpoint) dest = replication_group['NodeGroups'][0]['PrimaryEndpoint']['Address'] if not dest: # Skip clusters we cannot address (e.g. 
private VPC subnet) return # Add to index self.index[dest] = [region, replication_group['ReplicationGroupId']] # Inventory: Group by ID (always a group of 1) if self.group_by_instance_id: self.inventory[replication_group['ReplicationGroupId']] = [dest] if self.nested_groups: self.push_group(self.inventory, 'instances', replication_group['ReplicationGroupId']) # Inventory: Group by region if self.group_by_region: self.push(self.inventory, region, dest) if self.nested_groups: self.push_group(self.inventory, 'regions', region) # Inventory: Group by availability zone (doesn't apply to replication groups) # Inventory: Group by node type (doesn't apply to replication groups) # Inventory: Group by VPC (information not available in the current # AWS API version for replication groups # Inventory: Group by security group (doesn't apply to replication groups) # Check this value in cluster level # Inventory: Group by engine (replication groups are always Redis) if self.group_by_elasticache_engine: self.push(self.inventory, 'elasticache_redis', dest) if self.nested_groups: self.push_group(self.inventory, 'elasticache_engines', 'redis') # Global Tag: all ElastiCache clusters self.push(self.inventory, 'elasticache_replication_groups', replication_group['ReplicationGroupId']) host_info = self.get_host_info_dict_from_describe_dict(replication_group) self.inventory["_meta"]["hostvars"][dest] = host_info def get_route53_records(self): ''' Get and store the map of resource records to domain names that point to them. 
''' r53_conn = route53.Route53Connection() all_zones = r53_conn.get_zones() route53_zones = [ zone for zone in all_zones if zone.name[:-1] not in self.route53_excluded_zones ] self.route53_records = {} for zone in route53_zones: rrsets = r53_conn.get_all_rrsets(zone.id) for record_set in rrsets: record_name = record_set.name if record_name.endswith('.'): record_name = record_name[:-1] for resource in record_set.resource_records: self.route53_records.setdefault(resource, set()) self.route53_records[resource].add(record_name) def get_instance_route53_names(self, instance): ''' Check if an instance is referenced in the records we have from Route53. If it is, return the list of domain names pointing to said instance. If nothing points to it, return an empty list. ''' instance_attributes = [ 'public_dns_name', 'private_dns_name', 'ip_address', 'private_ip_address' ] name_list = set() for attrib in instance_attributes: try: value = getattr(instance, attrib) except AttributeError: continue if value in self.route53_records: name_list.update(self.route53_records[value]) return list(name_list) def get_host_info_dict_from_instance(self, instance): instance_vars = {} for key in vars(instance): value = getattr(instance, key) key = self.to_safe('ec2_' + key) # Handle complex types # state/previous_state changed to properties in boto in https://github.com/boto/boto/commit/a23c379837f698212252720d2af8dec0325c9518 if key == 'ec2__state': instance_vars['ec2_state'] = instance.state or '' instance_vars['ec2_state_code'] = instance.state_code elif key == 'ec2__previous_state': instance_vars['ec2_previous_state'] = instance.previous_state or '' instance_vars['ec2_previous_state_code'] = instance.previous_state_code elif type(value) in [int, bool]: instance_vars[key] = value elif isinstance(value, six.string_types): instance_vars[key] = value.strip() elif type(value) == type(None): instance_vars[key] = '' elif key == 'ec2_region': instance_vars[key] = value.name elif key == 
'ec2__placement': instance_vars['ec2_placement'] = value.zone elif key == 'ec2_tags': for k, v in value.items(): if self.expand_csv_tags and ',' in v: v = list(map(lambda x: x.strip(), v.split(','))) key = self.to_safe('ec2_tag_' + k) instance_vars[key] = v elif key == 'ec2_groups': group_ids = [] group_names = [] for group in value: group_ids.append(group.id) group_names.append(group.name) instance_vars["ec2_security_group_ids"] = ','.join([str(i) for i in group_ids]) instance_vars["ec2_security_group_names"] = ','.join([str(i) for i in group_names]) elif key == 'ec2_block_device_mapping': instance_vars["ec2_block_devices"] = {} for k, v in value.items(): instance_vars["ec2_block_devices"][ os.path.basename(k) ] = v.volume_id else: pass # TODO Product codes if someone finds them useful #print key #print type(value) #print value return instance_vars def get_host_info_dict_from_describe_dict(self, describe_dict): ''' Parses the dictionary returned by the API call into a flat list of parameters. This method should be used only when 'describe' is used directly because Boto doesn't provide specific classes. ''' # I really don't agree with prefixing everything with 'ec2' # because EC2, RDS and ElastiCache are different services. # I'm just following the pattern used until now to not break any # compatibility. 
host_info = {} for key in describe_dict: value = describe_dict[key] key = self.to_safe('ec2_' + self.uncammelize(key)) # Handle complex types # Target: Memcached Cache Clusters if key == 'ec2_configuration_endpoint' and value: host_info['ec2_configuration_endpoint_address'] = value['Address'] host_info['ec2_configuration_endpoint_port'] = value['Port'] # Target: Cache Nodes and Redis Cache Clusters (single node) if key == 'ec2_endpoint' and value: host_info['ec2_endpoint_address'] = value['Address'] host_info['ec2_endpoint_port'] = value['Port'] # Target: Redis Replication Groups if key == 'ec2_node_groups' and value: host_info['ec2_endpoint_address'] = value[0]['PrimaryEndpoint']['Address'] host_info['ec2_endpoint_port'] = value[0]['PrimaryEndpoint']['Port'] replica_count = 0 for node in value[0]['NodeGroupMembers']: if node['CurrentRole'] == 'primary': host_info['ec2_primary_cluster_address'] = node['ReadEndpoint']['Address'] host_info['ec2_primary_cluster_port'] = node['ReadEndpoint']['Port'] host_info['ec2_primary_cluster_id'] = node['CacheClusterId'] elif node['CurrentRole'] == 'replica': host_info['ec2_replica_cluster_address_'+ str(replica_count)] = node['ReadEndpoint']['Address'] host_info['ec2_replica_cluster_port_'+ str(replica_count)] = node['ReadEndpoint']['Port'] host_info['ec2_replica_cluster_id_'+ str(replica_count)] = node['CacheClusterId'] replica_count += 1 # Target: Redis Replication Groups if key == 'ec2_member_clusters' and value: host_info['ec2_member_clusters'] = ','.join([str(i) for i in value]) # Target: All Cache Clusters elif key == 'ec2_cache_parameter_group': host_info["ec2_cache_node_ids_to_reboot"] = ','.join([str(i) for i in value['CacheNodeIdsToReboot']]) host_info['ec2_cache_parameter_group_name'] = value['CacheParameterGroupName'] host_info['ec2_cache_parameter_apply_status'] = value['ParameterApplyStatus'] # Target: Almost everything elif key == 'ec2_security_groups': # Skip if SecurityGroups is None # (it is possible to have the 
key defined but no value in it). if value is not None: sg_ids = [] for sg in value: sg_ids.append(sg['SecurityGroupId']) host_info["ec2_security_group_ids"] = ','.join([str(i) for i in sg_ids]) # Target: Everything # Preserve booleans and integers elif type(value) in [int, bool]: host_info[key] = value # Target: Everything # Sanitize string values elif isinstance(value, six.string_types): host_info[key] = value.strip() # Target: Everything # Replace None by an empty string elif type(value) == type(None): host_info[key] = '' else: # Remove non-processed complex types pass return host_info def get_host_info(self): ''' Get variables about a specific host ''' if len(self.index) == 0: # Need to load index from cache self.load_index_from_cache() if not self.args.host in self.index: # try updating the cache self.do_api_calls_update_cache() if not self.args.host in self.index: # host might not exist anymore return self.json_format_dict({}, True) (region, instance_id) = self.index[self.args.host] instance = self.get_instance(region, instance_id) return self.json_format_dict(self.get_host_info_dict_from_instance(instance), True) def push(self, my_dict, key, element): ''' Push an element onto an array that may not have been defined in the dict ''' group_info = my_dict.setdefault(key, []) if isinstance(group_info, dict): host_list = group_info.setdefault('hosts', []) host_list.append(element) else: group_info.append(element) def push_group(self, my_dict, key, element): ''' Push a group as a child of another group. 
''' parent_group = my_dict.setdefault(key, {}) if not isinstance(parent_group, dict): parent_group = my_dict[key] = {'hosts': parent_group} child_groups = parent_group.setdefault('children', []) if element not in child_groups: child_groups.append(element) def get_inventory_from_cache(self): ''' Reads the inventory from the cache file and returns it as a JSON object ''' cache = open(self.cache_path_cache, 'r') json_inventory = cache.read() return json_inventory def load_index_from_cache(self): ''' Reads the index from the cache file sets self.index ''' cache = open(self.cache_path_index, 'r') json_index = cache.read() self.index = json.loads(json_index) def write_to_cache(self, data, filename): ''' Writes data in JSON format to a file ''' json_data = self.json_format_dict(data, True) cache = open(filename, 'w') cache.write(json_data) cache.close() def uncammelize(self, key): temp = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', key) return re.sub('([a-z0-9])([A-Z])', r'\1_\2', temp).lower() def to_safe(self, word): ''' Converts 'bad' characters in a string to underscores so they can be used as Ansible groups ''' regex = "[^A-Za-z0-9\_" if not self.replace_dash_in_groups: regex += "\-" return re.sub(regex + "]", "_", word) def json_format_dict(self, data, pretty=False): ''' Converts a dict to a JSON object and dumps it as a formatted string ''' if pretty: return json.dumps(data, sort_keys=True, indent=2) else: return json.dumps(data) # Run the script Ec2Inventory() ================================================ FILE: cloud/aws-ansible/files/sources.list ================================================ deb http://mirrors.yun-idc.com/ubuntu/ trusty main restricted universe multiverse deb http://mirrors.yun-idc.com/ubuntu/ trusty-security main restricted universe multiverse deb http://mirrors.yun-idc.com/ubuntu/ trusty-updates main restricted universe multiverse deb http://mirrors.yun-idc.com/ubuntu/ trusty-backports main restricted universe multiverse 
================================================
FILE: cloud/aws-ansible/roles/aws/tasks/main.yml
================================================
---

# One VPC is shared by every cluster: two subnets (one per availability
# zone), an internet gateway and a default route through it.
- name: vpc setup
  ec2_vpc:
    region: cn-north-1
    state: present
    cidr_block: "172.233.0.0/16"
    resource_tags:
      Name: "pingcap-vpc-ansible"
      ManagedBy: "tidb-ansible"
      Creator: "ansible-auto"
    subnets:
      - cidr: "172.233.1.0/24"
        az: "cn-north-1a"
        resource_tags:
          Name: "pingcap-subnet-1a"
          Environment: "test"
          ManagedBy: "tidb-ansible"
          Creator: "ansible-auto"
          Tier: "db"
      - cidr: "172.233.2.0/24"
        az: "cn-north-1b"
        resource_tags:
          Name: "pingcap-subnet-1b"
          Environment: "test"
          ManagedBy: "tidb-ansible"
          Creator: "ansible-auto"
          Tier: "db"
    internet_gateway: true  # assign internet
    route_tables:
      - subnets:
          - "172.233.1.0/24"
          - "172.233.2.0/24"
        routes:
          - dest: "0.0.0.0/0"
            gw: igw
  register: vpc

# existing cluster
# Security group shared by all instances created below.
- name: tidb cluster group
  ec2_group:
    region: cn-north-1
    state: present
    name: "ansible-sg-by-{{ managed_by }}"
    description: "vpc security group by {{ creator }}"
    vpc_id: "{{ vpc.vpc_id }}"
    rules:
      # NOTE(review): tcp with from_port/to_port 0 only matches port 0;
      # confirm whether "all traffic inside the group" was intended.
      - proto: tcp
        from_port: 0
        to_port: 0
        group_name: "ansible-sg-by-{{ managed_by }}"
      # every protocol from inside the VPC
      - proto: -1
        from_port: 0
        to_port: 0
        cidr_ip: "172.233.0.0/16"
      # ports open to the world
      - proto: tcp
        from_port: 22
        to_port: 22
        cidr_ip: "0.0.0.0/0"
      - proto: tcp
        from_port: 3000
        to_port: 3000
        cidr_ip: "0.0.0.0/0"
      - proto: tcp
        from_port: 3306
        to_port: 3306
        cidr_ip: "0.0.0.0/0"
      - proto: tcp
        from_port: 4000
        to_port: 4000
        cidr_ip: "0.0.0.0/0"
      - proto: tcp
        from_port: 8000
        to_port: 8000
        cidr_ip: "0.0.0.0/0"
      - proto: tcp
        from_port: 4567
        to_port: 4567
        cidr_ip: "0.0.0.0/0"
      - proto: tcp
        from_port: 9000
        to_port: 9050
        cidr_ip: "0.0.0.0/0"
      - proto: tcp
        from_port: 9090  # prometheus
        to_port: 9091  # pushgateway
        cidr_ip: "0.0.0.0/0"
      - proto: tcp
        from_port: 9200
        to_port: 9200
        cidr_ip: "0.0.0.0/0"
    # outbound
    rules_egress:
      - proto: -1
        from_port: 0
        to_port: 0
        cidr_ip: "0.0.0.0/0"
  register: security_group

# The four tasks below keep exactly "<role>_count" instances per role,
# counted by the (Type, ManagedBy) tag pair, all in the first subnet.
- name: tikv servers
  ec2:
    region: cn-north-1
    key_name: pingcap
    group_id: "{{ security_group.group_id }}"
    vpc_subnet_id: "{{ vpc.subnets[0].id }}"
    assign_public_ip: true
    instance_type: "{{ tikv_instance_type }}"
    image: "{{ image_ami }}"
    wait: true
    wait_timeout: 500
    # volumes:
    #   - device_name: /dev/xvdb
    #     volume_type: gp2
    #     volume_size: 80
    #     delete_on_termination: true
    # count: 1
    instance_tags:
      Name: "tikv-by-{{ creator }}"
      ManagedBy: "{{ managed_by }}"
      Creator: "{{ creator }}"
      Type: "tikv"
    count_tag:
      Type: "tikv"
      ManagedBy: "{{ managed_by }}"
    exact_count: "{{ tikv_count }}"

- name: pd servers
  ec2:
    region: cn-north-1
    key_name: pingcap
    group_id: "{{ security_group.group_id }}"
    vpc_subnet_id: "{{ vpc.subnets[0].id }}"
    assign_public_ip: true
    instance_type: "{{ pd_instance_type }}"
    image: "{{ image_ami }}"
    wait: true
    wait_timeout: 500
    # count: 1
    instance_tags:
      Name: "pd-by-{{ creator }}"
      ManagedBy: "{{ managed_by }}"
      Creator: "{{ creator }}"
      Type: "pd"
    count_tag:
      Type: "pd"
      ManagedBy: "{{ managed_by }}"
    exact_count: "{{ pd_count }}"

- name: tidb servers
  ec2:
    region: cn-north-1
    key_name: pingcap
    group_id: "{{ security_group.group_id }}"
    vpc_subnet_id: "{{ vpc.subnets[0].id }}"
    assign_public_ip: true
    instance_type: "{{ tidb_instance_type }}"
    image: "{{ image_ami }}"
    wait: true
    wait_timeout: 500
    # count: 1
    instance_tags:
      Name: "tidb-by-{{ creator }}"
      ManagedBy: "{{ managed_by }}"
      Creator: "{{ creator }}"
      Type: "tidb"
    count_tag:
      Type: "tidb"
      ManagedBy: "{{ managed_by }}"
    exact_count: "{{ tidb_count }}"

- name: monitoring servers
  ec2:
    region: cn-north-1
    key_name: pingcap
    group_id: "{{ security_group.group_id }}"
    vpc_subnet_id: "{{ vpc.subnets[0].id }}"
    assign_public_ip: true
    instance_type: "{{ monitoring_instance_type }}"
    image: "{{ image_ami }}"
    wait: true
    wait_timeout: 500
    # volumes:
    #   - device_name: /dev/xvdb
    #     volume_type: gp2
    #     volume_size: 50
    #     delete_on_termination: false
    instance_tags:
      Name: "mon-by-{{ creator }}"
      ManagedBy: "{{ managed_by }}"
      Creator: "{{ creator }}"
      Type: "monitoring"
    count_tag:
      Type: "monitoring"
      ManagedBy: "{{ managed_by }}"
    exact_count: "{{ monitoring_count }}"
================================================ FILE: cloud/aws-ansible/templates/aws.inventory.ini.j2 ================================================ [tidb_servers] {% if groups.tidb_servers is defined %} {% for item in groups.tidb_servers -%} {{ item }} {% endfor %} {% endif %} [tikv_servers] {% if groups.tikv_servers is defined %} {% for item in groups.tikv_servers -%} {{ item }} {% endfor %} {% endif %} [pd_servers] {% if groups.pd_servers is defined %} {% for item in groups.pd_servers -%} {{ item }} {% endfor %} {% endif %} [monitoring_servers] {% if groups.monitoring_servers is defined %} {% for item in groups.monitoring_servers -%} {{ item }} {% endfor %} {% endif %} [grafana_servers] {% if groups.monitoring_servers is defined %} {% for item in groups.monitoring_servers -%} {{ item }} {% endfor %} {% endif %} [monitored_servers] {% if groups.monitored_servers is defined %} {% for item in groups.monitored_servers -%} {{ item }} {% endfor %} {% endif %} [all:vars] ansible_user = ubuntu cluster_name = {{ creator }}-cluster ================================================ FILE: cloud/aws-ansible/vars.yml ================================================ --- tikv_count: 1 pd_count: 1 tidb_count: 1 # 1 or 0 monitoring_count: 1 creator: pingcap-auto managed_by: ansible-pingcap # CentOS 7 # image_ami: ami-c9a06aa4 # Ubuntu 14.04 # image_ami: ami-0220b23b # CoreOS # image_ami: ami-1ce93d71 # ubuntu 16.04, hvm-ssd # image_ami: ami-a0e136cd image_ami: ami-0220b23b # m3.medium: 4core 16G # t2.xlarge: 4 16 # t2.2xlarge: 8 32 # m3.medium: 1 3.75 1 x 4 (SSD) # m3.large: 2 7.5 1 x 32 (SSD) tidb_instance_type: r3.4xlarge # 2core 8G pd_instance_type: r3.2xlarge tikv_instance_type: i2.4xlarge monitoring_instance_type: t2.2xlarge # volume must be use with EBS only ================================================ FILE: collect_diagnosis.yml ================================================ --- # Copyright 2016 PingCAP, Inc. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Collects diagnosis data from every component into fetch_tmp_dir on the
# control machine, then packs it into a timestamped tarball in fetch_dir.

# Start from an empty fetch_tmp_dir and make sure both directories exist.
- hosts: localhost
  tags:
    - always
  tasks:
    - name: clean up fetch_tmp_dir
      file:
        path: "{{ fetch_tmp_dir }}"
        state: absent

    - name: create fetch directories
      file:
        path: "{{ item }}"
        state: directory
        mode: "0755"
      with_items:
        - "{{ fetch_tmp_dir }}"
        - "{{ fetch_dir }}"

- name: collect pd diagnosis information
  hosts: pd_servers
  tags:
    - pd
  roles:
    - collector_pd

- name: collect tikv diagnosis information
  hosts: tikv_servers
  tags:
    - tikv
  roles:
    - collector_tikv

- name: collect tidb diagnosis information
  hosts: tidb_servers
  tags:
    - tidb
  roles:
    - collector_tidb

- name: collect pump diagnosis information
  hosts: pump_servers
  tags:
    - pump
  roles:
    - collector_pump

- name: collect prometheus metric data
  hosts: monitoring_servers
  tags:
    - prometheus
  roles:
    - collector_prometheus

- name: collect host information
  hosts: monitored_servers
  roles:
    - collector_host
  tags:
    - host

# Pack everything that was fetched and remove the temporary directory.
- hosts: localhost
  tags:
    - always
  tasks:
    - name: collect inventory.ini
      shell: "cd {{ fetch_tmp_dir }} && cp {{ playbook_dir }}/inventory.ini ."

    - name: get datetime
      shell: date +%Y%m%d_%H%M%S
      register: datetime
      changed_when: false

    - set_fact:
        archive_name: "collect_diagnosis_{{ datetime.stdout | trim }}.tar.gz"

    # tar is run from playbook_dir and is given only the basename of
    # fetch_tmp_dir.
    - name: archive all diagnosis files
      shell: "cd {{ playbook_dir }} && tar czvf {{ fetch_dir }}/{{ archive_name }} {{ fetch_tmp_dir | basename }}"

    - name: clean up fetch_tmp_dir
      file:
        path: "{{ fetch_tmp_dir }}"
        state: absent

    - name: display the file path of collect_diagnosis tarball
      debug:
        msg: "collect_diagnosis tarball: {{ fetch_dir }}/{{ archive_name }}"

================================================
FILE: common_tasks/add_evict_leader_scheduler.yml
================================================
---

# Every step exists twice: a plain-HTTP task that runs when enable_tls is
# false or undefined, and an HTTPS twin (client certificate, store_id_tls)
# that runs when enable_tls is true.

# Delete a possibly existing scheduler first; retried until PD answers
# "scheduler not found".
- name: remove evict-leader-scheduler
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/schedulers/evict-leader-scheduler-{{ store_id }}"
    method: DELETE
    status_code: 200,500,404
    return_content: yes
  register: scheduler_info
  until: "'scheduler not found' in scheduler_info.content"
  retries: 3
  delay: 5
  when: not enable_tls|default(false)

- name: remove evict-leader-scheduler when enable_tls|default(false)
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/schedulers/evict-leader-scheduler-{{ store_id_tls }}"
    validate_certs: no
    client_cert: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}.pem"
    client_key: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}-key.pem"
    method: DELETE
    status_code: 200,500,404
    return_content: yes
  register: scheduler_info_tls
  until: "'scheduler not found' in scheduler_info_tls.content"
  retries: 3
  delay: 5
  when: enable_tls|default(false)

- name: add evict-leader-scheduler
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/schedulers"
    method: POST
    status_code: 200
    body_format: json
    body:
      name: "evict-leader-scheduler"
      store_id: "{{ store_id }}"
  when: not enable_tls|default(false)

- name: add evict-leader-scheduler when enable_tls|default(false)
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/schedulers"
    validate_certs: no
    client_cert: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}.pem"
    client_key: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}-key.pem"
    method: POST
    status_code: 200
    body_format: json
    body:
      name: "evict-leader-scheduler"
      store_id: "{{ store_id_tls }}"
  when: enable_tls|default(false)

# Poll the store (18 tries, 10 s apart) until leader_count is below 1 or
# absent. failed_when: false means running out of retries does not fail
# the play.
- name: check tikv's leader count
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/store/{{ store_id }}"
    method: GET
    return_content: yes
    body_format: json
    status_code: 200
  register: store_info
  until: (store_info.json.status.leader_count is defined and store_info.json.status.leader_count|int < 1) or store_info.json.status.leader_count is not defined
  retries: 18
  delay: 10
  failed_when: false
  when: not enable_tls|default(false)

- name: check tikv's leader count when enable_tls|default(false)
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/store/{{ store_id_tls }}"
    validate_certs: no
    client_cert: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}.pem"
    client_key: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}-key.pem"
    method: GET
    return_content: yes
    body_format: json
    status_code: 200
  register: store_info_tls
  until: (store_info_tls.json.status.leader_count is defined and store_info_tls.json.status.leader_count|int < 1) or store_info_tls.json.status.leader_count is not defined
  retries: 18
  delay: 10
  failed_when: false
  when: enable_tls|default(false)

- name: display leader_count
  debug:
    msg: "leader_count: {{ store_info.json.status.leader_count|default(0) }}"
  when: not enable_tls|default(false)

- name: display leader_count when enable_tls|default(false)
  debug:
    msg: "leader_count: {{ store_info_tls.json.status.leader_count|default(0) }}"
  when: enable_tls|default(false)

================================================
FILE: common_tasks/create_grafana_api_keys.yml
================================================
---

# Creates the Grafana API keys listed in grafana_api_keys and stores each
# new key in grafana_api_keys_dir on the control machine. If the key named
# "grafana_apikey" exists in Grafana but its local key file is missing, the
# key is deleted and created again.

- name: Ensure grafana API Key directory exists
  file:
    path: "{{ grafana_api_keys_dir }}"
    state: directory
  delegate_to: localhost

- name: Check grafana API Key list
  uri:
    url: "http://{{ grafana_host }}:{{ grafana_port }}/api/auth/keys"
    user: "{{ grafana_admin_user }}"
    password: "{{ grafana_admin_password }}"
    force_basic_auth: yes
    return_content: yes
  register: existing_api_keys

- name: Check grafana API Key file existed
  stat:
    path: "{{ grafana_api_keys_dir }}/grafana_apikey.key"
  register: grafana_apikey_file
  delegate_to: localhost

# Remember the id of the existing "grafana_apikey" entry so that it can be
# deleted below.
- set_fact:
    apikey_id: "{{ item }}"
  with_items: "{{ existing_api_keys.json|json_query(apikey_id_query) }}"
  vars:
    apikey_id_query: "[?name=='grafana_apikey'].id"
  when:
    - ((existing_api_keys['json'] | selectattr("name", "equalto", "grafana_apikey")) | list) | length == 1
    - grafana_apikey_file.stat.exists == False

- debug:
    var: apikey_id
  when:
    - ((existing_api_keys['json'] | selectattr("name", "equalto", "grafana_apikey")) | list) | length == 1
    - grafana_apikey_file.stat.exists == False

- name: Delete grafana API Key when grafana API Key file is missing
  uri:
    url: "http://{{ grafana_host }}:{{ grafana_port }}/api/auth/keys/{{ apikey_id }}"
    user: "{{ grafana_admin_user }}"
    password: "{{ grafana_admin_password }}"
    force_basic_auth: yes
    method: DELETE
  when:
    - ((existing_api_keys['json'] | selectattr("name", "equalto", "grafana_apikey")) | list) | length == 1
    - grafana_apikey_file.stat.exists == False

# Runs for keys that do not exist yet, and for every item when the
# "grafana_apikey" entry was just deleted above.
- name: Create grafana API Key
  uri:
    url: "http://{{ grafana_host }}:{{ grafana_port }}/api/auth/keys"
    user: "{{ grafana_admin_user }}"
    password: "{{ grafana_admin_password }}"
    force_basic_auth: yes
    method: POST
    body_format: json
    body: "{{ item | to_json }}"
  with_items: "{{ grafana_api_keys }}"
  when: (((existing_api_keys['json'] | selectattr("name", "equalto", item['name'])) | list) | length == 0) or (((existing_api_keys['json'] | selectattr("name", "equalto", "grafana_apikey")) | list) | length == 1 and grafana_apikey_file.stat.exists == False)
  register: new_api_keys

# Only results that carry a JSON answer (i.e. keys actually created) are
# written to disk.
- name: Create grafana API key file
  become: no
  copy:
    dest: "{{ grafana_api_keys_dir }}/{{ item['item']['name'] }}.key"
    content: "{{ item['json']['key'] }}"
    backup: no
  when: item['json'] is defined
  with_items: "{{ new_api_keys['results'] }}"
  delegate_to: localhost

================================================
FILE: common_tasks/get_pd_leader.yml
================================================
---

# Registers the PD leader API answer as pd_leader_info.
- name: get PD leader info
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/leader"
    method: GET
    return_content: yes
    status_code: 200
  register: pd_leader_info

================================================
FILE: common_tasks/get_pd_leader_tls.yml
================================================
---

# HTTPS variant of get_pd_leader.yml; registers the same variable name.
- name: get PD leader info when enable_tls|default(false)
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/leader"
    validate_certs: no
    client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
    client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
    method: GET
    return_content: yes
    status_code: 200
  register: pd_leader_info

================================================
FILE: common_tasks/get_pd_name.yml
================================================
---

# Sets pd_name to the name of the PD member whose client_urls is exactly
# [http://<pd_addr>].
- name: get PD name
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/members"
    method: GET
    return_content: yes
    status_code: 200
  register: pd_info

- set_fact:
    pd_name_list: "{{ pd_info.json.members | json_query(query) }}"
  vars:
    query: '[?client_urls==[`http://{{ pd_addr }}`]].name'

- set_fact:
    pd_name: "{{ pd_name_list[0] }}"

================================================
FILE: common_tasks/get_pd_name_tls.yml
================================================
---

# HTTPS variant of get_pd_name.yml; matches on [https://<pd_addr>].
- name: get PD name
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/members"
    validate_certs: no
    client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
    client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
    method: GET
    return_content: yes
    status_code: 200
  register: pd_info

- set_fact:
    pd_name_list: "{{ pd_info.json.members | json_query(query) }}"
  vars:
    query: '[?client_urls==[`https://{{ pd_addr }}`]].name'

- set_fact:
    pd_name: "{{ pd_name_list[0] }}"

================================================
FILE: common_tasks/get_pd_tikv_addr.yml
================================================
---

# Builds pd_addr from the first host of the pd_servers group and tikv_addr
# from the current host.
- set_fact:
    pd_host: "{{ hostvars[groups.pd_servers[0]].ansible_host | default(hostvars[groups.pd_servers[0]].inventory_hostname) }}"
    pd_client_port: "{{ hostvars[groups.pd_servers[0]].pd_client_port }}"

- set_fact:
    pd_addr: "{{ pd_host }}:{{ pd_client_port }}"
    tikv_addr: "{{ ansible_host }}:{{ tikv_port }}"

- name: display pd addr
  debug:
    var: pd_addr

- name: display tikv addr
  debug:
    var: tikv_addr

================================================
FILE: common_tasks/get_store_id.yml
================================================
---

# Sets store_id to the id of the PD store whose address equals tikv_addr,
# and fails when no such store is registered.
- name: get store info from PD
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/stores"
    method: GET
    return_content: yes
    status_code: 200
  register: stores_info

# The loop is empty, and store_id stays undefined, when no store matches.
- set_fact:
    store_id: "{{ item }}"
  with_items: "{{ stores_info.json|json_query(store_id_query) }}"
  vars:
    store_id_query: "stores[?store.address==`{{ tikv_addr }}`].store.id"

- name: display store id
  debug:
    var: store_id

- name: check store_id is defined
  fail:
    msg: "The tikv node of {{ tikv_addr }} is not registered in this cluster."
  when: store_id is not defined

================================================
FILE: common_tasks/get_store_id_tls.yml
================================================
---

# HTTPS variant of get_store_id.yml: sets store_id_tls and, like the HTTP
# variant, fails early when the node is not registered in PD.
- name: get store info from PD when enable_tls|default(false)
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/stores"
    validate_certs: no
    client_cert: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}.pem"
    client_key: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}-key.pem"
    method: GET
    return_content: yes
    status_code: 200
  register: stores_info_tls

# The loop is empty, and store_id_tls stays undefined, when no store
# matches.
- set_fact:
    store_id_tls: "{{ item }}"
  with_items: "{{ stores_info_tls.json|json_query(store_id_query) }}"
  vars:
    store_id_query: "stores[?store.address==`{{ tikv_addr }}`].store.id"

- name: display store id
  debug:
    var: store_id_tls

# Same guard as get_store_id.yml, so a missing store is reported here with
# a clear message instead of as an undefined variable in a later task.
- name: check store_id_tls is defined
  fail:
    msg: "The tikv node of {{ tikv_addr }} is not registered in this cluster."
  when: store_id_tls is not defined

================================================
FILE: common_tasks/remove_evict_leader_scheduler.yml
================================================
---

# Removes the evict-leader scheduler of this store; unlike the removal in
# add_evict_leader_scheduler.yml, only HTTP 200 is accepted here.
- name: remove evict-leader-scheduler
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/schedulers/evict-leader-scheduler-{{ store_id }}"
    method: DELETE
    status_code: 200
  when: not enable_tls|default(false)

- name: remove evict-leader-scheduler when enable_tls|default(false)
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/schedulers/evict-leader-scheduler-{{ store_id_tls }}"
    validate_certs: no
    client_cert: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}.pem"
    client_key: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}-key.pem"
    method: DELETE
    status_code: 200
  when: enable_tls|default(false)

================================================
FILE: common_tasks/transfer_pd_leader.yml
================================================
---

# Expects pd_leader_info and pd_name to be set by the caller.
- set_fact:
    pd_leader_name: "{{ pd_leader_info.json.name }}"

- name: display PD leader name
  debug:
    var: pd_leader_name

# The leader is only asked to resign when this host is the leader and the
# pd_servers group has at least three members.
- name: transfer PD leader to another PD server
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/leader/resign"
    method: POST
    status_code: 200
  when:
    - groups['pd_servers'] | length >= 3
    - pd_leader_name == pd_name
    - not enable_tls|default(false)
# HTTPS twin of the resign request: same conditions, client certificate
# taken from pd_cert_dir.
- name: transfer PD leader to another PD server when enable_tls|default(false)
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/leader/resign"
    method: POST
    status_code: 200
    validate_certs: false
    client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
    client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
  when:
    - groups['pd_servers'] | length >= 3
    - pd_leader_name == pd_name
    - enable_tls|default(false)

- name: wait for transfering PD leader
  pause:
    seconds: 10
  when:
    - groups['pd_servers'] | length >= 3
    - pd_leader_name == pd_name

# Poll (12 tries, 10 s apart) until PD reports a leader other than this
# host. failed_when: false means running out of retries does not fail the
# play.
- name: check current PD leader
  uri:
    url: "http://{{ pd_addr }}/pd/api/v1/leader"
    method: GET
    status_code: 200
    body_format: json
    return_content: true
  register: pd_leader_info
  until: >-
    pd_leader_info.json is defined and pd_leader_info.json.name is defined
    and pd_leader_info.json.name != pd_name
  retries: 12
  delay: 10
  failed_when: false
  when:
    - groups['pd_servers'] | length >= 3
    - pd_leader_name == pd_name
    - not enable_tls|default(false)

- name: check current PD leader when enable_tls|default(false)
  uri:
    url: "https://{{ pd_addr }}/pd/api/v1/leader"
    method: GET
    status_code: 200
    body_format: json
    return_content: true
    validate_certs: false
    client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
    client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
  register: pd_leader_info_tls
  until: >-
    pd_leader_info_tls.json is defined and pd_leader_info_tls.json.name is
    defined and pd_leader_info_tls.json.name != pd_name
  retries: 12
  delay: 10
  failed_when: false
  when:
    - groups['pd_servers'] | length >= 3
    - pd_leader_name == pd_name
    - enable_tls|default(false)

================================================
FILE: conf/alertmanager.yml
================================================
global:
  # The smarthost and SMTP sender used for mail notifications.
  smtp_smarthost: 'localhost:25'
  smtp_from: 'alertmanager@example.org'
  smtp_auth_username: 'alertmanager'
  smtp_auth_password: 'password'
  # smtp_require_tls: true

  # The Slack webhook URL.
# slack_api_url: '' route: # A default receiver receiver: "db-alert-email" # The labels by which incoming alerts are grouped together. For example, # multiple alerts coming in for cluster=A and alertname=LatencyHigh would # be batched into a single group. group_by: ['env','instance','alertname','type','group','job'] # When a new group of alerts is created by an incoming alert, wait at # least 'group_wait' to send the initial notification. # This way ensures that you get multiple alerts for the same group that start # firing shortly after another are batched together on the first # notification. group_wait: 30s # When the first notification was sent, wait 'group_interval' to send a batch # of new alerts that started firing for that group. group_interval: 3m # If an alert has successfully been sent, wait 'repeat_interval' to # resend them. repeat_interval: 3m routes: # - match: # receiver: webhook-kafka-adapter # continue: true # - match: # env: test-cluster # receiver: db-alert-slack # - match: # env: test-cluster # receiver: db-alert-email receivers: # - name: 'webhook-kafka-adapter' # webhook_configs: # - send_resolved: true # url: 'http://10.0.3.6:28082/v1/alertmanager' #- name: 'db-alert-slack' # slack_configs: # - channel: '#alerts' # username: 'db-alert' # icon_emoji: ':bell:' # title: '{{ .CommonLabels.alertname }}' # text: '{{ .CommonAnnotations.summary }} {{ .CommonAnnotations.description }} expr: {{ .CommonLabels.expr }} http://172.0.0.1:9093/#/alerts' - name: 'db-alert-email' email_configs: - send_resolved: true to: 'xxx@xxx.com' ================================================ FILE: conf/drainer.toml ================================================ # drainer Configuration. # the interval time (in seconds) of detect pumps' status detect-interval = 10 # Use the specified compressor algorithm to compress payload between pump and drainer # compressor = "gzip" # syncer Configuration. [syncer] # Assume the upstream sql-mode. 
# If this is set, drainer will use the sql-mode to parse DDL statements # sql-mode = "STRICT_TRANS_TABLES,NO_ENGINE_SUBSTITUTION" # disable sync these schema ignore-schemas = "INFORMATION_SCHEMA,PERFORMANCE_SCHEMA,mysql" # number of binlog events in a transaction batch txn-batch = 20 # work count to execute binlogs # if the latency between drainer and downstream(mysql or tidb) is too high, you might want to increase this # to get higher throughput by higher concurrent write to the downstream worker-count = 16 # whether to disable the SQL feature of splitting a single binlog event. # If it is set to "true", binlog events are restored to a single transaction for synchronization based on the order of binlogs. # If the downstream service is MySQL, set it to "False". disable-dispatch = false # safe mode will split update to delete and insert safe-mode = false # downstream storage, equal to --dest-db-type # valid values are "mysql", "file", "tidb", "kafka" db-type = "mysql" # ignore syncing the txn with specified commit ts to downstream ignore-txn-commit-ts = [] # replicate-do-db takes priority over replicate-do-table if they have the same db name # regular expressions are supported: a value starting with '~' is treated as a regular expression. # replicate-do-db = ["~^b.*","s1"] # [[syncer.replicate-do-table]] # db-name ="test" # tbl-name = "log" # [[syncer.replicate-do-table]] # db-name ="test" # tbl-name = "~^a.*" # disable sync these table # [[syncer.ignore-table]] # db-name = "test" # tbl-name = "log" # the downstream mysql protocol database [syncer.to] host = "127.0.0.1" user = "root" password = "" port = 3306 # Uncomment this if you want to use file as db-type. # [syncer.to] # dir = "data.drainer" # when db-type is kafka, you can uncomment this to config the down stream kafka, it will be the global default kafka config # [syncer.to] # only need config one of zookeeper-addrs and kafka-addrs, will get kafka address if zookeeper-addrs is configured.
# zookeeper-addrs = "127.0.0.1:2181" # kafka-addrs = "127.0.0.1:9092" # kafka-version = "0.8.2.0" # kafka-max-messages = 1024 # the topic name drainer will push msg, the default name is _obinlog # be careful don't use the same name if run multi drainer instances # topic-name = "" ================================================ FILE: conf/pd.yml ================================================ --- # default configuration file for pd in yaml format global: # lease: 3 # tso-save-interval: "3s" security: log: #level: "info" # file logging file: # max log file size in MB # max-size: 300 # max log file keep days # max-days: 28 # maximum number of old log files to retain # max-backups: 7 # rotate log by day # log-rotate: true metric: schedule: # max-merge-region-size: 20 # max-merge-region-keys: 200000 # split-merge-interval: "1h" # max-snapshot-count: 3 # max-pending-peer-count: 16 # max-store-down-time: "30m" # leader-schedule-limit: 4 # region-schedule-limit: 64 # replica-schedule-limit: 64 # merge-schedule-limit: 8 # enable-one-way-merge: false replication: # The number of replicas for each region. # max-replicas: 3 # The label keys specified the location of a store. # The placement priorities is implied by the order of label keys. # For example, ["zone", "rack"] means that we should place replicas to # different zones first, then to different racks if we don't have enough zones. # location-labels: [] dashboard: ## Configurations below are for the TiDB Dashboard embedded in the PD. ## The path of the CA certificate used to verify the TiDB server in TLS. # tidb-cacert-path: "" ## The path of the certificate used to connect to TiDB server in TLS. # tidb-cert-path: "" ## The path of the certificate private key. # tidb-key-path: "" ## The public path prefix to serve Dashboard urls. It can be set when Dashboard ## is running behind a reverse proxy. Do not configure it if you access ## Dashboard directly. 
# public-path-prefix: "/dashboard" ## When enabled, request will be proxied to the instance running Dashboard ## internally instead of resulting in a 307 redirection. # internal-proxy: false ## When enabled, usage data will be sent to PingCAP for improving user experience. # enable-telemetry: true ================================================ FILE: conf/pump.yml ================================================ --- # default configuration file for pump in yaml format global: # an integer value to control expiry date of the binlog data, indicates for how long (in days) the binlog data would be stored. # must be bigger than 0 # gc: 7 # number of seconds between heartbeat ticks (in 2 seconds) # heartbeat-interval: 2 security: # Path of file that contains list of trusted SSL CAs for connection with cluster components. # ssl-ca: "/path/to/ca.pem" # Path of file that contains X509 certificate in PEM format for connection with cluster components. # ssl-cert: "/path/to/drainer.pem" # Path of file that contains X509 key in PEM format for connection with cluster components. # ssl-key: "/path/to/drainer-key.pem" storage: # Set to true (by default) to guarantee reliability by ensuring binlog data is flushed to the disk. # sync-log: true # stop writing when the available disk space is less than the configured size # 42 MB -> 42000000, 42 mib -> 44040192 # default: 10 gib # stop-write-at-available-space: "10 gib" ================================================ FILE: conf/spark-defaults.yml ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License.
You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Default system properties included when running spark-submit. # This is useful for setting default environmental settings. # Example: # spark.eventLog.dir: "hdfs://namenode:8021/directory" # spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" spark.eventLog.enabled: false spark.driver.memory: 2g # TiSpark configuration items # PD Cluster Addresses, split by comma. # Do not need to fill, it will be automatically generated through ansible # spark.tispark.pd.addresses: 127.0.0.1:2379 # Max frame size of GRPC response spark.tispark.grpc.framesize: 2147483647 # GRPC timeout time in seconds spark.tispark.grpc.timeout_in_sec: 100 # Metastore reload period in seconds spark.tispark.meta.reload_period_in_sec: 60 # If allow aggregation pushdown (in case of busy TiKV nodes) # spark.tispark.plan.allow_agg_pushdown: true # If allow index double read (which might cause heavy pressure on TiKV) # spark.tispark.plan.allow_index_double_read: false # How many row key in batch for concurrent index scan # spark.tispark.index.scan_batch_size: 2000000 # Maximal threads for index scan retrieving row keys (shared among tasks inside each JVM) # spark.tispark.index.scan_concurrency: 2 # Maximal threads for table scan (shared among tasks inside each JVM) spark.tispark.table.scan_concurrency: 256 # Can be "Low", "Normal", "High" ,which impacts resource to get in TiKV. 
Low is recommended for not disturbing OLTP workload spark.tispark.request.command.priority: "Low" # Whether to use streaming for response fetching # spark.tispark.coprocess.streaming: false # A comma separated list of expressions. In case you have very old version of TiKV, you might disable some of the expression push-down if not supported # spark.tispark.plan.unsupported_pushdown_exprs: "" # If index scan handles for one region exceeds this limit in original request, downgrade the request to a full table scan rather than original planned index scan # spark.tispark.plan.downgrade.index_threshold: 100000 # An integer, represents timezone offset to UTC time(like 28800, GMT+8), this value will be added to requests issued to TiKV # spark.tispark.request.timezone.offset: 28800 #Whether to load statistics info automatically during database mapping # spark.tispark.statistics.auto_load: true # spark.tispark.plan.allow_index_read: true spark.sql.extensions: org.apache.spark.sql.TiExtensions ================================================ FILE: conf/spark-env.yml ================================================ #!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # This file is sourced when running various Spark programs. 
# Copy it as spark-env.sh and edit that to configure Spark for your site. # Options read when launching programs locally with # ./bin/run-example or ./bin/spark-submit # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node # - SPARK_PUBLIC_DNS, to set the public dns name of the driver program # - SPARK_CLASSPATH, default classpath entries to append # Options read by executors and drivers running inside the cluster # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node # - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program # - SPARK_CLASSPATH, default classpath entries to append # - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data # - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos # Options read in YARN client mode # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files # - SPARK_EXECUTOR_INSTANCES, Number of executors to start (Default: 2) # - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). # - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) # - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) # Options for the daemons used in the standalone deploy mode # - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname # - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master # - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") # - SPARK_WORKER_CORES, to set the number of cores to use on this machine # - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 
1000m, 2g) # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node # - SPARK_WORKER_DIR, to set the working directory of worker processes # - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") # - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). # - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") # - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") # - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") # - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers # Generic options for the daemons used in the standalone deploy mode # - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) # - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) # - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) # - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) # - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) # - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. 
# export JAVA_HOME, to set jdk home # Maybe you can get cores and memory through the following configuration # ansible spark_master,spark_slaves -m shell -a 'cat /proc/cpuinfo| grep "processor"| wc -l' # ansible spark_master,spark_slaves -m shell -a "free" | grep 'Mem' | awk '{print int($2/1024/1024/1.25)}' SPARK_EXECUTOR_CORES: 5 SPARK_EXECUTOR_MEMORY: 10g SPARK_WORKER_CORES: 5 SPARK_WORKER_MEMORY: 10g ================================================ FILE: conf/ssl/ca-config.json ================================================ { "signing": { "default": { "expiry": "876000h" }, "profiles": { "server": { "expiry": "876000h", "usages": [ "signing", "key encipherment", "server auth", "client auth" ] }, "client": { "expiry": "876000h", "usages": [ "signing", "key encipherment", "client auth" ] } } } } ================================================ FILE: conf/ssl/ca-csr.json ================================================ { "CN": "My own CA", "key": { "algo": "rsa", "size": 2048 }, "names": [ { "C": "CN", "L": "Beijing", "O": "PingCAP", "ST": "Beijing" } ] } ================================================ FILE: conf/tidb-lightning.yml ================================================ --- ### tidb-lightning configuration lightning: # check if the cluster satisfies the minimum requirement before starting # check-requirements: true # table-concurrency controls the maximum handled tables concurrently while reading Mydumper SQL files. # index-concurrency controls the maximum handled index concurrently while reading Mydumper SQL files. # They can affect the tikv-importer memory and disk usage. # table-concurrency + index-concurrency must be <= max-open-engines value in tikv-importer.toml index-concurrency: 2 table-concurrency: 6 # region-concurrency changes the concurrency number of data. It is set to the number of logical CPU cores by default and needs no configuration. # in mixed configuration, you can set it to 75% of the size of logical CPU cores.
# region-concurrency default to runtime.NumCPU() # region-concurrency: # io-concurrency controls the maximum IO concurrency io-concurrency: 5 # logging level: "info" max-size: 128 # MB max-days: 28 max-backups: 14 checkpoint: # Whether to enable checkpoints. # While importing, Lightning will record which tables have been imported, so even if Lightning or other component # crashed, we could start from a known good state instead of redoing everything. enable: true # The schema name (database name) to store the checkpoints schema: "tidb_lightning_checkpoint" # Where to store the checkpoints. # Set to "file" to store as a local file. # Set to "mysql" to store into a remote MySQL-compatible database # driver: "file" # The data source name (DSN) indicating the location of the checkpoint storage. # For "file" driver, the DSN is a path. If not specified, Lightning would default to "/tmp/CHKPTSCHEMA.pb". # For "mysql" driver, the DSN is a URL in the form "USER:PASS@tcp(HOST:PORT)/". # If not specified, the TiDB server from the [tidb] section will be used to store the checkpoints. # dsn: "/tmp/tidb_lightning_checkpoint.pb" # Whether to keep the checkpoints after all data are imported. If false, the checkpoints will be deleted. The schema # needs to be dropped manually, however. # keep-after-success: false tikv_importer: # delivery back end ("tidb" or "importer") backend: "importer" # action on duplicated entry ("error", "ignore" or "replace") # on-duplicate: "replace" mydumper: # block size of file reading read-block-size: 65536 # Byte (default = 64 KB) # minimum size (in terms of source data file) of each batch of import. # Lightning will split a large table into multiple engine files according to this size. # batch-size: 107374182400 # Byte (default = 100 GiB) # Engine file needs to be imported sequentially. Due to table-concurrency, multiple engines will be # imported nearly the same time, and this will create a queue and this wastes resources. 
Therefore, # Lightning will slightly increase the size of the first few batches to properly distribute # resources. The scale up is controlled by this parameter, which expresses the ratio of duration # between the "import" and "write" steps with full concurrency. This can be calculated as the ratio # (import duration / write duration) of a single table of size around 1 GB. The exact timing can be # found in the log. If "import" is faster, the batch size anomaly is smaller, and a ratio of # zero means uniform batch size. This value should be in the range (0 <= batch-import-ratio < 1). # batch-import-ratio: 0.75 # the source data directory of Mydumper. tidb-lightning will automatically create the corresponding database and tables based on the schema file in the directory. # data-source-dir: "/data/mydumper" # If no-schema is set to true, tidb-lightning will obtain the table schema information from tidb-server, # instead of creating the database or tables based on the schema file of data-source-dir. # This applies to manually creating tables or the situation where the table schema exists in TiDB. no-schema: false # the character set of the schema files; only supports one of: # - utf8mb4: the schema files must be encoded as UTF-8, otherwise will emit errors # - gb18030: the schema files must be encoded as GB-18030, otherwise will emit errors # - auto: (default) automatically detect if the schema is UTF-8 or GB-18030, error if the encoding is neither # - binary: do not try to decode the schema files # note that the *data* files are always parsed as binary regardless of schema encoding. # character-set: "auto" # CSV files are imported according to MySQL's LOAD DATA INFILE rules. # See https://pingcap.com/docs/tools/lightning/csv/ for details of these settings csv: separator: ',' delimiter: '"' header: true not-null: false 'null': \N backslash-escape: true trim-last-separator: false # configuration for TiDB (pick one of them if it has many TiDB servers) and the PD server.
tidb: # the target cluster information # the listening address of tidb-server. Setting one of them is enough. # host: "127.0.0.1" # port: 4000 # user: "root" # password: "" # table schema information is fetched from TiDB via this status-port. # status-port: 10080 # Lightning uses some code of TiDB (used as a library) and the flag controls its log level. log-level: "error" # Set tidb session variables to speed up checksum/analyze table. # See https://pingcap.com/docs/sql/statistics/#control-analyze-concurrency for the meaning of each setting build-stats-concurrency: 20 distsql-scan-concurrency: 100 index-serial-scan-concurrency: 20 checksum-table-concurrency: 16 # cron performs some periodic actions in background cron: # duration between which Lightning will automatically refresh the import mode status. # should be shorter than the corresponding TiKV setting switch-mode: '5m' # the interval at which the import progress will be printed to the log. log-progress: '5m' # post-restore provides some options which will be executed after all kv data has been imported into the tikv cluster. # the execution order is (if set to true): checksum -> compact -> analyze post_restore: # if it is set to true, tidb-lightning will perform the ADMIN CHECKSUM TABLE operation on the tables one by one. checksum: true # compaction is performed automatically starting v2.1.6. These settings should be left as `false`. # level-1-compact: false # compact: false # if it is set to true, tidb-lightning will perform the ANALYZE TABLE
operation on the tables one by one. # If the Analyze operation fails, you can analyze data manually on the MySQL client. analyze: true ================================================ FILE: conf/tidb.yml ================================================ --- # default configuration file for TiDB in yaml format global: # TiDB Configuration. # The socket file to use for connection. # socket: "" # Schema lease duration. Changing it is very dangerous; only do so if you know what you are doing. # lease: "45s" # The limit of concurrent executed sessions. # token-limit: 1000 # Only print a log when out of memory quota. # Valid options: ["log", "cancel"] # oom-action: "cancel" # Set the memory quota for a query in bytes. Default: 32GB # mem-quota-query: 34359738368 # Make "kill query" behavior compatible with MySQL. It's not recommended to # turn on this option when TiDB server is behind a proxy. # compatible-kill-query: false # check mb4 value in utf8 is used to control whether to check the mb4 characters when the charset is utf8. # check-mb4-value-in-utf8: true # max-index-length is used to deal with compatibility issues from v3.0.7 and previous version upgrades. It can only be in [3072, 3072*4]. # max-index-length: 3072 # alter-primary-key is used to control alter primary key feature. Default is false, indicating that the alter primary key feature is disabled. # If it is true, we can add the primary key by "alter table". However, if a table already exists before the switch is turned true and # the data type of its primary key column is an integer, the primary key cannot be dropped. # alter-primary-key: false # server-version is used to change the version string of TiDB in the following scenarios: # 1. the server version returned by builtin-function `VERSION()`. # 2. the server version filled in handshake packets of MySQL Connection Protocol, see https://dev.mysql.com/doc/internals/en/connection-phase-packets.html#packet-Protocol::Handshake for more details.
# if server-version = "", the default value(original TiDB version string) is used. server-version: "" # Whether new collations are enabled, as indicated by its name, this configuration entry takes effect ONLY when a TiDB cluster bootstraps for the first time. new_collations_enabled_on_first_bootstrap: false # When enabled, usage data (for example, instance versions) will be reported to PingCAP periodically for user experience analytics. # If this config is set to `false` on all TiDB servers, telemetry will be always disabled regardless of the value of the global variable `tidb_enable_telemetry`. # See PingCAP privacy policy for details: https://pingcap.com/en/privacy-policy/ # enable-telemetry: true log: # Log level: debug, info, warn, error, fatal. # level: "info" # Queries with execution time greater than this value will be logged. (Milliseconds) # slow-threshold: 300 # Queries with internal result greater than this value will be logged. # expensive-threshold: 10000 status: # TiDB status host. # status-host: "0.0.0.0" # TiDB status port. # status-port: 10080 # Prometheus pushgateway address, leaving it empty will disable prometheus push. # metrics-addr: "" # Prometheus client push interval in seconds, set "0" to disable prometheus push. # metrics-interval: 15 performance: # Max CPUs to use, 0 use number of CPUs in the machine. # max-procs: 0 # Max memory size to use, 0 use the total usable memory in the machine. # max-memory: 0 # StmtCountLimit limits the max count of statement inside a transaction. # stmt-count-limit: 5000 # Stats lease duration, which influences the time of analyze and stats load. # stats-lease: "3s" proxy_protocol: prepared_plan_cache: # enabled: false # capacity: 100 # memory-guard-ratio: 0.1 opentracing: # Enable opentracing. # enable: false # Whether to enable the rpc metrics.
# rpc-metrics: false sampler: # Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote # type: "const" # Param is a value passed to the sampler. # Valid values for Param field are: # - for "const" sampler, 0 or 1 for always false/true respectively # - for "probabilistic" sampler, a probability between 0 and 1 # - for "rateLimiting" sampler, the number of spans per second # - for "remote" sampler, param is the same as for "probabilistic" # and indicates the initial sampling rate before the actual one # is received from the mothership # param: 1.0 # SamplingServerURL is the address of jaeger-agent's HTTP sampling server # sampling-server-url: "" # MaxOperations is the maximum number of operations that the sampler # will keep track of. If an operation is not tracked, a default probabilistic # sampler will be used rather than the per operation specific sampler. # max-operations: 0 # SamplingRefreshInterval controls how often the remotely controlled sampler will poll # jaeger-agent for the appropriate sampling strategy. # sampling-refresh-interval: 0 reporter: # QueueSize controls how many spans the reporter can keep in memory before it starts dropping # new spans. The queue is continuously drained by a background go-routine, as fast as spans # can be sent out of process. # queue-size: 0 # BufferFlushInterval controls how often the buffer is force-flushed, even if it's not full. # It is generally not useful, as it only matters for very low traffic services. # buffer-flush-interval: 0 # LogSpans, when true, enables LoggingReporter that runs in parallel with the main reporter # and logs all submitted spans. Main Configuration.Logger must be initialized in the code # for this option to have any effect. # log-spans: false # LocalAgentHostPort instructs reporter to send spans to jaeger-agent at this address # local-agent-host-port: "" tikv_client: # Max gRPC connections that will be established with each tikv-server. 
# grpc-connection-count: 4 # After a duration of this time in seconds if the client doesn't see any activity it pings # the server to see if the transport is still alive. # grpc-keepalive-time: 10 # After having pinged for keepalive check, the client waits for a duration of Timeout in seconds # and if no activity is seen even after that the connection is closed. # grpc-keepalive-timeout: 3 # max time for commit command, must be twice bigger than raft election timeout. # commit-timeout: "41s" # Max batch size in gRPC. # max-batch-size: 128 # Overload threshold of TiKV. # overload-threshold: 200 # Max batch wait time in nanosecond to avoid waiting too long. 0 means disable this feature. # max-batch-wait-time: 0 # Batch wait size, to avoid waiting too long. # batch-wait-size: 8 txn_local_latches: binlog: # WriteTimeout specifies how long it will wait for writing binlog to pump. # write-timeout: "15s" # If IgnoreError is true, when writing binlog meets error, TiDB would stop writing binlog, # but still provide service. # ignore-error: false pessimistic_txn: # enable pessimistic transaction. # enable: true # max retry count for a statement in a pessimistic transaction. # max-retry-count: 256 experimental: # enable column attribute `auto_random` to be defined on the primary key column. allow-auto-random: false # enable creating expression index.
allow-expression-index: false ================================================ FILE: conf/tiflash-learner.yml ================================================ # TiKV config template # Human-readable big numbers: # File size(based on byte): KB, MB, GB, TB, PB # e.g.: 1_048_576: "1MB" # Time(based on ms): ms, s, m, h # e.g.: 78_000: "1.3m" readpool: storage: coprocessor: server: storage: pd: # This section will be overwritten by command line parameters metric: #address: "172.16.30.31:9531" #interval: "15s" #job: "tikv" raftstore: coprocessor: rocksdb: wal-dir: "" defaultcf: lockcf: writecf: raftdb: defaultcf: security: ca-path: "" cert-path: "" key-path: "" import: ================================================ FILE: conf/tiflash.yml ================================================ --- global: display_name: "TiFlash" default_profile: "default" mark_cache_size: 5368709120 listen_host: "0.0.0.0" flash: flash_cluster: refresh_interval: 20 update_rule_interval: 5 master_ttl: 60 proxy: status: logger: count: 20 size: "1000M" level: "debug" application: runAsDaemon: true raft: quotas: default: interval: result_rows: 0 read_rows: 0 execution_time: 0 queries: 0 errors: 0 duration: 3600 users: readonly: quota: "default" profile: "readonly" password: "" networks: ip: "::/0" default: quota: "default" profile: "default" password: "" networks: ip: "::/0" profiles: readonly: readonly: 1 default: load_balancing: "random" use_uncompressed_cache: 0 max_memory_usage: 10000000000 ================================================ FILE: conf/tikv-importer.yml ================================================ --- # TiKV Importer configuration file template global: # log file. # log level: trace, debug, info, warn, error, off. log-level: "info" server: # size of thread pool for the gRPC server. grpc-concurrency: 16 metric: # the Prometheus client push job name. job: "tikv-importer" # the Prometheus client push interval. interval: "15s" # the Prometheus Pushgateway address. 
# address: "" rocksdb: # the maximum number of concurrent background jobs. max-background-jobs: 32 defaultcf: # amount of data to build up in memory before flushing data to the disk. write-buffer-size: "1GB" # the maximum number of write buffers that are built up in memory. max-write-buffer-number: 8 # the compression algorithms used in different levels. # the algorithm at level-0 is used to compress KV data. # the algorithm at level-6 is used to compress SST files. # the algorithms at level-1 ~ level-5 are not used now. compression-per-level: ["lz4", "no", "no", "no", "no", "no", "lz4"] writecf: compression-per-level: ["lz4", "no", "no", "no", "no", "no", "lz4"] import: # this directory is used to store the data written by `tidb-lightning`. # import-dir: "/tmp/tikv/import" # the number of threads to handle RPC requests. num-threads: 16 # the number of concurrent import jobs. num-import-jobs: 24 # the stream channel window size. Stream will be blocked when the channel is full. stream-channel-window: 128 # maximum duration to prepare regions. # max-prepare-duration: "5m" # split regions into this size according to the importing data. # region-split-size: "512MB" # max-open-engines must be >= index-concurrency + table-concurrency value in tidb-lightning.toml max-open-engines: 8 # speed limit of uploading SST to TiKV (unit: byte/s) # upload-speed-limit: "512MB" # minimum ratio of target store available space: store_available_space / store_capacity # Importer will pause to upload SST to target store if its available ratio is less than # this value, and give the store some time window to balance regions.
min-available-ratio: 0.05 # Note: the machine's memory size should be more than # (write-buffer-size * max-write-buffer-number * 2) + (num-import-jobs * region-split-size * 2) ================================================ FILE: conf/tikv.yml ================================================ --- ## The default configuration file for TiKV in YAML format ## TiKV config template ## Human-readable big numbers: ## File size(based on byte): KB, MB, GB, TB, PB ## e.g.: 1_048_576 = "1MB" ## Time(based on ms): ms, s, m, h ## e.g.: 78_000 = "1.3m" global: ## Log levels: trace, debug, info, warning, error, critical. ## Note that `debug` and `trace` are only available in development builds. # log-level: "info" ## Timespan between rotating the log files. ## Once this timespan passes, log files will be rotated, i.e. existing log file will have a ## timestamp appended to its name and a new file will be created. # log-rotation-timespan: "24h" readpool: ## Configurations for the single thread pool serving read requests. unified: ## The minimal working thread count of the thread pool. # min-thread-count: 1 ## The maximum working thread count of the thread pool. ## The default value is max(4, LOGICAL_CPU_NUM * 0.8). # max-thread-count: 8 ## Size of the stack for each thread in the thread pool. # stack-size: "10MB" ## Max running tasks of each worker, reject if exceeded. # max-tasks-per-worker: 2000 storage: ## Whether to use the unified read pool to handle storage requests. # use-unified-pool: false ## The following configurations only take effect when `use-unified-pool` is false. ## Size of the thread pool for high-priority operations. # high-concurrency: 4 ## Size of the thread pool for normal-priority operations. # normal-concurrency: 4 ## Size of the thread pool for low-priority operations. # low-concurrency: 4 ## Max running high-priority operations of each worker, reject if exceeded. 
# max-tasks-per-worker-high: 2000 ## Max running normal-priority operations of each worker, reject if exceeded. # max-tasks-per-worker-normal: 2000 ## Max running low-priority operations of each worker, reject if exceeded. # max-tasks-per-worker-low: 2000 ## Size of the stack for each thread in the thread pool. # stack-size: "10MB" coprocessor: ## Whether to use the unified read pool to handle coprocessor requests. # use-unified-pool: true ## The following configurations only take effect when `use-unified-pool` is false. ## Most read requests from TiDB are sent to the coprocessor of TiKV. high/normal/low-concurrency is ## used to set the number of threads of the coprocessor. ## If there are many read requests, you can increase these config values (but keep them within the ## number of system CPU cores). For example, for a 32-core machine deployed with TiKV, you can even ## set these configs to 30 in heavy read scenarios. ## If CPU_NUM > 8, the default thread pool size for coprocessors is set to CPU_NUM * 0.8. # high-concurrency: 8 # normal-concurrency: 8 # low-concurrency: 8 # max-tasks-per-worker-high: 2000 # max-tasks-per-worker-normal: 2000 # max-tasks-per-worker-low: 2000 # stack-size: "10MB" server: ## Advertise listening address for client communication. ## If not set, `addr` will be used. # advertise-addr: "" ## Size of the thread pool for the gRPC server. # grpc-concurrency: 4 ## The number of max concurrent streams/requests on a client connection. # grpc-concurrent-stream: 1024 ## The number of connections with each TiKV server to send Raft messages. # grpc-raft-conn-num: 1 ## Amount to read ahead on individual gRPC streams. # grpc-stream-initial-window-size: "2MB" ## Time to wait before sending out a ping to check if server is still alive. ## This is only for communications between TiKV instances. # grpc-keepalive-time: "10s" ## Time to wait before closing the connection without receiving KeepAlive ping Ack.
# grpc-keepalive-timeout: "3s" ## How many snapshots can be sent concurrently. # concurrent-send-snap-limit: 32 ## How many snapshots can be received concurrently. # concurrent-recv-snap-limit: 32 ## Max allowed recursion level when decoding Coprocessor DAG expression. # end-point-recursion-limit: 1000 ## Max time to handle Coprocessor requests before timeout. # end-point-request-max-handle-duration: "60s" ## Max bytes that snapshot can be written to disk in one second. ## It should be set based on your disk performance. # snap-max-write-bytes-per-sec: "100MB" ## Attributes about this server, e.g. `{ zone = "us-west-1", disk = "ssd" }`. # labels: {} storage: ## The number of slots in Scheduler latches, which controls write concurrency. ## In most cases you can use the default value. When importing data, you can set it to a larger ## value, but no more than 2097152 # scheduler-concurrency: 524288 ## Scheduler's worker pool size, i.e. the number of write threads. ## It should be less than total CPU cores. When there are frequent write operations, set it to a ## higher value. More specifically, you can run `top -H -p tikv-pid` to check whether the threads ## named `sched-worker-pool` are busy. # scheduler-worker-pool-size: 4 ## When the pending write bytes exceeds this threshold, the "scheduler too busy" error is displayed. # scheduler-pending-write-threshold: "100MB" ## TiKV will create a temporary file in {{data-dir}} to reserve some space, which is named 'space_placeholder_file'. ## When the disk has no free space you could remove this temporary file so that TiKV can execute compaction ## job to reclaim disk space, which requires some extra temporary space. # reserve-space: "2GB" block-cache: ## Whether to create a shared block cache for all RocksDB column families. ## ## Block cache is used by RocksDB to cache uncompressed blocks. Big block cache can speed up ## read. It is recommended to turn on shared block cache.
Since only the total cache size needs ## to be set, it is easier to configure. In most cases it should be able to auto-balance cache ## usage between column families with standard LRU algorithm. ## ## The rest of config in the storage.block-cache section is effective only when shared block ## cache is on. # shared: true ## Size of the shared block cache. Normally it should be tuned to 30%-50% of system's total ## memory. When the config is not set, it is decided by the sum of the following fields or ## their default value: ## * rocksdb.defaultcf.block-cache-size or 25% of system's total memory ## * rocksdb.writecf.block-cache-size or 15% of system's total memory ## * rocksdb.lockcf.block-cache-size or 2% of system's total memory ## * raftdb.defaultcf.block-cache-size or 2% of system's total memory ## ## To deploy multiple TiKV nodes on a single physical machine, configure this parameter ## explicitly. Otherwise, the OOM problem might occur in TiKV. # capacity: "1GB" pd: ## PD endpoints. # endpoints: [] metric: ## Prometheus client push interval. ## Setting the value to 0s stops Prometheus client from pushing. # interval: "15s" ## Prometheus PushGateway address. ## Leaving it empty stops Prometheus client from pushing. # address: "" ## Prometheus client push job name. ## Note: A node id will be appended automatically, e.g., "tikv_1". # job: "tikv" raftstore: ## Store capacity, i.e. max data size allowed. ## If it is not set, disk capacity is used. # capacity: 0 ## Internal notify capacity. ## 40960 is suitable for about 7000 Regions. It is recommended to use the default value. # notify-capacity: 40960 ## Maximum number of internal messages to process in a tick. # messages-per-tick: 4096 ## Region heartbeat tick interval for reporting to PD. # pd-heartbeat-tick-interval: "60s" ## Store heartbeat tick interval for reporting to PD.
# pd-store-heartbeat-tick-interval: "10s" ## How long the peer will be considered down and reported to PD when it hasn't been active for this ## time. # max-peer-down-duration: "5m" ## Interval to check whether to start manual compaction for a Region. # region-compact-check-interval: "5m" ## Interval (s) to check Region whether the data are consistent. # consistency-check-interval: 0 ## Delay time before deleting a stale peer. # clean-stale-peer-delay: "10m" ## Use how many threads to handle log apply # apply-pool-size: 2 ## Use how many threads to handle raft messages # store-pool-size: 2 coprocessor: rocksdb: ## Maximum number of threads of RocksDB background jobs. ## The background tasks include compaction and flush. For detailed information why RocksDB needs to ## do compaction, see RocksDB-related materials. When write traffic (like the importing data size) ## is big, it is recommended to enable more threads. But set the number of the enabled threads ## smaller than that of CPU cores. For example, when importing data, for a machine with a 32-core ## CPU, set the value to 28. # max-background-jobs: 8 ## Represents the maximum number of threads that will concurrently perform a sub-compaction job by ## breaking it into multiple, smaller ones running simultaneously. # max-sub-compactions: 1 ## Number of open files that can be used by the DB. ## Value -1 means files opened are always kept open and RocksDB will prefetch index and filter ## blocks into block cache at startup. So if your database has a large working set, it will take ## several minutes to open the DB. You may need to increase this if your database has a large ## working set. You can estimate the number of files based on `target-file-size-base` and ## `target_file_size_multiplier` for level-based compaction. # max-open-files: 40960 ## RocksDB Write-Ahead Logs (WAL) recovery mode. 
## 0 : TolerateCorruptedTailRecords, tolerate incomplete record in trailing data on all logs; ## 1 : AbsoluteConsistency, We don't expect to find any corruption in the WAL; ## 2 : PointInTimeRecovery, Recover to point-in-time consistency; ## 3 : SkipAnyCorruptedRecords, Recovery after a disaster; # wal-recovery-mode: 2 ## RocksDB WAL directory. ## This config specifies the absolute directory path for WAL. ## If it is not set, the log files will be in the same directory as data. When you set the path to ## RocksDB directory in memory like in `/dev/shm`, you may want to set`wal-dir` to a directory on a ## persistent storage. See https://github.com/facebook/rocksdb/wiki/How-to-persist-in-memory-RocksDB-database . ## If there are two disks on the machine, storing RocksDB data and WAL logs on different disks can ## improve performance. # wal-dir: "/tmp/tikv/store" ## The following two fields affect how archived WAL will be deleted. ## 1. If both values are set to 0, logs will be deleted ASAP and will not get into the archive. ## 2. If `wal-ttl-seconds` is 0 and `wal-size-limit` is not 0, WAL files will be checked every 10 ## min and if total size is greater than `wal-size-limit`, they will be deleted starting with the ## earliest until `wal-size-limit` is met. All empty files will be deleted. ## 3. If `wal-ttl-seconds` is not 0 and `wal-size-limit` is 0, then WAL files will be checked every ## `wal-ttl-seconds / 2` and those that are older than `wal-ttl-seconds` will be deleted. ## 4. If both are not 0, WAL files will be checked every 10 min and both checks will be performed ## with ttl being first. ## When you set the path to RocksDB directory in memory like in `/dev/shm`, you may want to set ## `wal-ttl-seconds` to a value greater than 0 (like 86400) and backup your DB on a regular basis. ## See https://github.com/facebook/rocksdb/wiki/How-to-persist-in-memory-RocksDB-database . 
# wal-ttl-seconds: 0 # wal-size-limit: 0 ## Max RocksDB WAL size in total # max-total-wal-size: "4GB" ## RocksDB Statistics provides cumulative stats over time. ## Turning statistics on will introduce about 5%-10% overhead for RocksDB, but it can help you to ## know the internal status of RocksDB. # enable-statistics: true ## Dump statistics periodically in information logs. ## Same as RocksDB's default value (10 min). # stats-dump-period: "10m" ## Refer to: https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ ## If you want to use RocksDB on multi disks or spinning disks, you should set value at least 2MB. # compaction-readahead-size: 0 ## Max buffer size that is used by WritableFileWrite. # writable-file-max-buffer-size: "1MB" ## Use O_DIRECT for both reads and writes in background flush and compactions. # use-direct-io-for-flush-and-compaction: false ## Allows OS to incrementally sync files to disk while they are being written, asynchronously, ## in the background. # bytes-per-sync: "1MB" ## Allows OS to incrementally sync WAL to disk while it is being written. # wal-bytes-per-sync: "512KB" ## Options for `Titan`. titan: ## Enables or disables `Titan`. Note that Titan is still an experimental feature. Once ## enabled, it can't fall back. Forced fallback may result in data loss. # enabled: false ## Maximum number of threads of `Titan` background gc jobs. # max-background-gc: 1 ## Options for "Default" Column Family, which stores actual user data. defaultcf: ## Compression method (if any) is used to compress a block. ## no: kNoCompression ## snappy: kSnappyCompression ## zlib: kZlibCompression ## bzip2: kBZip2Compression ## lz4: kLZ4Compression ## lz4hc: kLZ4HCCompression ## zstd: kZSTD ## `lz4` is a compression algorithm with moderate speed and compression ratio. The compression ## ratio of `zlib` is high. It is friendly to the storage space, but its compression speed is ## slow. This compression occupies many CPU resources. ## Per level compression. 
## This config should be chosen carefully according to CPU and I/O resources. For example, if you ## use the compression mode of "no:no:lz4:lz4:lz4:zstd:zstd" and find much I/O pressure of the ## system (run the `iostat` command to find %util lasts 100%, or run the `top` command to find many ## iowaits) when writing (importing) a lot of data while the CPU resources are adequate, you can ## compress level-0 and level-1 and exchange CPU resources for I/O resources. If you use the ## compression mode of "no:no:lz4:lz4:lz4:zstd:zstd" and you find the I/O pressure of the system is ## not big when writing a lot of data, but CPU resources are inadequate. Then run the `top` command ## and choose the `-H` option. If you find a lot of bg threads (namely the compression thread of ## RocksDB) are running, you can exchange I/O resources for CPU resources and change the compression ## mode to "no:no:no:lz4:lz4:zstd:zstd". In a word, it aims at making full use of the existing ## resources of the system and improving TiKV performance in terms of the current resources. # compression-per-level: ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] ## The data block size. RocksDB compresses data based on the unit of block. ## Similar to page in other databases, block is the smallest unit cached in block-cache. Note that ## the block size specified here corresponds to uncompressed data. # block-size: "64KB" ## If you're doing point lookups you definitely want to turn bloom filters on. We use bloom filters ## to avoid unnecessary disk reads. Default bits_per_key is 10, which yields ~1% false positive ## rate. Larger `bloom-filter-bits-per-key` values will reduce false positive rate, but increase ## memory usage and space amplification. # bloom-filter-bits-per-key: 10 # level0-file-num-compaction-trigger: 4 ## Soft limit on number of level-0 files. 
## When the number of SST files of level-0 reaches the limit of `level0-slowdown-writes-trigger`, ## RocksDB tries to slow down the write operation, because too many SST files of level-0 can cause ## higher read pressure of RocksDB. # level0-slowdown-writes-trigger: 20 ## Maximum number of level-0 files. ## When the number of SST files of level-0 reaches the limit of `level0-stop-writes-trigger`, ## RocksDB stalls the new write operation. # level0-stop-writes-trigger: 36 ## Amount of data to build up in memory (backed by an unsorted log on disk) before converting to a ## sorted on-disk file. It is the RocksDB MemTable size. # write-buffer-size: "128MB" ## The maximum number of the MemTables. The data written into RocksDB is first recorded in the WAL ## log, and then inserted into MemTables. When the MemTable reaches the size limit of ## `write-buffer-size`, it turns into read only and generates a new MemTable receiving new write ## operations. The flush threads of RocksDB will flush the read only MemTable to the disks to become ## an SST file of level0. `max-background-flushes` controls the maximum number of flush threads. ## When the flush threads are busy, resulting in the number of the MemTables waiting to be flushed ## to the disks reaching the limit of `max-write-buffer-number`, RocksDB stalls the new operation. ## "Stall" is a flow control mechanism of RocksDB. When importing data, you can set the ## `max-write-buffer-number` value higher, like 10. # max-write-buffer-number: 5 ## The minimum number of write buffers that will be merged together before writing to storage. # min-write-buffer-number-to-merge: 1 ## Control maximum total data size for base level (level 1). ## When the level-1 data size reaches the limit value of `max-bytes-for-level-base`, the SST files ## of level-1 and their overlap SST files of level-2 will be compacted. 
The golden rule: the first ## reference principle of setting `max-bytes-for-level-base` is guaranteeing that the ## `max-bytes-for-level-base` value is roughly equal to the data volume of level-0. Thus ## unnecessary compaction is reduced. For example, if the compression mode is ## "no:no:lz4:lz4:lz4:lz4:lz4", the `max-bytes-for-level-base` value can be `write-buffer-size * 4`, ## because there is no compression of level-0 and level-1 and the trigger condition of compaction ## for level-0 is that the number of the SST files reaches 4 (the default value). When both level-0 ## and level-1 adopt compaction, it is necessary to analyze RocksDB logs to know the size of an SST ## file compressed from a MemTable. For example, if the file size is 32MB, the proposed value of ## `max-bytes-for-level-base` is 32MB * 4 = 128MB. # max-bytes-for-level-base: "512MB" ## Target file size for compaction. ## The SST file size of level-0 is influenced by the compaction algorithm of `write-buffer-size` ## and level0. `target-file-size-base` is used to control the size of a single SST file of level1 to ## level6. # target-file-size-base: "8MB" ## Max bytes for `compaction.max_compaction_bytes`. # max-compaction-bytes: "2GB" ## There are four different compaction priorities. ## 0 : ByCompensatedSize ## 1 : OldestLargestSeqFirst ## 2 : OldestSmallestSeqFirst ## 3 : MinOverlappingRatio # compaction-pri: 3 ## Enable read amplification statistics. ## value => memory usage (percentage of loaded blocks memory) ## 1 => 12.50 % ## 2 => 06.25 % ## 4 => 03.12 % ## 8 => 01.56 % ## 16 => 00.78 % # read-amp-bytes-per-bit: 0 ## Options for "Titan" for "Default" Column Family titan: ## The smallest value to store in blob files. Value smaller than ## this threshold will be inlined in base DB. # min-blob-size: "1KB" ## The compression algorithm used to compress data in blob files. ## Compression method. 
## no: kNoCompression ## snappy: kSnappyCompression ## zlib: kZlibCompression ## bzip2: kBZip2Compression ## lz4: kLZ4Compression ## lz4hc: kLZ4HCCompression ## zstd: kZSTD # blob-file-compression: "lz4" ## Specifies the cache size for blob records. # blob-cache-size: "0GB" ## If the ratio of discardable size of a blob file is larger than ## this threshold, the blob file will be GCed out. # discardable-ratio: 0.5 ## The mode used to process blob files. In read-only mode Titan ## stops writing value into blob log. In fallback mode Titan ## converts blob index into real value on flush and compaction. ## This option is especially useful for downgrading Titan. ## default: kNormal ## read-only: kReadOnly ## fallback: kFallback # blob-run-mode: "normal" ## Options for "Write" Column Family, which stores MVCC commit information writecf: ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. # compression-per-level: ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] # block-size: "64KB" ## Recommend to set it the same as `rocksdb.defaultcf.write-buffer-size`. # write-buffer-size: "128MB" # max-write-buffer-number: 5 # min-write-buffer-number-to-merge: 1 ## Recommend to set it the same as `rocksdb.defaultcf.max-bytes-for-level-base`.
# max-bytes-for-level-base: "512MB" # target-file-size-base: "8MB" # level0-file-num-compaction-trigger: 4 # level0-slowdown-writes-trigger: 20 # level0-stop-writes-trigger: 36 # cache-index-and-filter-blocks: true # pin-l0-filter-and-index-blocks: true # compaction-pri: 3 # read-amp-bytes-per-bit: 0 # dynamic-level-bytes: true lockcf: # compression-per-level: ["no", "no", "no", "no", "no", "no", "no"] # block-size: "16KB" # write-buffer-size: "128MB" # max-write-buffer-number: 5 # min-write-buffer-number-to-merge: 1 # max-bytes-for-level-base: "128MB" # target-file-size-base: "8MB" # level0-slowdown-writes-trigger: 20 # level0-stop-writes-trigger: 36 # cache-index-and-filter-blocks: true # pin-l0-filter-and-index-blocks: true # compaction-pri: 0 # read-amp-bytes-per-bit: 0 # dynamic-level-bytes: true raftdb: # max-background-jobs: 4 # max-sub-compactions: 2 # max-open-files: 40960 # max-manifest-file-size: "20MB" # create-if-missing: true # enable-statistics: true # stats-dump-period: "10m" # compaction-readahead-size: 0 # writable-file-max-buffer-size: "1MB" # use-direct-io-for-flush-and-compaction: false # enable-pipelined-write: true # allow-concurrent-memtable-write: false # bytes-per-sync: "1MB" # wal-bytes-per-sync: "512KB" # info-log-max-size: "1GB" # info-log-roll-time: "0" # info-log-keep-log-file-num: 10 # info-log-dir: "" # optimize-filters-for-hits: true defaultcf: ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. # compression-per-level: ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] # block-size: "64KB" ## Recommend to set it the same as `rocksdb.defaultcf.write-buffer-size`. # write-buffer-size: "128MB" # max-write-buffer-number: 5 # min-write-buffer-number-to-merge: 1 ## Recommend to set it the same as `rocksdb.defaultcf.max-bytes-for-level-base`. 
# max-bytes-for-level-base: "512MB" # target-file-size-base: "8MB" # level0-file-num-compaction-trigger: 4 # level0-slowdown-writes-trigger: 20 # level0-stop-writes-trigger: 36 # cache-index-and-filter-blocks: true # pin-l0-filter-and-index-blocks: true # compaction-pri: 0 # read-amp-bytes-per-bit: 0 # dynamic-level-bytes: true # optimize-filters-for-hits: true security: ## The path for TLS certificates. Empty string means disabling secure connections. # ca-path: "" # cert-path: "" # key-path: "" # cert-allowed-cn: [] ## Configurations for encryption at rest. Experimental. encryption: ## Encryption method to use for data files. ## Possible values are "plaintext", "aes128-ctr", "aes192-ctr" and "aes256-ctr". Value other than ## "plaintext" means encryption is enabled, in which case master key must be specified. # data-encryption-method: "plaintext" ## Specifies how often TiKV rotates data encryption key. # data-key-rotation-period: "7d" ## Specifies master key if encryption is enabled. There are three types of master key: ## ## * "plaintext": ## ## Plaintext as master key means no master key is given and only applicable when ## encryption is not enabled, i.e. data-encryption-method: "plaintext". This type doesn't ## have sub-config items. Example: ## ## master-key: ## type: "plaintext" ## ## * "kms": ## ## Use a KMS service to supply master key. Currently only AWS KMS is supported. This type of ## master key is recommended for production use. Example: ## ## master-key: ## type: "kms" ## ## KMS CMK key id. Must be a valid KMS CMK that the TiKV process has access to. ## ## In production it is recommended to grant access of the CMK to TiKV using IAM. ## key-id: "1234abcd-12ab-34cd-56ef-1234567890ab" ## ## AWS region of the KMS CMK. ## region: "us-west-2" ## ## (Optional) AWS KMS service endpoint. Only required when non-default KMS endpoint is ## ## desired.
## endpoint: "https://kms.us-west-2.amazonaws.com" ## ## * "file": ## ## Supply a custom encryption key stored in a file. It is recommended NOT to use in production, ## as it breaks the purpose of encryption at rest, unless the file is stored in tmpfs. ## The file must contain a 256-bit (32 bytes, regardless of key length implied by ## data-encryption-method) key encoded as hex string and end with newline ("\n"). Example: ## ## master-key: ## type: "file" ## path: "/path/to/master/key/file" ## # master-key: # type: "plaintext" ## Specifies the old master key when rotating master key. Same config format as master-key. ## The key is only accessed once during TiKV startup; after that TiKV does not need access to the key. ## And it is okay to leave the stale previous-master-key config after master key rotation. # previous-master-key: # type: "plaintext" import: pessimistic_txn: ## Enable pessimistic transaction # enabled: true ## Time to wait in milliseconds before responding to TiDB when pessimistic ## transactions encounter locks # wait-for-lock-timeout: "1s" ## If more than one transaction is waiting for the same lock, only the one with smallest ## start timestamp will be woken up immediately when the lock is released. Others will ## be woken up after `wake_up_delay_duration(ms)` to reduce contention and make the oldest ## one more likely to acquire the lock. # wake-up-delay-duration: "20ms" gc: ## The number of keys to GC in one batch. # batch-keys: 512 ## Max bytes that GC worker can write to rocksdb in one second. ## If it is set to 0, there is no limit.
# max-write-bytes-per-sec: "0"

================================================
FILE: create_users.yml
================================================
---
# Creates the deployment user on every host, authorizes the control
# machine's SSH public key for it, and grants it passwordless sudo.
# Expects the `username` variable to be supplied by the caller.
- hosts: all
  tasks:
    - name: create user
      user:
        name: "{{ username }}"
        shell: /bin/bash
        createhome: true

    # The public key is read from $HOME/.ssh/id_rsa.pub on the control machine.
    - name: set authorized key
      authorized_key:
        user: "{{ username }}"
        key: "{{ lookup('file', lookup('env','HOME')+ '/.ssh/id_rsa.pub') }}"
        state: present

    # `validate` runs visudo against the candidate file before it replaces
    # /etc/sudoers, so a malformed line can never be installed and lock
    # everyone out of sudo.
    - name: update sudoers file
      lineinfile:
        dest: /etc/sudoers
        insertafter: EOF
        line: '{{ username }} ALL=(ALL) NOPASSWD: ALL'
        regexp: '^{{ username }} .*'
        state: present
        validate: 'visudo -cf %s'

================================================
FILE: deploy.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# The Playbook of TiDB

# --- Pre-flight checks: tagged `always`, any failure aborts the run ---

- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

- name: check system environment
  hosts: monitored_servers
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_system_dynamic

- name: initializing deployment target
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

# --- Component configuration pre-checks, run on the first host of each group ---

- name: Pre-check PD configuration
  hosts: pd_servers[0]
  tags:
    - pd
  roles:
    - check_config_pd

- name: Pre-check TiKV configuration
  hosts: tikv_servers[0]
  tags:
    - tikv
  roles:
    - check_config_tikv

- name: Pre-check TiDB configuration
  hosts: tidb_servers[0]
  tags:
    - tidb
  roles:
    - check_config_tidb

# --- Monitoring and diagnostic components ---

- name: deploying node_exporter
  hosts: monitored_servers
  tags:
    - node_exporter
  roles:
    - node_exporter

- name: deploying blackbox_exporter
  hosts: monitored_servers
  tags:
    - blackbox_exporter
  roles:
    - blackbox_exporter

- name: deploying diagnostic tools
  hosts: monitored_servers
  tags:
    - collect_diagnosis
  roles:
    - collect_diagnosis

- name: deploying alertmanager
  hosts: alertmanager_servers
  tags:
    - alertmanager
  roles:
    - alertmanager

- name: deploying pushgateway
  hosts: monitoring_servers
  tags:
    - pushgateway
  roles:
    - pushgateway

- name: deploying prometheus
  hosts: monitoring_servers
  tags:
    - prometheus
  roles:
    - prometheus

- name: deploying grafana
  hosts: grafana_servers
  tags:
    - grafana
  roles:
    - grafana

# Only applied when binlog is enabled and a Kafka address is configured.
- name: deploying kafka_exporter
  hosts: kafka_exporter_servers
  tags:
    - kafka_exporter
  roles:
    - role: kafka_exporter
      when: 'enable_binlog|default(false) and kafka_addrs|default("") != ""'

# deploying TiDB cluster

- name: deploying PD cluster
  hosts: pd_servers
  tags:
    - pd
  roles:
    - pd

- name: deploying TiKV cluster
  hosts: tikv_servers
  tags:
    - tikv
  roles:
    - tikv

# The tiflash role is applied only when cpu_architecture is 'amd64'.
- name: deploying TiFlash cluster
  hosts: tiflash_servers
  tags:
    - tiflash
  roles:
    - role: tiflash
      when: cpu_architecture == 'amd64'

# The pump role is applied only when binlog is enabled.
- name: deploying pump cluster
  hosts: pump_servers
  tags:
    - pump
  roles:
    - role: pump
      when: enable_binlog|default(false)
# The tispark role runs alongside TiDB only when no dedicated spark_master /
# spark_slaves group is populated and the deployment method is binary.
- name: deploying TiDB cluster
  hosts: tidb_servers
  tags:
    - tidb
  roles:
    - tidb
    - role: tispark
      when: >-
        (groups.get('spark_master', []) | length == 0 or
        groups.get('spark_slaves', []) | length == 0) and
        (deployment_method == 'binary')

# Dedicated tispark cluster: requires both spark groups to be non-empty and a
# binary deployment.
- name: deploying tispark cluster
  hosts: spark_master,spark_slaves
  tags:
    - tispark
  roles:
    - role: tispark
      when: >-
        groups.get('spark_master', []) | length != 0 and
        groups.get('spark_slaves', []) | length != 0 and
        deployment_method == 'binary'

- name: deploying tidb-lightning
  hosts: lightning_server
  tags:
    - lightning
  roles:
    - tidb_lightning

- name: deploying tikv-importer
  hosts: importer_server
  tags:
    - lightning
  roles:
    - tikv_importer

# firewalld is configured with privilege escalation, and only when
# enable_firewalld is defined and truthy.
- name: finalizing deployment target
  hosts: all
  become: true
  roles:
    - role: firewalld
      when: enable_firewalld is defined and enable_firewalld

- name: deploying perf-tools
  hosts: monitored_servers
  tags:
    - always
  roles:
    - perf_tools

================================================
FILE: deploy_drainer.yml
================================================
---
# Copyright 2018 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# The Playbook of TiDB

- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

- name: check system environment
  hosts: monitored_servers
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_system_dynamic

- name: initializing deployment target
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

# The drainer role is applied only when binlog is enabled.
- name: deploying drainer(binlog cluster)
  hosts: drainer_servers
  tags:
    - drainer
  roles:
    - role: drainer
      when: enable_binlog|default(false)

- name: finalizing deployment target
  hosts: all
  become: true
  roles:
    - role: firewalld
      when: enable_firewalld is defined and enable_firewalld

================================================
FILE: deploy_ntp.yml
================================================
---
# Installs and starts the NTP service on every host. When `ntp_server` is
# defined, the service is stopped first, the clock is stepped once with
# ntpdate against that server, and the service is then started again.
- hosts: all
  tasks:
    - name: get facts
      setup:

    # Package names are passed straight to the package module instead of
    # looping with `with_items`, so each module call is one transaction.
    - name: RedHat family Linux distribution - make sure ntp, ntpstat have been installed
      yum:
        name: ntp
        state: present
      when:
        - ansible_os_family == "RedHat"

    - name: RedHat family Linux distribution - make sure ntpdate have been installed
      yum:
        name: ntpdate
        state: present
      when:
        - ansible_os_family == "RedHat"
        - ntp_server is defined

    - name: Debian family Linux distribution - make sure ntp, ntpstat have been installed
      apt:
        name:
          - ntp
          - ntpstat
        state: present
      when:
        - ansible_os_family == "Debian"

    - name: Debian family Linux distribution - make sure ntpdate have been installed
      apt:
        name: ntpdate
        state: present
      when:
        - ansible_os_family == "Debian"
        - ntp_server is defined

    # The NTP daemon is stopped before the one-shot ntpdate adjustment below.
    - name: RedHat family Linux distribution - make sure ntpd service has been stopped
      service:
        name: ntpd
        state: stopped
      when:
        - ansible_os_family == "RedHat"
        - ntp_server is defined

    - name: Debian family Linux distribution - make sure ntp service has been stopped
      service:
        name: ntp
        state: stopped
      when:
        - ansible_os_family == "Debian"
        - ntp_server is defined

    # `command` is used rather than `shell`: no shell features are needed, and
    # the value of ntp_server is then never interpreted by a shell.
    - name: "Adjust Time | start to adjust time with {{ ntp_server }}"
      command: ntpdate {{ ntp_server }}
      when: ntp_server is defined

    - name: RedHat family Linux distribution - make sure ntpd service has been started
      service:
        name: ntpd
        state: started
      when:
        - ansible_os_family == "RedHat"

    - name: Debian family Linux distribution - Make sure ntp service has been started
      service:
        name: ntp
        state: started
      when:
        - ansible_os_family == "Debian"

================================================
FILE: excessive_rolling_update.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# The rolling update playbook of TiDB

# --- Pre-flight checks: tagged `always`, any failure aborts the run ---

- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

- name: check system environment
  hosts: monitored_servers
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_system_dynamic

- name: gather all facts, and check dest
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

# Reads the version reported by the deployed tidb-server binary and refuses the
# update when that version sorts before "v2.0.1" while the target is
# "v2.1.0" or later (or "latest").
# NOTE(review): both comparisons are plain string comparisons.
- name: Pre-check for rolling update
  hosts: tidb_servers
  any_errors_fatal: true
  tags:
    - always
  tasks:
    - shell: "{{ deploy_dir }}/bin/tidb-server -V"
      register: current_version

    - name: Check whether can perform rolling update
      fail:
        msg: "Rolling update from {{ current_version.stdout_lines[0].replace(' ','').split(':')[1] }} to {{ tidb_version }} is forbidden"
      when:
        - current_version.stdout_lines[0].replace(' ','').split(':')[1] < "v2.0.1"
        - tidb_version >= "v2.1.0" or tidb_version == "latest"

# --- Component configuration pre-checks, run on the first host of each group ---

- name: Pre-check PD configuration
  hosts: pd_servers[0]
  tags:
    - pd
  roles:
    - check_config_pd

- name: Pre-check TiKV configuration
  hosts: tikv_servers[0]
  tags:
    - tikv
  roles:
    - check_config_tikv

- name: Pre-check TiDB configuration
  hosts: tidb_servers[0]
  tags:
    - tidb
  roles:
    - check_config_tidb

# Queries /pd/health through the first PD node and fails the run if the
# response body contains 'false'.
- hosts: pd_servers[0]
  any_errors_fatal: true
  serial: 1
  tags:
    - pd
  tasks:
    - name: Check pd cluster status
      uri:
        url: "http://{{ ansible_host }}:{{ pd_client_port }}/pd/health"
        method: GET
        return_content: true
        status_code: 200
      register: pd_status
      when: not enable_tls|default(false)

    - name: Check pd cluster status when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ pd_client_port }}/pd/health"
        validate_certs: false
        client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
        client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
        method: GET
        return_content: true
        status_code: 200
      register: pd_status_tls
      when: enable_tls|default(false)

    - name: Failed when one node of pd is unhealthy
      fail:
        msg: "Some pd nodes are unhealthy"
      when:
        - not enable_tls|default(false)
        - "'false' in pd_status.content"

    - name: Failed when one node of pd is unhealthy when enable_tls
      fail:
        msg: "Some pd nodes are unhealthy"
      when:
        - enable_tls|default(false)
        - "'false' in pd_status_tls.content"

# Sorts every PD host into the in-memory groups pd_servers_followers or
# pd_servers_leader, so the next play can update followers before the leader.
- hosts: pd_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - pd
  tasks:
    - set_fact:
        pd_addr: "{{ ansible_host }}:{{ pd_client_port }}"

    - include_tasks: "common_tasks/get_pd_leader.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_pd_leader_tls.yml"
      when: enable_tls|default(false)

    - set_fact:
        pd_leader_name: "{{ pd_leader_info.json.name }}"

    - include_tasks: "common_tasks/get_pd_name.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_pd_name_tls.yml"
      when: enable_tls|default(false)

    - name: Set pd follower list
      add_host:
        name: "{{ inventory_hostname }}"
        ansible_host: "{{ ansible_host }}"
        ansible_ssh_host: "{{ ansible_ssh_host }}"
        groups: pd_servers_followers
        deploy_dir: "{{ deploy_dir }}"
        pd_client_port: "{{ pd_client_port }}"
        pd_peer_port: "{{ pd_peer_port }}"
        pd_data_dir: "{{ pd_data_dir }}"
        pd_log_dir: "{{ pd_log_dir }}"
        pd_cert_dir: "{{ pd_cert_dir }}"
      when: pd_leader_name != pd_name

    - name: Set pd leader list
      add_host:
        name: "{{ inventory_hostname }}"
        ansible_host: "{{ ansible_host }}"
        ansible_ssh_host: "{{ ansible_ssh_host }}"
        groups: pd_servers_leader
        deploy_dir: "{{ deploy_dir }}"
        pd_client_port: "{{ pd_client_port }}"
        pd_peer_port: "{{ pd_peer_port }}"
        pd_data_dir: "{{ pd_data_dir }}"
        pd_log_dir: "{{ pd_log_dir }}"
        pd_cert_dir: "{{ pd_cert_dir }}"
      when: pd_leader_name == pd_name

# One PD node at a time: transfer leadership away, stop, apply the pd role,
# start, then wait for both the node and the cluster to report healthy.
- name: rolling update PD cluster
  hosts: pd_servers_followers, pd_servers_leader
  any_errors_fatal: true
  serial: 1
  tags:
    - pd

  pre_tasks:
    - set_fact:
        pd_addr: "{{ ansible_host }}:{{ pd_client_port }}"

    - include_tasks: "common_tasks/get_pd_name.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_pd_name_tls.yml"
      when: enable_tls|default(false)

    - name: display PD name
      debug:
        var: pd_name

    - name: display PD address
      debug:
        var: pd_addr

    - include_tasks: "common_tasks/get_pd_leader.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_pd_leader_tls.yml"
      when: enable_tls|default(false)

    - include_tasks: "common_tasks/transfer_pd_leader.yml"

    - name: stop PD by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_pd.sh
      when: process_supervision == 'supervise'

    # The unit stopped here is pd.service, while the unit started in
    # post_tasks is pd-<pd_client_port>.service; the names differ in the
    # original playbook and are kept as-is.
    - name: stop PD by systemd
      systemd:
        name: pd.service
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the PD port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pd_client_port }}"
        state: stopped
        msg: "the PD port {{ pd_client_port }} is not down"

  roles:
    - pd

  post_tasks:
    - name: start PD by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_pd.sh
      when: process_supervision == 'supervise'

    - name: start PD by systemd
      systemd:
        name: "pd-{{ pd_client_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the PD port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pd_client_port }}"
        state: started
        msg: "the PD port {{ pd_client_port }} is not up"

    # Polled up to 12 times, 5 seconds apart.
    - name: wait until the PD health page is available
      uri:
        url: "http://{{ ansible_host }}:{{ pd_client_port }}/health"
        return_content: true
      register: pd_http_result
      until: pd_http_result.status == 200 and 'true' in pd_http_result.content
      retries: 12
      delay: 5
      when: not enable_tls|default(false)

    - name: wait until the PD health page is available when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ pd_client_port }}/health"
        validate_certs: false
        client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
        client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
        return_content: true
      register: pd_https_result
      until: pd_https_result.status == 200 and 'true' in pd_https_result.content
      retries: 12
      delay: 5
      when: enable_tls|default(false)

    - name: wait until the PD cluster is available
      uri:
        url: "http://{{ ansible_host }}:{{ pd_client_port }}/pd/health"
        return_content: true
      register: pd_cluster_status
      until: pd_cluster_status.status == 200 and 'false' not in pd_cluster_status.content
      retries: 12
      delay: 5
      when: not enable_tls|default(false)

    - name: wait until the PD cluster is available when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ pd_client_port }}/pd/health"
        validate_certs: false
        client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
        client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
        return_content: true
      register: pd_cluster_status
      until: pd_cluster_status.status == 200 and 'false' not in pd_cluster_status.content
      retries: 12
      delay: 5
      when: enable_tls|default(false)

# One TiKV node at a time: add the evict-leader scheduler, stop, apply the tikv
# role, start, wait for the status page, then remove the scheduler.
- name: rolling update TiKV cluster
  hosts: tikv_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - tikv

  pre_tasks:
    - include_tasks: "common_tasks/get_pd_tikv_addr.yml"

    - include_tasks: "common_tasks/get_store_id.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_store_id_tls.yml"
      when: enable_tls|default(false)

    - include_tasks: "common_tasks/add_evict_leader_scheduler.yml"

    - name: stop TiKV by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_tikv.sh
      when: process_supervision == 'supervise'

    - name: stop TiKV by systemd
      systemd:
        name: "tikv-{{ tikv_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiKV port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_port }}"
        state: stopped
        msg: "the TiKV port {{ tikv_port }} is not down"

    # Best effort: a failure to read the pid file is ignored.
    - command: cat {{ deploy_dir }}/status/tikv.pid
      register: old_tikv_pid
      ignore_errors: true
      changed_when: false

    - name: display old tikv pid
      debug:
        msg: "tikv binary or docker pid: {{ old_tikv_pid.stdout }}"

  roles:
    - tikv

  post_tasks:
    - name: start TiKV by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_tikv.sh
      when: process_supervision == 'supervise'

    - name: start TiKV by systemd
      systemd:
        name: "tikv-{{ tikv_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiKV port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_port }}"
        state: started
        msg: "the TiKV port {{ tikv_port }} is not up"

    - name: wait until the TiKV status page is available
      uri:
        url: "http://{{ ansible_host }}:{{ tikv_status_port }}/status"
        return_content: true
      register: tikv_http_result
      until: tikv_http_result.status == 200
      retries: 12
      delay: 5
      when: not enable_tls|default(false)

    - name: wait until the TiKV status page is available when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ tikv_status_port }}/status"
        validate_certs: false
        client_cert: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}.pem"
        client_key: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}-key.pem"
        return_content: true
      register: tikv_https_result
      until: tikv_https_result.status == 200
      retries: 10
      delay: 5
      when: enable_tls|default(false)

    - command: cat {{ deploy_dir }}/status/tikv.pid
      register: new_tikv_pid
      ignore_errors: true
      changed_when: false

    - name: display new tikv pid
      debug:
        msg: "tikv binary or docker pid: {{ new_tikv_pid.stdout }}"

    - include_tasks: "common_tasks/remove_evict_leader_scheduler.yml"

# Every pump task and the pump role itself are skipped unless binlog is enabled.
- name: rolling update pump cluster
  hosts: pump_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - pump

  pre_tasks:
    - name: stop pump by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - pump
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: stop pump by systemd
      systemd:
        name: "pump-{{ pump_port }}.service"
        state: stopped
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the pump port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pump_port }}"
        state: stopped
        msg: "the pump port {{ pump_port }} is not down"
      when: enable_binlog|default(false)

  roles:
    - role: pump
      when: enable_binlog|default(false)

  post_tasks:
    - name: start pump by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh
      with_items:
        - pump
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: start pump by systemd
      systemd:
        name: "pump-{{ pump_port }}.service"
        state: started
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the pump port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pump_port }}"
        state: started
        msg: "the pump port {{ pump_port }} is not up"
      when: enable_binlog|default(false)

# One TiDB node at a time: stop, apply the tidb role, start, then wait until
# the status page answers 200 with 'TiDB' in its body.
- name: rolling update TiDB cluster
  hosts: tidb_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - tidb

  pre_tasks:
    - name: stop TiDB by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_tidb.sh
      when: process_supervision == 'supervise'

    - name: stop TiDB by systemd
      systemd:
        name: "tidb-{{ tidb_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiDB port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_port }}"
        state: stopped
        msg: "the TiDB port {{ tidb_port }} is not down"

  roles:
    - role: tidb

  post_tasks:
    - name: start TiDB by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_tidb.sh
      when: process_supervision == 'supervise'

    - name: start TiDB by systemd
      systemd:
        name: "tidb-{{ tidb_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiDB port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_port }}"
        state: started
        msg: "the TiDB port {{ tidb_port }} is not up"

    - name: wait until the TiDB status page is available
      uri:
        url: "http://{{ ansible_host }}:{{ tidb_status_port }}/status"
        return_content: true
      register: tidb_http_result
      until: tidb_http_result.status == 200 and 'TiDB' in tidb_http_result.content
      retries: 12
      delay: 5
      when: not enable_tls|default(false)

    - name: wait until the TiDB status page is available when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ tidb_status_port }}/status"
        validate_certs: false
        client_cert: "{{ tidb_cert_dir }}/tidb-server-{{ ansible_host }}.pem"
        client_key: "{{ tidb_cert_dir }}/tidb-server-{{ ansible_host }}-key.pem"
        return_content: true
      register: tidb_https_result
      until: tidb_https_result.status == 200 and 'TiDB' in tidb_https_result.content
      retries: 10
      delay: 5
      when: enable_tls|default(false)

# Every TiFlash task and the tiflash role are skipped unless cpu_architecture
# is 'amd64'. The status-page check runs only when TLS is disabled.
- name: rolling update TiFlash cluster
  hosts: tiflash_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - tiflash

  pre_tasks:
    - name: stop TiFlash by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_tiflash.sh
      when: process_supervision == 'supervise' and cpu_architecture == 'amd64'

    - name: stop TiFlash by systemd
      systemd:
        name: "tiflash-{{ tcp_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd' and cpu_architecture == 'amd64'

    - name: wait until the TiFlash port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ http_port }}"
        state: stopped
        msg: "the TiFlash port {{ http_port }} is not down"
      when: cpu_architecture == 'amd64'

  roles:
    - role: tiflash
      when: cpu_architecture == 'amd64'

  post_tasks:
    - name: start TiFlash by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_tiflash.sh
      when: process_supervision == 'supervise' and cpu_architecture == 'amd64'

    - name: start TiFlash by systemd
      systemd:
        name: "tiflash-{{ tcp_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd' and cpu_architecture == 'amd64'

    - name: wait until the TiFlash port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ http_port }}"
        state: started
        msg: "the TiFlash port {{ http_port }} is not up"
      when: cpu_architecture == 'amd64'

    - name: wait until the TiFlash status page is available
      uri:
        url: "http://{{ ansible_host }}:{{ http_port }}/?query=select%20version()"
        return_content: true
      register: tiflash_http_result
      until: tiflash_http_result.status == 200
      retries: 12
      delay: 5
      when: not enable_tls|default(false) and cpu_architecture == 'amd64'

- hosts: localhost
  tags:
    - always
  roles:
    - role: dashboard_topo

================================================
FILE: filter_plugins/tags.py
================================================
#!/usr/bin/env python
"""Custom Jinja2 filters used by the tidb-ansible playbooks and templates.

The filters are exposed to Ansible through ``FilterModule.filters()`` at the
bottom of this file.
"""

import re
import time
import copy
import json


def epoch_time_diff(t):
    """Return the number of seconds between epoch time ``t`` and now.

    ``t`` is converted with ``int()``; the difference is truncated to an int
    and is negative when ``t`` lies in the past.
    """
    return int(int(t) - time.time())


def with_default_dicts(d, *args):
    """Return a deep copy of ``d`` with missing keys filled in from ``args``.

    Each truthy mapping in ``args`` is applied in order:

    * keys absent from the result are copied from the mapping;
    * keys whose current value is a dict or ``None`` are merged recursively;
    * any other existing value is left untouched.

    A falsy ``d`` (``None``, ``{}``) is treated as an empty dict.
    """
    ret = copy.deepcopy(d) or {}
    for arg in args:
        if not arg:
            continue
        # Recurse first, based on the keys that already exist in ``ret``.
        merged = [
            (k, with_default_dicts(ret[k], arg[k]))
            for k in arg
            if k in ret and isinstance(ret[k], (dict, type(None)))
        ]
        ret.update(merged)
        # Then add the keys that are still missing.
        ret.update([(k, arg[k]) for k in arg if k not in ret])
    return ret


def split_string(d, seperator=None, maxsplit=-1):
    """Split ``d`` with ``str.split``; fall back to ``list(d)`` on failure.

    The fallback keeps the filter usable on values that are already
    sequences. Only ``Exception`` is caught so that ``KeyboardInterrupt`` and
    ``SystemExit`` still propagate.
    """
    try:
        return d.split(seperator, maxsplit)
    except Exception:
        return list(d)


def split_regex(d, seperator_pattern):
    """Split ``d`` on the regular expression ``seperator_pattern``.

    Falls back to ``list(d)`` when ``re.split`` cannot process ``d`` (for
    example when it is already a list), mirroring ``split_string``.
    """
    # The previous implementation referenced an undefined name ``string``
    # here and in the fallback, so every call raised NameError.
    try:
        return re.split(seperator_pattern, d)
    except Exception:
        return list(d)


def update_default_dicts(d):
    """Return a deep copy of ``d`` in which every ``None`` value, at any
    nesting depth, is replaced by an empty dict.

    A falsy ``d`` yields ``{}``.
    """
    ret = copy.deepcopy(d) or {}
    if ret:
        ret.update([
            (k, update_default_dicts(ret[k]))
            for k in ret
            if isinstance(ret[k], (dict, type(None)))
        ])
    return ret


def dictsort_by_value_type(d):
    """Return ``d``'s items as a list, non-dict values first, then by key."""
    vals = list(d.items())
    return sorted(vals, key=lambda p: (isinstance(p[1], dict), p[0], p[1]))


def tikv_server_labels_format(label_str):
    """Render ``"k1=v1,k2=v2"`` as ``{ k1 = "v1", k2 = "v2" }``.

    Whitespace around tags, keys and values is stripped, empty tags are
    skipped and exact duplicates are collapsed. Values are JSON-quoted.
    Labels are emitted in sorted order so that the rendered text is identical
    from one run to the next.

    Raises ``AssertionError`` when a key or value is empty; a tag without
    ``=`` counts as having an empty value.
    """
    label_str = str(label_str or '')
    labels = set()
    for raw_tag in label_str.split(','):
        tag = raw_tag.strip()
        if not tag:
            continue
        # partition() never raises: a tag without '=' yields an empty value,
        # which is reported by the assertion below instead of an IndexError.
        key, _, value = tag.partition('=')
        k = key.strip()
        v = value.strip()
        assert k, "empty label key"
        assert v, "empty label value"
        labels.add((k, v))
    rendered = ["%s = %s" % (k, json.dumps(v)) for (k, v) in sorted(labels)]
    return "{ %s }" % (', '.join(rendered))


def get_element_by_index(d, index):
    """Return ``d[index]``."""
    return d[index]


class FilterModule(object):
    """Ansible filter plugin entry point."""

    def filters(self):
        """Map the filter names usable in templates to their callables."""
        return {
            'epoch_time_diff': epoch_time_diff,
            'with_default_dicts': with_default_dicts,
            'update_default_dicts': update_default_dicts,
            'dictsort_by_value_type': dictsort_by_value_type,
            'tikv_server_labels_format': tikv_server_labels_format,
            'split_string': split_string,
            'split_regex': split_regex,
            'get_element_by_index': get_element_by_index,
        }


# ----------------------------------------------------------------------
# Archive separator and opening header lines of the next file in this dump
# (graceful_stop.yml). They share the original physical line with tags.py
# and are preserved here, commented out, so that nothing is lost.
# ================================================
# FILE: graceful_stop.yml
# ================================================
# ---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

# Graceful stop playbook: stops the services of a single host, one component
# per play. Every play stops the service through the configured
# `process_supervision` backend (supervise script or systemd unit) and then
# waits for the service port to close.

# Refuse to run when the play batch contains more or fewer than one host.
- hosts: all
  tasks:
    - name: check ansible_play_batch
      fail:
        msg: "Only one host can be specified at a time when performing graceful stop."
      run_once: true
      when: ansible_play_batch | length != 1

- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

- name: gather all facts, and check dest
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

# node_exporter
- hosts: monitored_servers
  tags:
    - node_exporter
  tasks:
    - name: stop node_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - node_exporter
      when: process_supervision == 'supervise'

    - name: stop node_exporter by systemd
      systemd:
        name: "{{ item }}"
        state: stopped
      become: true
      with_items:
        - "node_exporter-{{ node_exporter_port }}.service"
      when: process_supervision == 'systemd'

    - name: wait until the node_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ node_exporter_port }}"
        state: stopped
        msg: "the node_exporter port {{ node_exporter_port }} is not down"

# blackbox_exporter
- hosts: monitored_servers
  tags:
    - blackbox_exporter
  tasks:
    - name: stop blackbox_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - blackbox_exporter
      when: process_supervision == 'supervise'

    # This task only stops blackbox_exporter; its name used to mention
    # node_exporter as well (copy-paste leftover).
    - name: stop blackbox_exporter by systemd
      systemd:
        name: "{{ item }}"
        state: stopped
      become: true
      with_items:
        - "blackbox_exporter-{{ blackbox_exporter_port }}.service"
      when: process_supervision == 'systemd'

    - name: wait until the blackbox_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ blackbox_exporter_port }}"
        state: stopped
        msg: "the blackbox_exporter port {{ blackbox_exporter_port }} is not down"

# alertmanager
- hosts: alertmanager_servers
  tags:
    - alertmanager
  tasks:
    - name: stop alertmanager by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_alertmanager.sh
      when: process_supervision == 'supervise'

    - name: stop alertmanager by systemd
      systemd:
        name: "alertmanager-{{ alertmanager_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the alertmanager port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ alertmanager_port }}"
        state: stopped
        msg: "the alertmanager port {{ alertmanager_port }} is not down"

# pushgateway
- hosts: monitoring_servers
  tags:
    - pushgateway
  tasks:
    - name: stop pushgateway by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - pushgateway
      when: process_supervision == 'supervise'

    - name: stop pushgateway by systemd
      systemd:
        name: "{{ item }}"
        state: stopped
      become: true
      with_items:
        - "pushgateway-{{ pushgateway_port }}.service"
      when: process_supervision == 'systemd'

    - name: wait until the pushgateway port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pushgateway_port }}"
        state: stopped
        msg: "the pushgateway port {{ pushgateway_port }} is not down"

# prometheus
- hosts: monitoring_servers
  tags:
    - prometheus
  tasks:
    - name: stop prometheus by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - prometheus
      when: process_supervision == 'supervise'

    - name: stop prometheus by systemd
      systemd:
        name: "{{ item }}"
        state: stopped
      become: true
      with_items:
        - "prometheus-{{ prometheus_port }}.service"
      when: process_supervision == 'systemd'

    - name: wait until the prometheus port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ prometheus_port }}"
        state: stopped
        msg: "the prometheus port {{ prometheus_port }} is not down"

# kafka_exporter (every task is skipped unless enable_binlog is set)
- hosts: kafka_exporter_servers
  tags:
    - kafka_exporter
  tasks:
    - name: stop kafka_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_kafka_exporter.sh
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: stop kafka_exporter by systemd
      systemd:
        name: "kafka_exporter-{{ kafka_exporter_port }}.service"
        state: stopped
        enabled: false
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the kafka_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ kafka_exporter_port }}"
        state: stopped
        msg: "the kafka_exporter port {{ kafka_exporter_port }} is not down"
      when: enable_binlog|default(false)

# TiDB
- hosts: tidb_servers
  tags:
    - tidb
  tasks:
    - name: stop TiDB by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - tidb
      when: process_supervision == 'supervise'

    - name: stop TiDB by systemd
      systemd:
        name: "tidb-{{ tidb_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiDB port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_port }}"
        state: stopped
        msg: "the TiDB port {{ tidb_port }} is not down"

# pump (every task is skipped unless enable_binlog is set)
- hosts: pump_servers
  tags:
    - pump
  tasks:
    - name: stop pump by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - pump
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: stop pump by systemd
      systemd:
        name: "pump-{{ pump_port }}.service"
        state: stopped
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the pump port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pump_port }}"
        state: stopped
        msg: "the pump port {{ pump_port }} is not down"
      when: enable_binlog|default(false)

# TiKV: the add_evict_leader_scheduler tasks run before the stop and the
# remove_evict_leader_scheduler tasks run after the port is down.
- hosts: tikv_servers
  tags:
    - tikv
  tasks:
    - include_tasks: "common_tasks/get_pd_tikv_addr.yml"

    - include_tasks: "common_tasks/get_store_id.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_store_id_tls.yml"
      when: enable_tls|default(false)

    - include_tasks: "common_tasks/add_evict_leader_scheduler.yml"

    - name: stop TiKV by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_tikv.sh
      when: process_supervision == 'supervise'

    - name: stop TiKV by systemd
      systemd:
        name: "tikv-{{ tikv_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiKV port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_port }}"
        state: stopped
        msg: "the TiKV port {{ tikv_port }} is not down"

    # Informational only: a missing pid file must not fail the play.
    - command: cat {{ deploy_dir }}/status/tikv.pid
      register: old_tikv_pid
      ignore_errors: true
      changed_when: false

    - name: display old tikv pid
      debug:
        msg: "tikv binary or docker pid: {{ old_tikv_pid.stdout }}"

    - include_tasks: "common_tasks/remove_evict_leader_scheduler.yml"

# PD: the transfer_pd_leader tasks run before the stop.
- hosts: pd_servers
  tags:
    - pd
  tasks:
    - set_fact:
        pd_addr: "{{ ansible_host }}:{{ pd_client_port }}"

    - include_tasks: "common_tasks/get_pd_name.yml"

    - name: display PD name
      debug:
        var: pd_name

    - name: display PD address
      debug:
        var: pd_addr

    - include_tasks: "common_tasks/get_pd_leader.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_pd_leader_tls.yml"
      when: enable_tls|default(false)

    - include_tasks: "common_tasks/transfer_pd_leader.yml"

    - name: stop PD by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_pd.sh
      when: process_supervision == 'supervise'

    - name: stop PD by systemd
      systemd:
        name: "pd-{{ pd_client_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the PD port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pd_client_port }}"
        state: stopped
        msg: "the PD port {{ pd_client_port }} is not down"

# grafana
- hosts: grafana_servers
  tags:
    - grafana
  tasks:
    - name: stop grafana by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - grafana
      when: process_supervision == 'supervise'

    - name: stop grafana by systemd
      systemd:
        name: "grafana-{{ grafana_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the grafana port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ grafana_port }}"
        state: stopped
        msg: "the grafana port {{ grafana_port }} is not down"

================================================
FILE: group_vars/alertmanager_servers.yml
================================================
---

alertmanager_port: 9093
alertmanager_cluster_port: 9094

================================================
FILE: group_vars/all.yml
================================================
---

# Variables here are applicable to all host groups

deploy_user: "{{ ansible_user }}"

status_dir: "{{ deploy_dir }}/status"
backup_dir: "{{ deploy_dir }}/backup"
images_dir: "{{ deploy_dir }}/images"

# Local
downloads_dir: "{{ playbook_dir }}/downloads"
resources_dir: "{{ playbook_dir }}/resources"
fetch_tmp_dir: "{{ playbook_dir }}/collect_diagnosis_data"
fetch_dir: "{{ playbook_dir }}/collect_diagnosis"
cert_dir: "{{ playbook_dir }}/conf/ssl"
script_dir: "{{ playbook_dir }}/scripts"
binary_dir: "{{ playbook_dir }}/resources/bin"

# default configuration for multiple host groups and roles
node_exporter_port: 9100
blackbox_exporter_port: 9115
kafka_exporter_port: 9308

# docker
docker_bin_dir: "/usr/bin"

# Random shifts for retrying failed ops like downloading
retry_stagger: 5

# deployment methods, [binary, docker] docker deployment method is not recommended and deprecated.
deployment_method: binary

enable_log_clean: false
log_retain_days: 28

dev_mode: false

# systemd: Specifies whether to send SIGKILL to remaining processes after a timeout.
disable_send_sigkill: false

# pump
pump_port: 8250
pump_data_dir: "{{ deploy_dir }}/data.pump"
pump_log_dir: "{{ deploy_dir }}/log"
pump_cert_dir: "{{ deploy_dir }}/conf/ssl"

# drainer
drainer_port: 8249

================================================
FILE: group_vars/drainer_servers.yml
================================================
---

================================================
FILE: group_vars/grafana_servers.yml
================================================
---

grafana_port: 3000

grafana_api_keys_dir: "{{ playbook_dir }}/conf/keys"

================================================
FILE: group_vars/importer_server.yml
================================================
---
# placeholder key with no value
dummy: null

# this directory is used to store the data written by `tidb-lightning`
import_dir: "{{ deploy_dir }}/data.import"

# the listening port of tikv-importer. tidb-lightning needs to connect to
# this port to write data.
tikv_importer_port: 8287

================================================
FILE: group_vars/lightning_server.yml
================================================
---
# placeholder key with no value
dummy: null

# background profile for debugging
tidb_lightning_pprof_port: 8289

# the source data directory of Mydumper
data_source_dir: "{{ deploy_dir }}/mydumper"

# TiDB cluster information to import data
# tidb_host: ""
# tidb_port: 4000
# tidb_user: ""
# tidb_password: ""
# tidb_status_port: 10080

================================================
FILE: group_vars/monitored_servers.yml
================================================
---

node_exporter_log_dir: "{{ deploy_dir }}/log"

================================================
FILE: group_vars/monitoring_servers.yml
================================================
---

prometheus_port: 9090
pushgateway_port: 9091

# How long to retain samples in the storage
prometheus_storage_retention: "30d"

================================================
FILE: group_vars/pd_servers.yml
================================================
---
# placeholder key with no value
dummy: null

pd_client_port: 2379
pd_peer_port: 2380

pd_data_dir: "{{ deploy_dir }}/data.pd"
pd_log_dir: "{{ deploy_dir }}/log"
pd_cert_dir: "{{ deploy_dir }}/conf/ssl"

================================================
FILE: group_vars/pump_servers.yml
================================================
---

================================================
FILE: group_vars/tidb_servers.yml
================================================
---
# placeholder key with no value
dummy: null

tidb_port: 4000
tidb_status_port: 10080

tidb_log_dir: "{{ deploy_dir }}/log"
tidb_cert_dir: "{{ deploy_dir }}/conf/ssl"

================================================
FILE: group_vars/tiflash_servers.yml
================================================
---

tcp_port: 9000
http_port: 8123
flash_service_port: 3930
flash_proxy_port: 20170
flash_proxy_status_port: 20292
metrics_port: 8234

================================================
FILE: group_vars/tikv_servers.yml
================================================
---
# placeholder key with no value
dummy: null

tikv_port: 20160
tikv_status_port: 20180

tikv_data_dir: "{{ deploy_dir }}/data"
tikv_log_dir: "{{ deploy_dir }}/log"
tikv_cert_dir: "{{ deploy_dir }}/conf/ssl"

================================================
FILE: hosts.ini
================================================
[servers]
192.168.0.2
192.168.0.3
192.168.0.4
192.168.0.5
192.168.0.6
192.168.0.7
192.168.0.8
192.168.0.10

[all:vars]
username = tidb
ntp_server = pool.ntp.org

================================================
FILE: inventory.ini
================================================
## TiDB Cluster Part
[tidb_servers]
192.168.0.2

[tikv_servers]
192.168.0.3
192.168.0.4
192.168.0.5

[pd_servers]
192.168.0.6
192.168.0.7
192.168.0.8

[spark_master]

[spark_slaves]

[lightning_server]

[importer_server]

## Monitoring Part
# prometheus and pushgateway servers
[monitoring_servers]
192.168.0.10

[grafana_servers]
192.168.0.10

# node_exporter and blackbox_exporter servers
[monitored_servers]
192.168.0.2
192.168.0.3
192.168.0.4
192.168.0.5
192.168.0.6
192.168.0.7
192.168.0.8
192.168.0.10

[alertmanager_servers]
192.168.0.10

[kafka_exporter_servers]

## Binlog Part
[pump_servers]

[drainer_servers]

## For TiFlash Part, please contact us for beta-testing and user manual
[tiflash_servers]

## Group variables
[pd_servers:vars]
# location_labels = ["zone","rack","host"]

## Global variables
[all:vars]
deploy_dir = /home/tidb/deploy

## Connection
# ssh via normal user
ansible_user = tidb

cluster_name = test-cluster

# CPU architecture: amd64, arm64
cpu_architecture = amd64

tidb_version = nightly

# process supervision, [systemd, supervise]
process_supervision = systemd

timezone = Asia/Shanghai

enable_firewalld = False
# check NTP service
enable_ntpd = True
set_hostname = False

## binlog trigger
enable_binlog = False

# kafka cluster address for monitoring, example:
# kafka_addrs = "192.168.0.11:9092,192.168.0.12:9092,192.168.0.13:9092"
kafka_addrs = ""

# zookeeper address of kafka cluster for monitoring, example:
# zookeeper_addrs = "192.168.0.11:2181,192.168.0.12:2181,192.168.0.13:2181"
zookeeper_addrs = ""

# enable TLS authentication in the TiDB cluster
enable_tls = False

# KV mode
deploy_without_tidb = False

# wait for region replication to complete before starting tidb-server.
wait_replication = True

# Optional: Set if you already have an alertmanager server.
# Format: alertmanager_host:alertmanager_port alertmanager_target = "" grafana_admin_user = "admin" grafana_admin_password = "admin" ### Collect diagnosis collect_log_recent_hours = 2 enable_bandwidth_limit = True # default: 10Mb/s, unit: Kbit/s collect_bandwidth_limit = 10000 ================================================ FILE: library/coreos_facts ================================================ #!/bin/bash set -e _default_gw=$(ip route list match 0.0.0.0 | cut '-d ' -f3) _default_if=$(ip route list match 0.0.0.0 | cut '-d ' -f5) _default_ipv4_addr=$(echo $(ip addr show dev $_default_if scope global | grep inet | grep -v inet6) | cut '-d ' -f 2 | cut '-d/' -f 1) _default_ipv4_mask=$(echo $(ip addr show dev $_default_if scope global | grep inet | grep -v inet6) | cut '-d ' -f 4) _default_hw_addr=$(echo $(ip addr show dev $_default_if scope global | grep ether) | cut '-d ' -f 2) _default_mtu=$(echo $(ip addr show dev $_default_if scope global | head -n1) | cut '-d ' -f 5) if [ -f /etc/lsb-release ]; then source /etc/lsb-release fi physicalNumber=0 coreNumber=0 logicalNumber=0 HTNumber=0 logicalNumber=$(grep "processor" /proc/cpuinfo|sort -u|wc -l) physicalNumber=$(grep "physical id" /proc/cpuinfo|sort -u|wc -l) coreNumber=$(grep "cpu cores" /proc/cpuinfo|uniq|awk -F':' '{print $2}'|xargs) HTNumber=$((logicalNumber / (physicalNumber * coreNumber))) cat <= thread_num: # found, success! break except (OSError, IOError): pass # Conditions not yet met, wait and try again time.sleep(params['sleep']) else: # while-else # Timeout expired elapsed = datetime.datetime.now() - start if pid_file: module.fail_json(msg="Timeout when waiting for PID:%s to stop." % (pid_file), elapsed=elapsed.seconds) elif pid: module.fail_json(msg="Timeout when waiting for PID:%s to be absent." 
% (pid), elapsed=elapsed.seconds) elapsed = datetime.datetime.now() - start module.exit_json(state=state, pid=pid, thread_name_regex=thread_name_regex, pid_file=pid_file, elapsed=elapsed.seconds) # import module snippets from ansible.module_utils.basic import * if __name__ == '__main__': main() ================================================ FILE: local_prepare.yml ================================================ --- - name: do local preparation hosts: localhost connection: local gather_facts: false roles: - local ================================================ FILE: log/.gitignore ================================================ *.log ================================================ FILE: migrate_monitor.yml ================================================ --- # Copyright 2016 PingCAP, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # See the License for the specific language governing permissions and # limitations under the License. 
# The rolling update playbook of TiDB - name: check config locally hosts: localhost any_errors_fatal: true tags: - always roles: - check_config_static - name: gather all facts, and check dest hosts: all any_errors_fatal: true tags: - always roles: - check_config_dynamic - name: rolling update node_exporter hosts: monitored_servers any_errors_fatal: true tags: - node_exporter pre_tasks: - name: stop node_exporter by supervise shell: cd {{ deploy_dir }}/scripts && ./stop_node_exporter.sh when: process_supervision == 'supervise' - name: stop node_exporter by systemd systemd: name=node_exporter.service state=stopped enabled=no become: true when: process_supervision == 'systemd' - name: wait for node_exporter down wait_for: host={{ ansible_host }} port={{ node_exporter_port }} state=stopped - name: clean systemd config file: path="/etc/systemd/system/{{ item }}" state=absent become: true when: process_supervision == 'systemd' with_items: - node_exporter.service roles: - node_exporter post_tasks: - name: start node_exporter by supervise shell: cd {{ deploy_dir }}/scripts && ./start_node_exporter.sh when: process_supervision == 'supervise' - name: start node_exporter by systemd systemd: name=node_exporter-{{ node_exporter_port }}.service state=started become: true when: process_supervision == 'systemd' - name: wait for node_exporter up wait_for: | host={{ ansible_host }} port={{ node_exporter_port }} state=present send='GET /metrics HTTP/1.0\r\n\r\n' search_regex='200 OK' - name: rolling update blackbox_exporter hosts: monitored_servers any_errors_fatal: true tags: - blackbox_exporter pre_tasks: - name: check blackbox_exporter existed stat: path: "{{ deploy_dir }}/conf/blackbox.yml" register: blackbox_exporter_configure_file - name: stop blackbox_exporter by supervise shell: cd {{ deploy_dir }}/scripts && ./stop_blackbox_exporter.sh when: process_supervision == 'supervise' and blackbox_exporter_configure_file.stat.exists == True - name: stop blackbox_exporter by systemd 
systemd: name=blackbox_exporter.service state=stopped enabled=no become: true when: process_supervision == 'systemd' and blackbox_exporter_configure_file.stat.exists == True - name: wait for blackbox_exporter down wait_for: host={{ ansible_host }} port={{ blackbox_exporter_port }} state=stopped - name: clean systemd config file: path="/etc/systemd/system/{{ item }}" state=absent become: true when: process_supervision == 'systemd' and blackbox_exporter_configure_file.stat.exists == True with_items: - blackbox_exporter.service roles: - blackbox_exporter post_tasks: - name: start blackbox_exporter by supervise shell: cd {{ deploy_dir }}/scripts && ./start_blackbox_exporter.sh when: process_supervision == 'supervise' - name: start blackbox_exporter by systemd systemd: name=blackbox_exporter-{{ blackbox_exporter_port }}.service state=started become: true when: process_supervision == 'systemd' - name: wait for blackbox_exporter up wait_for: | host={{ ansible_host }} port={{ blackbox_exporter_port }} state=present send='GET / HTTP/1.0\r\n\r\n' search_regex='200 OK' - name: rolling update pushgateway hosts: monitoring_servers any_errors_fatal: true tags: - pushgateway pre_tasks: - name: stop pushgateway by supervise shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh with_items: - pushgateway when: process_supervision == 'supervise' - name: stop pushgateway by systemd systemd: name={{ item }} state=stopped enabled=no when: process_supervision == 'systemd' become: true with_items: - pushgateway.service - name: wait for pushgateway down wait_for: host={{ ansible_host }} port={{ pushgateway_port }} state=stopped - name: clean systemd config file: path="/etc/systemd/system/{{ item }}" state=absent become: true when: process_supervision == 'systemd' with_items: - pushgateway.service roles: - pushgateway post_tasks: - name: start pushgateway by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: process_supervision == 'supervise' with_items: - 
pushgateway - name: start pushgateway by systemd systemd: name={{ item }} state=started enabled=no when: process_supervision == 'systemd' become: true with_items: - pushgateway-{{ pushgateway_port }}.service - name: wait for pushgateway up wait_for: | host={{ ansible_host }} port={{ pushgateway_port }} state=present send='GET /metrics HTTP/1.0\r\n\r\n' search_regex='200 OK' - name: rolling update prometheus hosts: monitoring_servers any_errors_fatal: true tags: - prometheus pre_tasks: - name: stop prometheus by supervise shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh with_items: - prometheus when: process_supervision == 'supervise' - name: stop prometheus by systemd systemd: name={{ item }} state=stopped enabled=no when: process_supervision == 'systemd' become: true with_items: - prometheus.service - name: wait for prometheus down wait_for: host={{ ansible_host }} port={{ prometheus_port }} state=stopped - name: clean systemd config file: path="/etc/systemd/system/{{ item }}" state=absent become: true when: process_supervision == 'systemd' with_items: - prometheus.service roles: - prometheus post_tasks: - name: start prometheus by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: process_supervision == 'supervise' with_items: - prometheus - name: start prometheus by systemd systemd: name={{ item }} state=started enabled=no when: process_supervision == 'systemd' become: true with_items: - prometheus-{{ prometheus_port }}.service - name: wait for prometheus up wait_for: | host={{ ansible_host }} port={{ prometheus_port }} state=present send='GET /metrics HTTP/1.0\r\n\r\n' search_regex='200 OK' - name: rolling update grafana hosts: grafana_servers any_errors_fatal: true tags: - grafana pre_tasks: - name: stop grafana by supervise shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh when: process_supervision == 'supervise' with_items: - grafana - name: stop grafana by systemd systemd: name=grafana.service state=stopped 
enabled=no become: true when: process_supervision == 'systemd' - name: wait for grafana down wait_for: host={{ ansible_host }} port={{ grafana_port }} state=stopped - name: clean systemd config file: path="/etc/systemd/system/{{ item }}" state=absent become: true when: process_supervision == 'systemd' with_items: - grafana.service roles: - grafana post_tasks: - name: start grafana by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: process_supervision == 'supervise' with_items: - grafana - name: start grafana by systemd systemd: name=grafana-{{ grafana_port }}.service state=started enabled=no when: process_supervision == 'systemd' become: true - name: wait for grafana up wait_for: | host={{ ansible_host }} port={{ grafana_port }} state=present send='GET /login HTTP/1.0\r\n\r\n' search_regex='200 OK' - set_fact: grafana_host: "{{ ansible_host }}" - include_tasks: "common_tasks/create_grafana_api_keys.yml" - name: import grafana data source shell: > chdir={{ grafana_data_dir }} warn=no curl -q -X POST -d @data_source.json --header 'Content-Type: application/json' "http://{{ grafana_admin_user }}:{{ grafana_admin_password }}@127.0.0.1:{{ grafana_port }}/api/datasources" - name: import grafana dashboards - prepare config delegate_to: localhost template: src=grafana.dest.json.j2 dest={{ playbook_dir }}/scripts/dests.json vars: - ansible_become: false - ansible_connection: local - grafana_dest_config: name: "{{ cluster_name | title }}" url: "http://{{ grafana_host }}:{{ grafana_port }}/" user: "{{ grafana_admin_user }}" password: "{{ grafana_admin_password }}" apikey: "{{ lookup('file', grafana_api_keys_dir + '/grafana_apikey.key') }}" datasource: "{{ cluster_name }}" titles: br: "{{ cluster_name | title }}-Backup-Restore" node: "{{ cluster_name | title }}-Node_exporter" pd: "{{ cluster_name | title }}-PD" tidb: "{{ cluster_name | title }}-TiDB" tidb_summary: "{{ cluster_name | title }}-TiDB-Summary" tikv_summary: "{{ cluster_name | title 
}}-TiKV-Summary" tikv_details: "{{ cluster_name | title }}-TiKV-Details" tikv_trouble_shot: "{{ cluster_name | title }}-TiKV-Trouble-Shooting" binlog: "{{ cluster_name | title }}-Binlog" overview: "{{ cluster_name | title }}-Overview" disk_performance: "{{ cluster_name | title }}-Disk-Performance" blackbox_exporter: "{{ cluster_name | title }}-Blackbox_exporter" kafka_overview: "{{ cluster_name | title }}-Kafka-Overview" performance_read: "{{ cluster_name | title }}-Performance-Read" performance_write: "{{ cluster_name | title }}-Performance-Write" - name: import grafana dashboards - run import script delegate_to: localhost shell: >- chdir={{ playbook_dir }}/scripts ./grafana-config-copy.py vars: - ansible_become: false - ansible_connection: local ================================================ FILE: requirements.txt ================================================ ansible==2.7.11 jinja2>=2.9.6 jmespath>=0.9.0 ================================================ FILE: roles/alertmanager/defaults/main.yml ================================================ --- # default configuration for alertmanager alertmanager_data_dir: "{{ deploy_dir }}/data.alertmanager" alertmanager_log_level: info alertmanager_log_dir: "{{ deploy_dir }}/log" alertmanager_log_filename: "alertmanager.log" alertmanager_tag: v0.14.0 ================================================ FILE: roles/alertmanager/meta/main.yml ================================================ --- dependencies: - role: common_dir ================================================ FILE: roles/alertmanager/tasks/binary_deployment.yml ================================================ --- - name: create deploy directories file: path="{{ item }}" state=directory mode=0755 with_items: - "{{ alertmanager_log_dir }}" - "{{ alertmanager_data_dir }}" - name: deploy alertmanager binary copy: src="{{ resources_dir }}/bin/alertmanager" dest="{{ deploy_dir }}/bin/" mode=0755 - name: create run script template: src: "{{ item }}_{{ role_name 
}}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/alertmanager/tasks/docker_deployment.yml
================================================
---

# Place the saved image archive under the deploy directory.
- name: deploy alertmanager image
  copy:
    src: "{{ downloads_dir }}/alertmanager.tar"
    dest: "{{ deploy_dir }}/images"
    mode: "0755"

# Render the docker flavour of the run script; any replaced file is backed up.
- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_docker.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run

# Import the archive into the local docker daemon.
- name: load docker image from archive
  docker_image:
    name: prom/alertmanager
    tag: "{{ alertmanager_tag }}"
    load_path: "{{ images_dir }}/alertmanager.tar"
    state: present
    force: yes

# Hand over to the supervise/systemd specific task file.
- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/alertmanager/tasks/main.yml
================================================
---

# Dispatch to binary_deployment.yml or docker_deployment.yml.
- include_tasks: "{{ deployment_method }}_deployment.yml"

- name: create configuration file
  copy:
    src: "{{ playbook_dir }}/conf/alertmanager.yml"
    dest: "{{ deploy_dir }}/conf/alertmanager.yml"
    mode: "0644"
    backup: yes
  register: alertmanager_conf_st

# Only runs when the copy above replaced an existing file and left a backup.
- name: backup conf file
  command: mv "{{ alertmanager_conf_st.backup_file }}" "{{ backup_dir }}"
  when: alertmanager_conf_st.changed and alertmanager_conf_st.backup_file is defined

# Prepend this service's port to the accumulated firewalld port list.
- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [alertmanager_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/alertmanager/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: alertmanager
    service_name: alertmanager-{{ alertmanager_port }}

================================================
FILE:
roles/alertmanager/tasks/systemd_deployment.yml
================================================
---

# Delegate service installation to the shared `systemd` role, passing the
# role name and a per-port service name.
- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: alertmanager
    service_name: alertmanager-{{ alertmanager_port }}

================================================
FILE: roles/alertmanager/templates/run_alertmanager_binary.sh.j2
================================================
#!/bin/bash
# Launcher for the alertmanager binary, rendered by the alertmanager role.
set -e
ulimit -n 1000000

DEPLOY_DIR={{ deploy_dir }}

cd "${DEPLOY_DIR}" || exit 1

# WARNING: This file was auto-generated. Do not edit!
#          All your edits might be overwritten!

# Append stdout and stderr to the alertmanager log file via tee.
exec > >(tee -i -a "{{ alertmanager_log_dir }}/{{ alertmanager_log_filename }}")
exec 2>&1

# Replace this shell with the alertmanager process.
exec bin/alertmanager \
    --config.file="conf/alertmanager.yml" \
    --storage.path="{{ alertmanager_data_dir }}" \
    --data.retention=120h \
    --log.level="{{ alertmanager_log_level }}" \
    --web.listen-address=":{{ alertmanager_port }}" \
    --cluster.listen-address=":{{ alertmanager_cluster_port }}"

================================================
FILE: roles/alertmanager/templates/run_alertmanager_docker.sh.j2
================================================
#!/bin/bash
# Launcher for the alertmanager docker container, rendered by the alertmanager role.
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
#          All your edits might be overwritten!
DEPLOY_DIR={{ deploy_dir }}
cd "${DEPLOY_DIR}" || exit 1

exec docker run -p {{ alertmanager_port }}:9093 \
    -p {{ alertmanager_cluster_port }}:9094 \
    -v /etc/localtime:/etc/localtime:ro \
    -v "{{ alertmanager_data_dir }}:/alertmanager" \
    -v "{{ deploy_dir }}/conf/alertmanager.yml:/etc/alertmanager/config.yml" \
    -u `id -u {{ deploy_user }}` \
    --name="alertmanager-{{ alertmanager_port }}" \
    prom/alertmanager:{{ alertmanager_tag }} --config.file="/etc/alertmanager/config.yml" \
    --storage.path="/alertmanager" \
    --data.retention=120h \
    --log.level="{{ alertmanager_log_level }}"

================================================
FILE: roles/blackbox_exporter/defaults/main.yml
================================================
---
# Defaults for the blackbox_exporter role

blackbox_exporter_log_level: info
blackbox_exporter_log_dir: "{{ deploy_dir }}/log"
blackbox_exporter_log_filename: "blackbox_exporter.log"
blackbox_exporter_dir: "{{ deploy_dir }}/conf"

blackbox_exporter_tag: v0.12.0

================================================
FILE: roles/blackbox_exporter/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/blackbox_exporter/tasks/binary_deployment.yml
================================================
---

# Make sure the log directory exists before the service is started.
- name: create deploy directories
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ blackbox_exporter_log_dir }}"

- name: deploy blackbox_exporter binary
  copy:
    src: "{{ resources_dir }}/bin/blackbox_exporter"
    dest: "{{ deploy_dir }}/bin"
    mode: "0755"

# Grant CAP_NET_RAW to the binary; setcap is run with privilege escalation.
- name: blackbox_exporter binary add CAP_NET_RAW capability
  command: setcap cap_net_raw+ep "{{ deploy_dir }}/bin/blackbox_exporter"
  become: true

# Render the binary flavour of the run script; any replaced file is backed up.
- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}

- include_tasks: "{{
process_supervision }}_deployment.yml"

================================================
FILE: roles/blackbox_exporter/tasks/docker_deployment.yml
================================================
---

# Place the saved image archive under the deploy directory.
- name: deploy blackbox_exporter image
  copy:
    src: "{{ downloads_dir }}/blackbox-exporter.tar"
    dest: "{{ deploy_dir }}/images"
    mode: "0755"

# Render the docker flavour of the run script; any replaced file is backed up.
- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_docker.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run

# The repository name must be the one the run script starts
# (run_blackbox_exporter_docker.sh.j2 runs `prom/blackbox-exporter:<tag>`),
# i.e. with a hyphen, not an underscore.
- name: load docker image from archive
  docker_image:
    state: present
    force: yes
    name: prom/blackbox-exporter
    tag: "{{ blackbox_exporter_tag }}"
    load_path: "{{ images_dir }}/blackbox-exporter.tar"

# Hand over to the supervise/systemd specific task file.
- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/blackbox_exporter/tasks/main.yml
================================================
---

# Dispatch to binary_deployment.yml or docker_deployment.yml.
- include_tasks: "{{ deployment_method }}_deployment.yml"

- name: create config file
  template:
    src: blackbox.yml.j2
    dest: "{{ deploy_dir }}/conf/blackbox.yml"
    mode: "0644"
    backup: yes
  register: blackbox_conf_st

# Only runs when the template above replaced an existing file and left a backup.
- name: backup conf file
  command: mv "{{ blackbox_conf_st.backup_file }}" "{{ backup_dir }}"
  when: blackbox_conf_st.changed and blackbox_conf_st.backup_file is defined

# Prepend this service's port to the accumulated firewalld port list.
- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [blackbox_exporter_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/blackbox_exporter/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: blackbox_exporter
    service_name: blackbox_exporter-{{ blackbox_exporter_port }}

================================================
FILE: roles/blackbox_exporter/tasks/systemd_deployment.yml
================================================
---

- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: blackbox_exporter
service_name: blackbox_exporter-{{ blackbox_exporter_port }}

================================================
FILE: roles/blackbox_exporter/templates/blackbox.yml.j2
================================================
# Probe modules made available to blackbox_exporter.
modules:
  http_2xx:
    prober: http
    http:
      method: GET
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      # The POP3 greeting starts with a literal "+OK"; the plus sign has to be
      # escaped, otherwise it is read as a regex repetition operator.
      - expect: '^\+OK'
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
    timeout: 5s
    icmp:
      preferred_ip_protocol: "ip4"

================================================
FILE: roles/blackbox_exporter/templates/run_blackbox_exporter_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!
DEPLOY_DIR={{ deploy_dir }}
cd "${DEPLOY_DIR}" || exit 1

# Append stdout and stderr to the blackbox_exporter log file via tee.
exec > >(tee -i -a "{{ blackbox_exporter_log_dir }}/{{ blackbox_exporter_log_filename }}")
exec 2>&1

# Replace this shell with the blackbox_exporter process.
exec bin/blackbox_exporter --web.listen-address=":{{ blackbox_exporter_port }}" \
    --log.level="{{ blackbox_exporter_log_level }}" \
    --config.file="conf/blackbox.yml"

================================================
FILE: roles/blackbox_exporter/templates/run_blackbox_exporter_docker.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!
DEPLOY_DIR={{ deploy_dir }}
cd "${DEPLOY_DIR}" || exit 1

exec docker run \
    --net="host" \
    --pid="host" \
    --name="blackbox_exporter-{{ blackbox_exporter_port }}" \
    -v "{{ blackbox_exporter_dir }}/blackbox.yml:/etc/blackbox.yml:ro" \
    prom/blackbox-exporter:{{ blackbox_exporter_tag }} \
    --config.file="/etc/blackbox.yml"

================================================
FILE: roles/bootstrap/defaults/main.yml
================================================
---

tuning_kernel_parameters: true
tuning_irqbalance_value: true

================================================
FILE: roles/bootstrap/tasks/main.yml
================================================
---
# Bootstrap a machine and begin deployment

- name: gather facts
  setup:
    gather_timeout: 30

# Dynamic groups named "<distribution>-<version>".
- name: group hosts by distribution
  group_by:
    key: "{{ ansible_distribution }}-{{ ansible_distribution_version }}"
  changed_when: false

# Fall back to a directory under the deploy user's home.
- name: Set deploy_dir if not presented
  set_fact:
    deploy_dir: "/home/{{ deploy_user }}/deploy"
  when: deploy_dir is not defined

- include_tasks: root_tasks.yml

================================================
FILE: roles/bootstrap/tasks/root_tasks.yml
================================================
---

# Drop sysctl entries that must not be present.
- name: setting absent kernel params
  sysctl:
    name: "{{ item.name }}"
    value: "{{ item.value }}"
    state: absent
    sysctl_set: yes
    ignoreerrors: yes
  with_items:
    - { name: 'net.ipv4.tcp_tw_recycle', value: 0 }
  when: tuning_kernel_parameters

# Apply the kernel parameters listed below.
- name: setting present kernel params
  sysctl:
    name: "{{ item.name }}"
    value: "{{ item.value }}"
    state: present
    ignoreerrors: yes
  with_items:
    - { name: 'net.core.somaxconn', value: 32768 }
    - { name: 'vm.swappiness', value: 0 }
    - { name: 'net.ipv4.tcp_syncookies', value: 0 }
    - { name: 'fs.file-max', value: 1000000 }
  when: tuning_kernel_parameters

# Transparent huge pages: write "never" to both the enabled and defrag knobs.
- name: disable THP
  shell: echo never > /sys/kernel/mm/transparent_hugepage/enabled && echo never > /sys/kernel/mm/transparent_hugepage/defrag
  when:
    - tuning_kernel_parameters

- name: update /etc/security/limits.conf
  blockinfile:
dest: /etc/security/limits.conf
    insertbefore: '# End of file'
    block: |
      {{ deploy_user }} soft nofile 1000000
      {{ deploy_user }} hard nofile 1000000
      {{ deploy_user }} soft stack 10240
  when: tuning_kernel_parameters

# Swap is only switched off when the gathered facts report some.
- name: disable swap
  command: swapoff -a
  when: ansible_swaptotal_mb > 0

# The deploy user gets a primary group of the same name.
- name: create group
  group:
    name: "{{ deploy_user }}"

- name: create account
  user:
    name: "{{ deploy_user }}"
    group: "{{ deploy_user }}"

# Directories below are created here and owned by the deploy user.
- name: create top deploy dir when under root
  file:
    path: "{{ deploy_dir }}"
    state: directory
    mode: "0755"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_user }}"

- name: create wal_dir deploy dir when under root
  file:
    path: "{{ wal_dir }}"
    state: directory
    mode: "0755"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_user }}"
  when: wal_dir is defined

- name: create raftdb_path deploy dir when under root
  file:
    path: "{{ raftdb_path }}"
    state: directory
    mode: "0755"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_user }}"
  when: raftdb_path is defined

# Rename the host to ip-<a-b-c-d> unless its hostname already contains that
# dashed form of the default IPv4 address.
- name: set hostname if hostname is not distinguishable
  hostname:
    name: 'ip-{{ ansible_default_ipv4.address | replace(".","-") }}'
  register: hostname_set
  when:
    - set_hostname
    - "ansible_default_ipv4.address | replace('.','-') not in ansible_hostname"

- name: set hostname in hosts file
  lineinfile: dest=/etc/hosts line='127.0.0.1 ip-{{ ansible_default_ipv4.address | replace(\".\",\"-\") }}'
  when: set_hostname

# `|| exit 0` keeps the probe from failing when firewalld is stopped or absent.
- name: determine if firewalld is running
  command: bash -c 'firewall-cmd --state || exit 0'
  register: firewalld_running
  ignore_errors: true
  changed_when: false

- name: disable firewalld
  service:
    name: firewalld
    enabled: no
    state: stopped
  when: |
    not (enable_firewalld is defined and enable_firewalld) and
    firewalld_running.stdout.strip() == "running"

- name: or to enable firewalld
  service:
    name: firewalld
    enabled: yes
    state: started
  when: |
    enable_firewalld is defined and enable_firewalld and
    firewalld_running.stdout.strip() != "running"

# modify irqbalance configuration file
- name: check centos configuration file exists
  stat: path=/etc/sysconfig/irqbalance
register: centos_irq_config_file - name: check debian configuration file exists stat: path=/etc/default/irqbalance register: debian_irq_config_file - name: modify centos irqbalance configuration file lineinfile: dest=/etc/sysconfig/irqbalance regexp='(?- {%- if enable_tls|default(false) -%}{{ pd_cert_dir }}/ca.pem{%- else -%}{%- endif -%} cert-path: >- {%- if enable_tls|default(false) -%}{{ pd_cert_dir }}/pd-server-{{ pd_host }}.pem{%- else -%}{%- endif -%} key-path: >- {%- if enable_tls|default(false) -%}{{ pd_cert_dir }}/pd-server-{{ pd_host }}-key.pem{%- else -%}{%- endif -%} - name: Generate final config set_fact: pd_conf: "{{ pd_conf_custom_check | with_default_dicts(pd_conf_generated_check, pd_conf_default_check) | update_default_dicts }}" - name: Create configuration file template: src={{ playbook_dir }}/roles/pd/templates/pd.toml.j2 dest={{ tidb_check_dir }}/pd.toml mode=0644 backup=yes - name: Deploy PD binary copy: src="{{ resources_dir }}/bin/pd-server" dest="{{ tidb_check_dir }}/" mode=0755 backup=yes - name: Check PD config shell: cd {{ tidb_check_dir }} && ./pd-server -config ./pd.toml -config-check register: pd_check_result - name: Delete temporary check directory file: name={{ tidb_check_dir }} state=absent - name: Check result fail: msg: "PD config error" when: "'successful' not in pd_check_result.stdout" ================================================ FILE: roles/check_config_static/tasks/main.yml ================================================ --- # Common Tasks - name: Ensure monitoring_servers exists fail: msg="monitoring_servers should be specified." when: groups['monitoring_servers'] | length < 1 - name: Ensure monitored_servers exists fail: msg="monitored_servers should be specified." when: groups['monitored_servers'] | length < 1 - name: Ensure TiDB host exists fail: msg="No tidb host is specified. This cluster will run into a RawKV mode." 
when:
    - groups['tidb_servers'] | length < 1
    - deploy_without_tidb is defined
    - "not deploy_without_tidb"

- name: Ensure PD host exists
  fail:
    msg: "One, or more pd hosts should be specified."
  when: groups['pd_servers'] | length < 1

- name: Ensure TiKV host exists
  fail:
    msg: "One, or more tikv hosts should be specified."
  when: groups['tikv_servers'] | length < 1

- name: Check ansible_user variable
  fail:
    msg: "ansible_user == 'root' is not supported, please ssh via normal user"
  when: ansible_user == 'root'

# Fails for both an undefined and an empty timezone.
- name: Ensure timezone variable is set
  fail:
    msg: "Please set timezone variable in inventory.ini."
  when: (timezone is undefined) or (timezone is defined and timezone == "")

# Best effort: the result of pkill never fails or changes the play.
- name: Close old SSH control master processes
  shell: pkill -f "ssh.*ansible.*"
  ignore_errors: true
  changed_when: false
  failed_when: false

# `version_compare` is applied as a Jinja2 test (`is ...`) instead of a filter:
# using tests as filters is deprecated since Ansible 2.5 and removed in 2.9,
# whereas the `version_compare` test name is accepted by old and new releases.
- name: Check ansible version
  fail:
    msg: "Stop if ansible version is too low, make sure that the Ansible version is 2.4.2 or later, otherwise a compatibility issue occurs. Current ansible version is {{ ansible_version.full }}"
  when: ansible_version.full is version_compare('2.4.2', '<')

# Both probes print the number of matching lines in `pip list` (0 = missing).
- name: Check if jmespath installed
  shell: pip list | grep -iw jmespath | wc -l
  register: jmespath_exist

- name: Check if jinja2 installed
  shell: pip list | grep -iw jinja2 | wc -l
  register: jinja2_exist

- name: Preflight check - Fail when jmespath or jinja2 isn't installed
  fail:
    msg: "Jmespath or Jinja2 does not exist, Please run `pip install -r requirements.txt` to install."
when: jmespath_exist.stdout | int == 0 or jinja2_exist.stdout | int == 0

# Extract the installed version from the `Version:` line of `pip show`.
- name: Get jmespath info
  shell: pip show jmespath | grep Version |grep -v Metadata-Version
  register: jmespath

- name: Get jmespath version
  set_fact:
    jmespath_version: "{{ jmespath.stdout_lines[0] | replace('Version: ', '') }}"

- name: Get jinja2 info
  shell: pip show jinja2 | grep Version |grep -v Metadata-Version
  register: jinja2

- name: Get jinja2 version
  set_fact:
    jinja2_version: "{{ jinja2.stdout_lines[0] | replace('Version: ', '') }}"

# `version_compare` is applied as a Jinja2 test (`is ...`) instead of a filter:
# using tests as filters is deprecated since Ansible 2.5 and removed in 2.9,
# whereas the `version_compare` test name is accepted by old and new releases.
- name: Preflight check - Fail when the versions of jmespath and jinja2 doesn't meet the requirements
  fail:
    msg: "Jmespath({{ jmespath_version }}) or jinja2({{ jinja2_version }}) version is too low, Please run `pip install --upgrade -r requirements.txt` to upgrade."
  when: jmespath_version is version_compare('0.9.0', '<') or jinja2_version is version_compare('2.9.6', '<')

# The checker script is expected to print 'Check ok' on success.
- name: Check inventory configuration
  shell: python2 {{ playbook_dir }}/scripts/inventory_check.py {{ ansible_inventory_sources.0 }}
  register: inventory_check_result

- name: Preflight check - If the inventory configuration is correct
  fail:
    msg: "{{ inventory_check_result.stdout }}"
  when: "'Check ok' not in inventory_check_result.stdout"

================================================
FILE: roles/check_config_tidb/tasks/main.yml
================================================
---

# Scratch directory for the check, and the address of the first TiDB host
# (ansible_host, falling back to its inventory hostname).
- set_fact:
    tidb_check_dir: "/tmp/tidb_check_config"
    tidb_host: "{{ hostvars[groups.tidb_servers[0]].ansible_host | default(hostvars[groups.tidb_servers[0]].inventory_hostname) }}"

- name: Create temporary check directory
  file:
    name: "{{ tidb_check_dir }}"
    state: directory

- name: Load TiDB default vars
  include_vars:
    file: "{{ playbook_dir }}/roles/tidb/defaults/main.yml"
    name: tidb_vars_check

- name: Load TiDB group vars
  include_vars:
    file: "{{ playbook_dir }}/group_vars/tidb_servers.yml"
    name: tidb_vars_check

- name: "Load customized config: tidb-ansible/conf/tidb.yml"
  include_vars: file={{ playbook_dir }}/conf/tidb.yml
name=tidb_conf_custom_check

- name: Load default config
  include_vars:
    file: "{{ playbook_dir }}/roles/tidb/vars/default.yml"
    name: tidb_conf_default_check

# TLS paths are only filled in when enable_tls is set; otherwise they render
# as empty strings.
- name: generate dynamic config
  set_fact:
    tidb_conf_generated_check:
      security:
        cluster-ssl-ca: >-
          {%- if enable_tls|default(false) -%}{{ tidb_cert_dir }}/ca.pem{%- else -%}{%- endif -%}
        cluster-ssl-cert: >-
          {%- if enable_tls|default(false) -%}{{ tidb_cert_dir }}/tidb-server-{{ tidb_host }}.pem{%- else -%}{%- endif -%}
        cluster-ssl-key: >-
          {%- if enable_tls|default(false) -%}{{ tidb_cert_dir }}/tidb-server-{{ tidb_host }}-key.pem{%- else -%}{%- endif -%}

# Combine the customized, generated and default dictionaries.
- name: Generate final config
  set_fact:
    tidb_conf: "{{ tidb_conf_custom_check | with_default_dicts(tidb_conf_generated_check, tidb_conf_default_check) | update_default_dicts }}"

- name: Create configuration file
  template:
    src: "{{ playbook_dir }}/roles/tidb/templates/tidb.toml.j2"
    dest: "{{ tidb_check_dir }}/tidb.toml"
    mode: "0644"
    backup: yes

- name: Deploy TiDB binary
  copy:
    src: "{{ resources_dir }}/bin/tidb-server"
    dest: "{{ tidb_check_dir }}/"
    mode: "0755"
    backup: yes

# Let the server binary validate the rendered configuration.
- name: Check TiDB config
  shell: cd {{ tidb_check_dir }} && ./tidb-server -config ./tidb.toml -config-check
  register: tidb_check_result

- name: Delete temporary check directory
  file:
    name: "{{ tidb_check_dir }}"
    state: absent

- name: Check result
  fail:
    msg: "TiDB config error"
  when: "'successful' not in tidb_check_result.stdout"

================================================
FILE: roles/check_config_tikv/tasks/main.yml
================================================
---

# Scratch directory for the check, and the address of the first TiKV host
# (ansible_host, falling back to its inventory hostname).
- set_fact:
    tidb_check_dir: "/tmp/tidb_check_config"
    tikv_host: "{{ hostvars[groups.tikv_servers[0]].ansible_host | default(hostvars[groups.tikv_servers[0]].inventory_hostname) }}"

- name: Create temporary check directory
  file:
    name: "{{ tidb_check_dir }}"
    state: directory

- set_fact:
    tikv_log_dir: "{{ deploy_dir }}/log"

- name: Load TiKV vars
  include_vars:
    file: "{{ playbook_dir }}/roles/tikv/defaults/main.yml"
    name: tikv_vars_check

- name: "Load customized
config: tidb-ansible/conf/tikv.yml"
  include_vars: file={{ playbook_dir }}/conf/tikv.yml name=tikv_conf_custom_check

- name: Load default config
  include_vars:
    file: "{{ playbook_dir }}/roles/tikv/vars/default.yml"
    name: tikv_conf_default_check

# Values taken from the role defaults, plus TLS paths that are only filled in
# when enable_tls is set (otherwise they render as empty strings).
- name: generate dynamic config
  set_fact:
    tikv_conf_generated_check:
      server:
        labels: "{{ tikv_vars_check.labels }}"
      rocksdb:
        wal-dir: "{{ tikv_vars_check.wal_dir }}"
      raftstore:
        raftdb-path: "{{ tikv_vars_check.raftdb_path }}"
      security:
        ca-path: >-
          {%- if enable_tls|default(false) -%}{{ tikv_cert_dir }}/ca.pem{%- else -%}{%- endif -%}
        cert-path: >-
          {%- if enable_tls|default(false) -%}{{ tikv_cert_dir }}/tikv-server-{{ tikv_host }}.pem{%- else -%}{%- endif -%}
        key-path: >-
          {%- if enable_tls|default(false) -%}{{ tikv_cert_dir }}/tikv-server-{{ tikv_host }}-key.pem{%- else -%}{%- endif -%}

# Combine the customized, generated and default dictionaries.
- name: Generate final config
  set_fact:
    tikv_conf: "{{ tikv_conf_custom_check | with_default_dicts(tikv_conf_generated_check, tikv_conf_default_check) | update_default_dicts }}"

- name: Create configuration file
  template:
    src: "{{ playbook_dir }}/roles/tikv/templates/tikv.toml.j2"
    dest: "{{ tidb_check_dir }}/tikv.toml"
    mode: "0644"
    backup: yes

- name: Deploy TiKV binary
  copy:
    src: "{{ resources_dir }}/bin/tikv-server"
    dest: "{{ tidb_check_dir }}/"
    mode: "0755"
    backup: yes

# Let the server binary validate the rendered configuration.
- name: Check TiKV config
  shell: cd {{ tidb_check_dir }} && ./tikv-server --pd-endpoints pd:port --config ./tikv.toml --config-check
  register: tikv_check_result

- name: Delete temporary check directory
  file:
    name: "{{ tidb_check_dir }}"
    state: absent

- name: Check result
  fail:
    msg: "TiKV config error"
  when: "'successful' not in tikv_check_result.stdout"

================================================
FILE: roles/check_system_dynamic/defaults/main.yml
================================================
---

# ulimit -n, hard-coded in startup scripts
min_open_fds: 1000000

================================================
FILE: roles/check_system_dynamic/tasks/main.yml
================================================
---

# A "100%" anywhere in the last line of `df -h .` is treated as a full disk.
- name: Disk space check - Fail task when disk is full
  shell: df -h . | tail -n1
  register: disk_space_st
  changed_when: false
  failed_when: "'100%' in disk_space_st.stdout"

- name: get facts
  setup:
    gather_subset: hardware

# Build the list of ansible_hostname values for every monitored server; only
# done when the play covers as many hosts as the monitored_servers group.
- name: Preflight check - Get hostnames of all nodes in cluster
  set_fact:
    all_hostnames: |-
      [
      {% set all_hosts = groups['monitored_servers']|unique|sort -%}
      {% for host in all_hosts -%}
        {% set hostname = hostvars[host].ansible_hostname -%}
        "{{ hostname }}",
      {% endfor %}
      ]
  run_once: true
  when: ansible_play_hosts | length == groups['monitored_servers'] | length

# Duplicated hostnames make the unique count smaller than the group size.
- name: Preflight check - Does every node in cluster have different hostname
  fail:
    msg: 'hostnames of all nodes in cluster: {{ all_hostnames | to_yaml }}'
  run_once: true
  when: >-
    ansible_play_hosts | length == groups['monitored_servers'] | length
    and all_hostnames | unique | length != groups['monitored_servers'] | length

# Counts the ntpstat output lines containing the word "synchronised".
- name: Preflight check - Get NTP service status
  shell: ntpstat | grep -w synchronised | wc -l
  register: ntp_st
  changed_when: false
  when: enable_ntpd

- name: Preflight check - NTP service
  fail:
    msg: "Make sure NTP service is running and ntpstat is synchronised to NTP server. See https://github.com/pingcap/docs/blob/master/online-deployment-using-ansible.md#how-to-check-whether-the-ntp-service-is-normal."
  when: enable_ntpd and ntp_st.stdout|int != 1

- name: Preflight check - Get umask
  shell: umask
  register: umask
  changed_when: false

# Only the last two digits of the umask are inspected.
- name: Preflight check - Does the system have a standard umask
  fail:
    msg: 'The umask of the system ({{ umask.stdout.strip() }}) prevents successful installation. We suggest a standard umask such as 0022.'
when: umask.stdout.strip()[-2:] not in ('00', '02', '20', '22')

# Hard limit on open file descriptors for the connecting user.
- name: Preflight check - Get maximum number of open file descriptors limit
  shell: ulimit -H -n
  register: ulimit
  changed_when: false

- name: Preflight check - ulimit -n
  fail:
    msg: "The default maximum number of open file descriptors is too low {{ ulimit.stdout }}, should be {{ min_open_fds }}"
  when: ulimit.stdout|int < min_open_fds|int

- name: Preflight check - Check swap
  fail:
    msg: "Swap is on, for best performance, turn swap off"
  when: ansible_swaptotal_mb != 0

================================================
FILE: roles/check_system_optional/defaults/main.yml
================================================
---
# Minimum hardware expected per server role.

# CPU
tidb_min_cpu: 8
tikv_min_cpu: 8
pd_min_cpu: 4
monitor_min_cpu: 4

# Mem
tidb_min_ram: 16000
tikv_min_ram: 16000
pd_min_ram: 8000
monitor_min_ram: 8000

# Disk
tidb_min_disk: 500000000000
tikv_min_disk: 500000000000
pd_min_disk: 200000000000
monitor_min_disk: 500000000000

================================================
FILE: roles/check_system_optional/tasks/main.yml
================================================
---

# Each CPU check applies only to hosts in the matching inventory group.
- name: Preflight check - Check TiDB server's CPU
  fail:
    msg: "This machine does not have sufficient CPU to run TiDB, at least {{ tidb_min_cpu }} cores."
  when: "'tidb_servers' in group_names and ansible_processor_vcpus < tidb_min_cpu|int"

- name: Preflight check - Check TiKV server's CPU
  fail:
    msg: "This machine does not have sufficient CPU to run TiKV, at least {{ tikv_min_cpu }} cores."
  when: "'tikv_servers' in group_names and ansible_processor_vcpus < tikv_min_cpu|int"

- name: Preflight check - Check PD server's CPU
  fail:
    msg: "This machine does not have sufficient CPU to run PD, at least {{ pd_min_cpu }} cores."
  when: "'pd_servers' in group_names and ansible_processor_vcpus < pd_min_cpu|int"

- name: Preflight check - Check Monitor server's CPU
  fail:
    msg: "This machine does not have sufficient CPU to run Monitor, at least {{ monitor_min_cpu }} cores."
when:
    - "'monitoring_servers' in group_names"
    - ansible_processor_vcpus < monitor_min_cpu|int

# Each RAM check applies only to hosts in the matching inventory group.
- name: Preflight check - Check TiDB server's RAM
  fail:
    msg: "This machine does not have sufficient RAM to run TiDB, at least {{ tidb_min_ram }} MB."
  when: "'tidb_servers' in group_names and ansible_memtotal_mb < tidb_min_ram|int"

- name: Preflight check - Check TiKV server's RAM
  fail:
    msg: "This machine does not have sufficient RAM to run TiKV, at least {{ tikv_min_ram }} MB."
  when: "'tikv_servers' in group_names and ansible_memtotal_mb < tikv_min_ram|int"

- name: Preflight check - Check PD server's RAM
  fail:
    msg: "This machine does not have sufficient RAM to run PD, at least {{ pd_min_ram }} MB."
  when: "'pd_servers' in group_names and ansible_memtotal_mb < pd_min_ram|int"

- name: Preflight check - Check Monitor server's RAM
  fail:
    msg: "This machine does not have sufficient RAM to run Monitor, at least {{ monitor_min_ram }} MB."
  when: "'monitoring_servers' in group_names and ansible_memtotal_mb < monitor_min_ram|int"

# Fall back to a directory under the deploy user's home.
- name: Set deploy_dir if not presented
  set_fact:
    deploy_dir: "/home/{{ deploy_user }}/deploy"
  when: deploy_dir is not defined

# Last column of the last `df` line, i.e. the mount point holding deploy_dir.
- name: Determine which mountpoint deploy dir exists on
  shell: "df {{ deploy_dir }} | tail -n1 | awk '{print $NF}'"
  register: deploy_partition
  changed_when: false

# Raise the alert flag when the mount holding deploy_dir has too little space.
- set_fact:
    tidb_disk_alert: "true"
  with_items: "{{ ansible_mounts }}"
  when: >-
    'tidb_servers' in group_names
    and item.mount == deploy_partition.stdout
    and item.size_available < tidb_min_disk|int

# Reported with debug, so an undersized disk only prints a message.
- name: Preflight check - Check TiDB server's disk space
  debug:
    msg: 'The file system mounted at {{ deploy_partition.stdout }} does not meet minimum disk space requirement: at least {{ tidb_min_disk/1000000000 }} GB.'
when:
    - tidb_disk_alert is defined
    - tidb_disk_alert
    - "'tidb_servers' in group_names"

# Raise the alert flag when the mount holding deploy_dir has too little space.
- set_fact:
    tikv_disk_alert: "true"
  with_items: "{{ ansible_mounts }}"
  when: >-
    'tikv_servers' in group_names
    and item.mount == deploy_partition.stdout
    and item.size_available < tikv_min_disk|int

# Reported with debug, so an undersized disk only prints a message.
- name: Preflight check - Check TiKV server's disk space
  debug:
    msg: 'The file system mounted at {{ deploy_partition.stdout }} does not meet minimum disk space requirement: at least {{ tikv_min_disk/1000000000 }} GB.'
  when: >-
    tikv_disk_alert is defined
    and tikv_disk_alert
    and 'tikv_servers' in group_names

- set_fact:
    pd_disk_alert: "true"
  with_items: "{{ ansible_mounts }}"
  when: >-
    'pd_servers' in group_names
    and item.mount == deploy_partition.stdout
    and item.size_available < pd_min_disk|int

- name: Preflight check - Check PD server's disk space
  debug:
    msg: 'The file system mounted at {{ deploy_partition.stdout }} does not meet minimum disk space requirement: at least {{ pd_min_disk/1000000000 }} GB.'
  when: >-
    pd_disk_alert is defined
    and pd_disk_alert
    and 'pd_servers' in group_names

- set_fact:
    monitor_disk_alert: "true"
  with_items: "{{ ansible_mounts }}"
  when: >-
    'monitoring_servers' in group_names
    and item.mount == deploy_partition.stdout
    and item.size_available < monitor_min_disk|int

- name: Preflight check - Check Monitor server's disk space
  debug:
    msg: 'The file system mounted at {{ deploy_partition.stdout }} does not meet minimum disk space requirement: at least {{ monitor_min_disk/1000000000 }} GB.'
  when: >-
    monitor_disk_alert is defined
    and monitor_disk_alert
    and 'monitoring_servers' in group_names

================================================
FILE: roles/check_system_static/tasks/main.yml
================================================
---

- name: Disk space check - Fail task when disk is full
  shell: df -h .
| tail -n1
  register: disk_space_st
  failed_when: " '100%' in disk_space_st.stdout "
  changed_when: false

- name: get facts
  setup:
    gather_subset: hardware
    gather_timeout: 30

# On CentOS / Red Hat only major version 7 is accepted.
- name: Preflight check - Linux OS family and distribution version
  fail:
    msg: "System versions lower than Red Hat Enterprise Linux / CentOS 7.3 have been deprecated. Please use CentOS 7.3 and above. See https://github.com/pingcap/docs/blob/master/hardware-and-software-requirements.md."
  when:
    - ansible_os_family == 'RedHat'
    - ansible_distribution in ['CentOS', 'RedHat']
    - ansible_distribution_major_version != '7'

# Query yum for the systemd package; the installed entry is picked out below.
- name: Get systemd version
  yum:
    list: systemd
  register: systemd_info
  when: ansible_os_family == 'RedHat'

# Both facts are lists (json_query result); index 0 is used afterwards.
- set_fact:
    systemd_version: "{{ systemd_info.results | json_query(query) }}"
  vars:
    query: "[?yumstate=='installed'].version"
  when: ansible_os_family == 'RedHat'

- set_fact:
    systemd_release: "{{ systemd_info.results | json_query(query) }}"
  vars:
    query: "[?yumstate=='installed'].release"
  when: ansible_os_family == 'RedHat'

# Version and release are compared as integers. A plain string comparison is
# lexicographic and misorders values with a different number of digits
# (e.g. '9.el7' sorts after '52.el7'). Only the leading numeric component of
# the release (the part before the first dot) takes part in the comparison.
- name: Preflight check - Systemd version
  fail:
    msg: "Current systemd version is {{ systemd_version.0 }}-{{ systemd_release.0 }} (below 219-52.el7), there are some memory bugs. Refer to https://access.redhat.com/discussions/3536621"
  when:
    - ansible_os_family == 'RedHat'
    - "(systemd_version[0] | int) < 219 or ((systemd_version[0] | int) == 219 and (systemd_release[0].split('.')[0] | int) < 52)"

# The helper is copied to the host, executed, and removed again.
- name: Deploy epollexclusive script
  copy:
    src: "{{ script_dir }}/check/epollexclusive-{{ cpu_architecture }}"
    dest: "{{ deploy_dir }}/epollexclusive-{{ cpu_architecture }}"
    mode: "0755"

- name: Preflight check - Check if the operating system supports EPOLLEXCLUSIVE
  shell: "{{ deploy_dir }}/epollexclusive-{{ cpu_architecture }}"
  register: epollexclusive_check

- name: Clean epollexclusive script
  file:
    path: "{{ deploy_dir }}/epollexclusive-{{ cpu_architecture }}"
    state: absent

# The helper's output must contain "True" for the check to pass.
- name: Preflight check - Fail when epollexclusive is unavailable
  fail:
    msg: "The current machine may be a docker virtual machine, and the corresponding physical machine operating system does not support epollexclusive, please upgrade the Linux kernel (the minimum version is 3.10.0-386.el7)"
  when: epollexclusive_check.stdout.find("True") == -1

- name: Deploy check_cpufreq script
  copy:
    src: "{{ script_dir }}/check/check_cpufreq.py"
    dest: "{{ deploy_dir }}/check_cpufreq.py"
    mode: "0755"

- name: Preflight check - Check CPUfreq governors available in the kernel
  shell: "python {{ deploy_dir }}/check_cpufreq.py --available-governors"
  register: cpufreq_available_governors

- name: Preflight check - Check the currently active governor
  shell: "python {{ deploy_dir }}/check_cpufreq.py --current-governor"
  register: cpufreq_current_governor

- name: Preflight check - Fail when CPU frequency governor is not set to performance mode
  fail:
    msg: "To achieve maximum performance, it is recommended to set The CPU frequency governor to performance mode, see https://github.com/pingcap/docs/blob/master/online-deployment-using-ansible.md#step-7-configure-the-cpufreq-governor-mode-on-the-target-machine."
when: - cpufreq_available_governors.stdout.find("performance") != -1 - cpufreq_current_governor.stdout.find("performance") == -1 - name: Clean check_cpufreq script file: path={{ deploy_dir }}/check_cpufreq.py state=absent - name: Preflight check - Check Linux kernel overcommit_memory parameter shell: "sysctl -n vm.overcommit_memory" become: true register: vm_overcommit_memory - name: Preflight check - Fail when Linux kernel vm.overcommit_memory parameter is set to 2 fail: msg: "It is not recommended to set vm.overcommit_memory to 2, set it to 0 or 1." when: vm_overcommit_memory.stdout | int == 2 ================================================ FILE: roles/clean_log_pd/tasks/add_cron.yml ================================================ --- - name: add crontab cron: name: "pd-{{ pd_client_port }}" user: "{{ ansible_user }}" minute: 0 state: present job: 'find {{ pd_log_dir }} -type f -name "pd*.log" -mtime +{{ log_retain_days }} -exec rm -f {} \;' ================================================ FILE: roles/clean_log_pd/tasks/del_cron.yml ================================================ --- - name: delete crontab if exist cron: name: 'pd-{{ pd_client_port }}' user: "{{ ansible_user }}" state: absent ================================================ FILE: roles/clean_log_pd/tasks/main.yml ================================================ --- - include_tasks: add_cron.yml when: - enable_log_clean|default(false) - include_tasks: del_cron.yml when: - not enable_log_clean|default(false) - name: restart crond become: true systemd: name: crond state: restarted daemon_reload: yes ================================================ FILE: roles/clean_log_tidb/tasks/add_cron.yml ================================================ --- - name: add crontab cron: name: "tidb-{{ tidb_port }}" user: "{{ ansible_user }}" minute: 0 state: present job: 'find {{ tidb_log_dir }} -type f -name "tidb*.log" -mtime +{{ log_retain_days }} -exec rm -f {} \;' 

================================================
FILE: roles/clean_log_tidb/tasks/del_cron.yml
================================================
---

# Drop the TiDB log-cleanup cron entry (matched by name).
- name: delete crontab if exist
  cron:
    name: "tidb-{{ tidb_port }}"
    user: "{{ ansible_user }}"
    state: absent

================================================
FILE: roles/clean_log_tidb/tasks/main.yml
================================================
---

# enable_log_clean (default: false) decides whether the cron entry is added
# or removed; crond is restarted afterwards either way.
- include_tasks: add_cron.yml
  when:
    - enable_log_clean|default(false)

- include_tasks: del_cron.yml
  when:
    - not enable_log_clean|default(false)

- name: restart crond
  become: true
  systemd:
    name: crond
    state: restarted
    daemon_reload: true

================================================
FILE: roles/clean_log_tikv/tasks/add_cron.yml
================================================
---

# Cron entry (minute 0) deleting TiKV log files older than log_retain_days days.
- name: add crontab
  cron:
    name: "tikv-{{ tikv_port }}"
    user: "{{ ansible_user }}"
    minute: "0"
    state: present
    job: 'find {{ tikv_log_dir }} -type f -name "tikv.log*" -mtime +{{ log_retain_days }} -exec rm -f {} \;'

================================================
FILE: roles/clean_log_tikv/tasks/del_cron.yml
================================================
---

# Drop the TiKV log-cleanup cron entry (matched by name).
- name: delete crontab if exist
  cron:
    name: "tikv-{{ tikv_port }}"
    user: "{{ ansible_user }}"
    state: absent

================================================
FILE: roles/clean_log_tikv/tasks/main.yml
================================================
---

# Same add / remove / restart sequence as the other clean_log_* roles.
- include_tasks: add_cron.yml
  when:
    - enable_log_clean|default(false)

- include_tasks: del_cron.yml
  when:
    - not enable_log_clean|default(false)

- name: restart crond
  become: true
  systemd:
    name: crond
    state: restarted
    daemon_reload: true

================================================
FILE: roles/collect_diagnosis/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/collect_diagnosis/tasks/main.yml
================================================
---

# Unpack the tidb-insight tarball from downloads_dir into the scripts directory.
- name: uncompress tidb-insight scripts
  unarchive:
    src: "{{ downloads_dir }}/tidb-insight.tar.gz"
    dest: "{{ deploy_dir }}/scripts/"
    mode: "0755"

================================================
FILE: roles/collector_host/tasks/collect_log.yml
================================================
---

# Abort unless node_exporter_log_dir is an existing directory, then let
# insight.py collect the syslog into it.
- name: check node_exporter log directory
  stat:
    path: "{{ node_exporter_log_dir }}"
    get_md5: false
    get_checksum: false
  register: log_dir_st

- fail:
    msg: "{{ node_exporter_log_dir }} must exist and is a directory"
  when: not (log_dir_st.stat.isdir | default(false))

- name: collect system log
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ node_exporter_log_dir }} --alias={{ inventory_hostname }} log --syslog --retention={{ collect_log_recent_hours | default('2') }}"
  become: true

================================================
FILE: roles/collector_host/tasks/main.yml
================================================
---

# collector_dir is the deploy_dir of the first monitored server;
# service_host is this host's ansible_host.
- set_fact:
    collector_dir: "{{ hostvars[groups.monitored_servers[0]].deploy_dir }}"
    service_host: "{{ ansible_host }}"

# Per-host download directory on the control machine.
- name: create fetch directory
  delegate_to: localhost
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ fetch_tmp_dir }}/{{ service_host }}"

- name: collect basic system information
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ node_exporter_log_dir }} --alias={{ inventory_hostname }} system --collector"
  become: true

- include_tasks: collect_log.yml

- name: compress collected data
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ node_exporter_log_dir }} --alias={{ inventory_hostname }} archive"
  become: true

# Exactly one of the two fetch tasks runs, chosen by enable_bandwidth_limit
# (default: true): scp with '-l' from the control machine, or the fetch module.
- name: fetch host diagnosis tarball with bandwidth limit
  delegate_to: localhost
  shell: "scp -P {{ ansible_port|default(22) }} -l {{ collect_bandwidth_limit|default('10000') }} -o StrictHostKeyChecking=no {{ service_host }}:{{ node_exporter_log_dir }}/{{ inventory_hostname }}.tar.gz {{ fetch_tmp_dir }}/{{ service_host }}/{{ inventory_hostname }}.tar.gz"
  when: enable_bandwidth_limit|default(true)

- name: fetch host diagnosis tarball without bandwidth limit
  fetch:
    src: "{{ node_exporter_log_dir }}/{{ inventory_hostname }}.tar.gz"
    dest: "{{ fetch_tmp_dir }}/{{ service_host }}/{{ inventory_hostname }}.tar.gz"
    flat: true
  when: not enable_bandwidth_limit|default(true)

- name: remove host system temporary diagnosis tarball
  file:
    path: "{{ node_exporter_log_dir }}/{{ item }}"
    state: absent
  with_items:
    - "{{ inventory_hostname }}.tar.gz"
  become: true

================================================
FILE: roles/collector_pd/tasks/collect_config.yml
================================================
---

# Default pd_conf_dir to <deploy_dir>/conf when the inventory does not set it.
- name: check pd config path
  set_fact:
    pd_conf_dir: "{{ deploy_dir }}/conf"
  when: pd_conf_dir is undefined

- name: check pd config directory
  stat:
    path: "{{ pd_conf_dir }}"
    get_md5: false
    get_checksum: false
  register: conf_dir_st

- fail:
    msg: "{{ pd_conf_dir }} must exist and is a directory"
  when: not (conf_dir_st.stat.isdir | default(false))

- name: collect pd config
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ pd_log_dir }} --alias=pd_{{ inventory_hostname }} config --dir={{ pd_conf_dir }} --prefix=pd"

================================================
FILE: roles/collector_pd/tasks/collect_log.yml
================================================
---

# Abort unless pd_log_dir is an existing directory, then collect recent PD logs.
- name: check pd log directory
  stat:
    path: "{{ pd_log_dir }}"
    get_md5: false
    get_checksum: false
  register: log_dir_st

- fail:
    msg: "{{ pd_log_dir }} must exist and is a directory"
  when: not (log_dir_st.stat.isdir | default(false))

- name: collect pd log
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ pd_log_dir }} --alias=pd_{{ inventory_hostname }} log --dir={{ pd_log_dir }} --prefix=pd --retention={{ collect_log_recent_hours | default('2') }}"

================================================
FILE: roles/collector_pd/tasks/main.yml
================================================
---

# collector_dir is the deploy_dir of the first monitored server;
# service_host is this host's ansible_host.
- set_fact:
    collector_dir: "{{ hostvars[groups.monitored_servers[0]].deploy_dir }}"
    service_host: "{{ ansible_host }}"

- name: create pd fetch directory
  delegate_to: localhost
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ fetch_tmp_dir }}/{{ service_host }}"

- include_tasks: collect_log.yml

- include_tasks: collect_config.yml

- name: collect PD process information
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ pd_log_dir }} --alias=pd_{{ inventory_hostname }} system --collector --port {{ pd_client_port }}"
  become: true

- name: collect PD information
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ pd_log_dir }} --alias=pd_{{ inventory_hostname }} tidb pdctl --host={{ ansible_host }} --port={{ pd_client_port }}"

- name: compress collected data
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ pd_log_dir }} --alias=pd_{{ inventory_hostname }} archive"

# Exactly one of the two fetch tasks runs, chosen by enable_bandwidth_limit
# (default: true).
- name: fetch pd diagnosis tarball with bandwidth limit
  delegate_to: localhost
  shell: "scp -P {{ ansible_port|default(22) }} -l {{ collect_bandwidth_limit|default('10000') }} -o StrictHostKeyChecking=no {{ service_host }}:{{ pd_log_dir }}/pd_{{ inventory_hostname }}.tar.gz {{ fetch_tmp_dir }}/{{ service_host }}/pd_{{ inventory_hostname }}.tar.gz"
  when: enable_bandwidth_limit|default(true)

- name: fetch pd diagnosis tarball without bandwidth limit
  fetch:
    src: "{{ pd_log_dir }}/pd_{{ inventory_hostname }}.tar.gz"
    dest: "{{ fetch_tmp_dir }}/{{ service_host }}/pd_{{ inventory_hostname }}.tar.gz"
    flat: true
  when: not enable_bandwidth_limit|default(true)

- name: remove pd temporary diagnosis tarball
  file:
    path: "{{ pd_log_dir }}/{{ item }}"
    state: absent
  with_items:
    - "pd_{{ inventory_hostname }}.tar.gz"

================================================
FILE: roles/collector_prometheus/tasks/main.yml
================================================
---

- set_fact:
    collector_dir: "{{ hostvars[groups.monitored_servers[0]].deploy_dir }}"
    service_host: "{{ ansible_host }}"

- name: create prometheus fetch directory
  delegate_to: localhost
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ fetch_tmp_dir }}/{{ service_host }}"

# Metrics are written under <deploy_dir>/log on the Prometheus host.
- name: collect Prometheus metrics
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ deploy_dir }}/log --alias={{ inventory_hostname }} metric prom --host={{ ansible_host }} --port={{ prometheus_port }} --retention {{ collect_log_recent_hours | default('2') }}"

- name: compress collected data
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ deploy_dir }}/log --alias={{ inventory_hostname }} archive"

# The local copy is renamed with a "prometheus_" prefix.
- name: fetch prometheus diagnosis tarball with bandwidth limit
  delegate_to: localhost
  shell: "scp -P {{ ansible_port|default(22) }} -l {{ collect_bandwidth_limit|default('10000') }} -o StrictHostKeyChecking=no {{ service_host }}:{{ deploy_dir }}/log/{{ inventory_hostname }}.tar.gz {{ fetch_tmp_dir }}/{{ service_host }}/prometheus_{{ inventory_hostname }}.tar.gz"
  when: enable_bandwidth_limit|default(true)

- name: fetch prometheus diagnosis tarball without bandwidth limit
  fetch:
    src: "{{ deploy_dir }}/log/{{ inventory_hostname }}.tar.gz"
    dest: "{{ fetch_tmp_dir }}/{{ service_host }}/prometheus_{{ inventory_hostname }}.tar.gz"
    flat: true
  when: not enable_bandwidth_limit|default(true)

- name: remove prometheus temporary diagnosis tarball
  file:
    path: "{{ deploy_dir }}/log/{{ item }}"
    state: absent
  with_items:
    - "{{ inventory_hostname }}.tar.gz"

================================================
FILE: roles/collector_pump/tasks/collect_log.yml
================================================
---

# Abort unless pump_log_dir is an existing directory, then collect recent pump logs.
- name: check pump log directory
  stat:
    path: "{{ pump_log_dir }}"
    get_md5: false
    get_checksum: false
  register: log_dir_st

- fail:
    msg: "{{ pump_log_dir }} must exist and is a directory"
  when: not (log_dir_st.stat.isdir | default(false))

- name: collect pump log
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ pump_log_dir }} --alias=pump_{{ inventory_hostname }} log --dir={{ pump_log_dir }} --prefix=pump --retention={{ collect_log_recent_hours | default('2') }}"

================================================
FILE: roles/collector_pump/tasks/main.yml
================================================
---

- set_fact:
    collector_dir: "{{ hostvars[groups.monitored_servers[0]].deploy_dir }}"
    service_host: "{{ ansible_host }}"

- name: create pump fetch directory
  delegate_to: localhost
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ fetch_tmp_dir }}/{{ service_host }}"

- include_tasks: collect_log.yml

- name: compress collected data
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ pump_log_dir }} --alias=pump_{{ inventory_hostname }} archive"

- name: fetch pump diagnosis tarball with bandwidth limit
  delegate_to: localhost
  shell: "scp -P {{ ansible_port|default(22) }} -l {{ collect_bandwidth_limit|default('10000') }} -o StrictHostKeyChecking=no {{ service_host }}:{{ pump_log_dir }}/pump_{{ inventory_hostname }}.tar.gz {{ fetch_tmp_dir }}/{{ service_host }}/pump_{{ inventory_hostname }}.tar.gz"
  when: enable_bandwidth_limit|default(true)

- name: fetch pump diagnosis tarball without bandwidth limit
  fetch:
    src: "{{ pump_log_dir }}/pump_{{ inventory_hostname }}.tar.gz"
    dest: "{{ fetch_tmp_dir }}/{{ service_host }}/pump_{{ inventory_hostname }}.tar.gz"
    flat: true
  when: not enable_bandwidth_limit|default(true)

- name: remove pump temporary diagnosis tarball
  file:
    path: "{{ pump_log_dir }}/{{ item }}"
    state: absent
  with_items:
    - "pump_{{ inventory_hostname }}.tar.gz"

================================================
FILE: roles/collector_tidb/tasks/collect_config.yml
================================================
---

# Default tidb_conf_dir to <deploy_dir>/conf when the inventory does not set it.
- name: check tidb config path
  set_fact:
    tidb_conf_dir: "{{ deploy_dir }}/conf"
  when: tidb_conf_dir is undefined

- name: check tidb config directory
  stat:
    path: "{{ tidb_conf_dir }}"
    get_md5: false
    get_checksum: false
  register: conf_dir_st

- fail:
    msg: "{{ tidb_conf_dir }} must exist and is a directory"
  when: not (conf_dir_st.stat.isdir | default(false))

- name: collect tidb config
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tidb_log_dir }} --alias=tidb_{{ inventory_hostname }} config --dir={{ tidb_conf_dir }} --prefix=tidb"

================================================
FILE: roles/collector_tidb/tasks/collect_log.yml
================================================
---

# Abort unless tidb_log_dir is an existing directory, then collect recent TiDB logs.
- name: check tidb log directory
  stat:
    path: "{{ tidb_log_dir }}"
    get_md5: false
    get_checksum: false
  register: log_dir_st

- fail:
    msg: "{{ tidb_log_dir }} must exist and is a directory"
  when: not (log_dir_st.stat.isdir | default(false))

- name: collect tidb log
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tidb_log_dir }} --alias=tidb_{{ inventory_hostname }} log --dir={{ tidb_log_dir }} --prefix=tidb --retention={{ collect_log_recent_hours | default('2') }}"

================================================
FILE: roles/collector_tidb/tasks/main.yml
================================================
---

- set_fact:
    collector_dir: "{{ hostvars[groups.monitored_servers[0]].deploy_dir }}"
    service_host: "{{ ansible_host }}"

- name: create tidb fetch directory
  delegate_to: localhost
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ fetch_tmp_dir }}/{{ service_host }}"

- include_tasks: collect_log.yml

- include_tasks: collect_config.yml

- name: collect tidb process information
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tidb_log_dir }} --alias=tidb_{{ inventory_hostname }} system --collector --port {{ tidb_port }}"
  become: true

- name: collect tidb server information
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tidb_log_dir }} --alias=tidb_{{
inventory_hostname }} tidb tidbinfo"

- name: compress collected data
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tidb_log_dir }} --alias=tidb_{{ inventory_hostname }} archive"

# Exactly one of the two fetch tasks runs, chosen by enable_bandwidth_limit
# (default: true).
- name: fetch tidb diagnosis tarball with bandwidth limit
  delegate_to: localhost
  shell: "scp -P {{ ansible_port|default(22) }} -l {{ collect_bandwidth_limit|default('10000') }} -o StrictHostKeyChecking=no {{ service_host }}:{{ tidb_log_dir }}/tidb_{{ inventory_hostname }}.tar.gz {{ fetch_tmp_dir }}/{{ service_host }}/tidb_{{ inventory_hostname }}.tar.gz"
  when: enable_bandwidth_limit|default(true)

- name: fetch tidb diagnosis tarball without bandwidth limit
  fetch:
    src: "{{ tidb_log_dir }}/tidb_{{ inventory_hostname }}.tar.gz"
    dest: "{{ fetch_tmp_dir }}/{{ service_host }}/tidb_{{ inventory_hostname }}.tar.gz"
    flat: true
  when: not enable_bandwidth_limit|default(true)

- name: remove tidb temporary diagnosis tarball
  file:
    path: "{{ tidb_log_dir }}/{{ item }}"
    state: absent
  with_items:
    - "tidb_{{ inventory_hostname }}.tar.gz"

================================================
FILE: roles/collector_tikv/tasks/collect_config.yml
================================================
---

# Default tikv_conf_dir to <deploy_dir>/conf when the inventory does not set it.
- name: check tikv config path
  set_fact:
    tikv_conf_dir: "{{ deploy_dir }}/conf"
  when: tikv_conf_dir is undefined

- name: check tikv config directory
  stat:
    path: "{{ tikv_conf_dir }}"
    get_md5: false
    get_checksum: false
  register: conf_dir_st

- fail:
    msg: "{{ tikv_conf_dir }} must exist and is a directory"
  when: not (conf_dir_st.stat.isdir | default(false))

- name: collect tikv config
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tikv_log_dir }} --alias=tikv_{{ inventory_hostname }} config --dir={{ tikv_conf_dir }} --prefix=tikv"

================================================
FILE: roles/collector_tikv/tasks/collect_log.yml
================================================
---

# Abort unless tikv_log_dir is an existing directory, then collect recent TiKV logs.
- name: check tikv log directory
  stat:
    path: "{{ tikv_log_dir }}"
    get_md5: false
    get_checksum: false
  register: log_dir_st

- fail:
    msg: "{{ tikv_log_dir }} must exist and is a directory"
  when: not (log_dir_st.stat.isdir | default(false))

- name: collect tikv log
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tikv_log_dir }} --alias=tikv_{{ inventory_hostname }} log --dir={{ tikv_log_dir }} --prefix=tikv --retention={{ collect_log_recent_hours | default('2') }}"

================================================
FILE: roles/collector_tikv/tasks/main.yml
================================================
---

- set_fact:
    collector_dir: "{{ hostvars[groups.monitored_servers[0]].deploy_dir }}"
    service_host: "{{ ansible_host }}"

- name: create tikv fetch directory
  delegate_to: localhost
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ fetch_tmp_dir }}/{{ service_host }}"

- include_tasks: collect_log.yml

- include_tasks: collect_config.yml

- name: collect tikv process information
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tikv_log_dir }} --alias=tikv_{{ inventory_hostname }} system --collector --port {{ tikv_port }}"
  become: true

- name: compress collected data
  shell: "python {{ collector_dir }}/scripts/tidb-insight/insight.py --output={{ tikv_log_dir }} --alias=tikv_{{ inventory_hostname }} archive"

- name: fetch tikv diagnosis tarball with bandwidth limit
  delegate_to: localhost
  shell: "scp -P {{ ansible_port|default(22) }} -l {{ collect_bandwidth_limit|default('10000') }} -o StrictHostKeyChecking=no {{ service_host }}:{{ tikv_log_dir }}/tikv_{{ inventory_hostname }}.tar.gz {{ fetch_tmp_dir }}/{{ service_host }}/tikv_{{ inventory_hostname }}.tar.gz"
  when: enable_bandwidth_limit|default(true)

- name: fetch tikv diagnosis tarball without bandwidth limit
  fetch:
    src: "{{ tikv_log_dir }}/tikv_{{ inventory_hostname }}.tar.gz"
    dest: "{{ fetch_tmp_dir }}/{{ service_host }}/tikv_{{ inventory_hostname }}.tar.gz"
    flat: true
  when: not enable_bandwidth_limit|default(true)

- name: remove tikv temporary diagnosis tarball
  file:
    path: "{{ tikv_log_dir }}/{{ item }}"
    state: absent
  with_items:
    - "tikv_{{ inventory_hostname }}.tar.gz"

================================================
FILE: roles/common_dir/tasks/main.yml
================================================
---
# Common Tasks

# Directories every deployment needs: scripts, conf and the backup directory.
- name: create deploy directories
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ deploy_dir }}/scripts"
    - "{{ deploy_dir }}/conf"
    - "{{ backup_dir }}"

- name: create status directory
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ status_dir }}"

# bin/ is created for non-docker deployments, images_dir for docker ones.
- name: create deploy binary directory
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ deploy_dir }}/bin"
  when: deployment_method != 'docker'

- name: create docker image directory
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ images_dir }}"
  when: deployment_method == 'docker'

================================================
FILE: roles/dashboard_topo/tasks/main.yml
================================================
---

# Render the topology script into <playbook_dir>/scripts and run it from there.
- name: generate init_dashboard_topo script
  template:
    src: "init_dashboard_topo.sh.j2"
    dest: "{{ playbook_dir }}/scripts/dashboard_topo.sh"
    mode: "0755"

- name: init pd topo
  shell: "./dashboard_topo.sh"
  args:
    chdir: "{{ playbook_dir }}/scripts"

================================================
FILE: roles/dashboard_topo/templates/init_dashboard_topo.sh.j2
================================================
#!/bin/bash
set -e

{% set all_pd = [] -%}
{% set pd_hosts = groups.pd_servers %}
{% for host in pd_hosts -%}
  {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set pd_port = hostvars[host].pd_client_port -%}
  {% set pd_path = hostvars[host].deploy_dir -%}
  {% set _ = all_pd.append("%s:%s%s" % (pd_ip, pd_port, pd_path)) -%}
{% endfor -%}

{% set all_grafana = [] -%}
{% set grafana_hosts = groups.grafana_servers %}
{% for host in grafana_hosts -%}
  {% set
grafana_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set grafana_port = hostvars[host].grafana_port -%}
  {% set grafana_path = hostvars[host].deploy_dir -%}
  {% set _ = all_grafana.append("%s:%s%s" % (grafana_ip, grafana_port, grafana_path)) -%}
{% endfor -%}

{% set all_alertmanager = [] -%}
{% set alertmanager_hosts = groups.alertmanager_servers %}
{% for host in alertmanager_hosts -%}
  {% set alertmanager_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set alertmanager_port = hostvars[host].alertmanager_port -%}
  {% set alertmanager_path = hostvars[host].deploy_dir -%}
  {% set _ = all_alertmanager.append("%s:%s%s" % (alertmanager_ip, alertmanager_port, alertmanager_path)) -%}
{% endfor -%}

{% set all_prometheus = [] -%}
{% set prometheus_hosts = groups.monitoring_servers %}
{% for host in prometheus_hosts -%}
  {% set prometheus_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set prometheus_port = hostvars[host].prometheus_port -%}
  {% set prometheus_path = hostvars[host].deploy_dir -%}
  {% set _ = all_prometheus.append("%s:%s%s" % (prometheus_ip, prometheus_port, prometheus_path)) -%}
{% endfor -%}

{% set flag = "" %}
{% if all_grafana -%}
  {% set flag = flag + " --grafana " + ','.join(all_grafana) -%}
{% endif -%}
{% if all_alertmanager -%}
  {% set flag = flag + " --alertmanager " + ','.join(all_alertmanager) -%}
{% endif -%}
{% if all_prometheus %}
  {% set flag = flag + " --prometheus " + ','.join(all_prometheus) -%}
{% endif -%}

python2 dashboard_topo.py --pd {{ all_pd | join(',') }} {{ flag }}

================================================
FILE: roles/drainer/defaults/main.yml
================================================
---

drainer_log_dir: "{{ deploy_dir }}/log"
drainer_log_filename: "drainer.log"
drainer_stderr_filename: "drainer_stderr.log"
drainer_data_dir: "{{ deploy_dir }}/data.drainer"
pd_scheme: http

# systemd: Specifies when to restart the service.
restart: on-failure

================================================
FILE: roles/drainer/files/make-ssl.sh
================================================
#!/bin/bash

# Author: Smana smainklh@gmail.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Works in a throw-away temporary directory:
#   1. copies ca-config.json / ca-csr.json from SSLDIR (both must exist),
#   2. reuses ca.pem / ca-key.pem from SSLDIR or creates a new CA with cfssl,
#   3. creates a client certificate unless client-key.pem already exists,
#   4. creates one server certificate per entry of $HOSTS (named $CN-<host>),
#   5. moves every resulting *.pem into SSLDIR.

set -o errexit
set -o pipefail

usage()
{
    cat << EOF
Create self signed certificates

Usage : $(basename "$0") [-d SSLDIR]
      -h | --help         : Show this message
      -d | --ssldir       : Directory where the certificates will be located

      Environmental variables HOSTS and CN should be set to generate keys
      for each host.
EOF
}

# Options parsing
while (($#)); do
    case "$1" in
        -h | --help)   usage;   exit 0;;
        -d | --ssldir)
            # A trailing -d without a value would make 'shift 2' fail and
            # errexit would abort without any message.
            if [ "$#" -lt 2 ]; then
                usage
                echo "ERROR : option $1 requires an argument"
                exit 3
            fi
            SSLDIR="${2}"; shift 2;;
        *)
            usage
            echo "ERROR : Unknown option"
            exit 3
        ;;
    esac
done

if [ -z "${SSLDIR}" ]; then
    echo "ERROR: the directory where the certificates will be located is missing. option -d"
    exit 1
fi

# Create the target directory and resolve it to an absolute path BEFORE
# changing into the temporary directory. Otherwise a relative -d value would
# be resolved inside the temporary directory, which the EXIT trap deletes.
mkdir -p "${SSLDIR}"
SSLDIR=$(cd "${SSLDIR}" && pwd)

tmpdir=$(mktemp -d /tmp/tidb_cacert.XXXXXX)
trap 'rm -rf "${tmpdir}"' EXIT
cd "${tmpdir}"

if [ -e "$SSLDIR/ca-config.json" ]; then
    # Reuse existing CA
    cp "$SSLDIR"/{ca-config.json,ca-csr.json} .
else
    echo "ERROR: ca-config.json and ca-csr.json is missing in $SSLDIR."
    exit 1
fi

# Root CA
if [ -e "$SSLDIR/ca-key.pem" ]; then
    # Reuse existing CA
    cp "$SSLDIR"/{ca.pem,ca-key.pem} .
else
    cfssl gencert -initca ca-csr.json | cfssljson -bare ca - > /dev/null 2>&1
fi

# client cert
if [ ! -e "$SSLDIR/client-key.pem" ]; then
    echo '{"CN":"client","hosts":[""],"key":{"algo":"rsa","size":2048}}' | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=client -hostname="" - | cfssljson -bare client > /dev/null 2>&1
fi

# gen_key_and_cert HOST CN NAME
#   Issue a server certificate for HOST (plus 127.0.0.1) with common name CN
#   and write it as NAME.pem / NAME-key.pem in the current directory.
gen_key_and_cert() {
    local host=$1
    local cn=$2
    local name=$3
    echo "{\"CN\":\"${cn}\",\"hosts\":[\"\"],\"key\":{\"algo\":\"rsa\",\"size\":2048}}" | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=server -hostname="${host},127.0.0.1" - | cfssljson -bare "${name}" > /dev/null 2>&1
}

# Nodes
# $HOSTS is intentionally left unquoted: it is a whitespace-separated list.
if [ -n "$HOSTS" ]; then
    for host in $HOSTS; do
        gen_key_and_cert "${host}" "${CN}" "${CN}-${host}"
    done
fi

# Install certs
mv ./*.pem "${SSLDIR}/"

================================================
FILE: roles/drainer/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/drainer/tasks/binary_deployment.yml
================================================
---

- name: deploy drainer binary
  copy:
    src: "{{ resources_dir }}/bin/drainer"
    dest: "{{ deploy_dir }}/bin/"
    mode: "0755"

# Renders run_drainer_binary.sh.j2 to scripts/run_drainer.sh.
- name: create run script
  template:
    src: "{{ item }}_drainer_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_drainer.sh"
    mode: "0755"
    backup: true
  with_items:
    - run
  vars:
    role_status_dir: status/drainer

# process_supervision selects which <name>_deployment.yml is included.
- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/drainer/tasks/check_certs.yml
================================================
---

# List the *.pem files (with checksums) already present in cert_dir on the
# control machine.
- name: "Check_certs | check if the certs have already been generated on control machine"
  find:
    paths: "{{ cert_dir }}"
    patterns: "*.pem"
    get_checksum: true
  delegate_to: localhost
  register: cert_control_node
  run_once: true

- debug:
    var: cert_control_node

- name: "Check_certs | Set default value for 'sync_certs', 'gen_certs' to false"
  set_fact:
    sync_certs: false
    gen_certs: false

- set_fact:
    drainer_host: "{{
hostvars[inventory_hostname].ansible_host | default(inventory_hostname) }}"

# Stat the CA certificate, the node key and the node certificate on the target.
- name: "Check certs | check if a cert already exists on node"
  stat:
    path: "{{ drainer_cert_dir }}/{{ item }}"
  register: cert_drainer_node
  with_items:
    - ca.pem
    - drainer-server-{{ drainer_host }}-key.pem
    - drainer-server-{{ drainer_host }}.pem

- debug:
    var: cert_drainer_node

# gen_certs becomes true as soon as ca.pem or any drainer key file is missing
# from the listing taken on the control machine.
- name: "Check_certs | Set 'gen_certs' to true"
  set_fact:
    gen_certs: true
  when: not item in cert_control_node.files|map(attribute='path') | list
  delegate_to: localhost
  run_once: true
  with_items: >-
    ['{{cert_dir}}/ca.pem',
    {% set all_drainer_hosts = groups['drainer_servers']|unique|sort %}
    {% for host in all_drainer_hosts %}
    {% set drainer_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
    '{{cert_dir}}/drainer-server-{{ drainer_ip }}-key.pem'
    {% if not loop.last %}{{','}}{% endif %}
    {% endfor %}]

- debug:
    var: gen_certs

# Builds a mapping host -> True/False: True when that host's key file is not
# in the control-machine listing.
- name: "Check_certs | Set 'gen_node_certs' to true"
  set_fact:
    gen_node_certs: |-
      {
      {% set all_drainer_hosts = groups['drainer_servers']|unique|sort -%}
      {% set existing_certs = cert_control_node.files|map(attribute='path')|list|sort %}
      {% for host in all_drainer_hosts -%}
        {% set drainer_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
        {% set host_cert = "%s/drainer-server-%s-key.pem"|format(cert_dir, drainer_ip) %}
        {% if host_cert in existing_certs -%}
        "{{ host }}": False,
        {% else -%}
        "{{ host }}": True,
        {% endif -%}
      {% endfor %}
      }
  run_once: true

- debug:
    var: gen_node_certs

- name: "Check_certs | Set drainer_cert_key"
  set_fact:
    drainer_cert_key_path: "{{ cert_dir }}/drainer-server-{{ hostvars[inventory_hostname].drainer_host }}-key.pem"

- debug:
    var: drainer_cert_key_path

# Sync when the node cert must be generated, when ca.pem or the node key is
# absent on the target, or when the key checksum on the target differs from
# the one on the control machine.
- name: "Check_certs | Set 'sync_certs' to true"
  set_fact:
    sync_certs: true
  when:
    gen_node_certs[inventory_hostname] or
    (not cert_drainer_node.results[0].stat.exists|default(False)) or
    (not cert_drainer_node.results[1].stat.exists|default(False)) or
    (cert_drainer_node.results[1].stat.checksum|default('') != cert_control_node.files|selectattr("path","equalto",drainer_cert_key_path)|map(attribute="checksum")|first|default(''))

- debug:
    var: sync_certs

================================================
FILE: roles/drainer/tasks/gen_certs.yml
================================================
---

# Both tasks run once, on the control machine, and only when gen_certs is true.
- name: Gen_certs | copy certs generation script
  copy:
    src: "make-ssl.sh"
    dest: "{{ script_dir }}/make-ssl.sh"
    mode: "0700"
  run_once: true
  delegate_to: localhost
  when: gen_certs|default(false)

# HOSTS lists the address of every drainer host whose gen_node_certs entry is
# true (or missing); binary_dir is appended to PATH for the script.
- name: Gen_certs | run cert generation script
  command: "{{ script_dir }}/make-ssl.sh -d {{ cert_dir }}"
  environment:
    - HOSTS: "{% for h in groups['drainer_servers'] %}
        {% if gen_node_certs[h]|default(true) %}
        {{ hostvars[h].ansible_host | default(hostvars[h].inventory_hostname) }}
        {% endif %}
        {% endfor %}"
    - PATH: "{{ ansible_env.PATH }}:{{ binary_dir }}"
    - CN: "drainer-server"
  run_once: true
  delegate_to: localhost
  when: gen_certs|default(false)

================================================
FILE: roles/drainer/tasks/install_certs.yml
================================================
---

- name: "Deploy_certs | Make sure the certificate directory exits"
  file:
    path: "{{ drainer_cert_dir }}"
    state: directory
    mode: "0700"

# Copy the CA certificate plus this node's key and certificate, only when
# sync_certs is true.
- name: "Deploy_certs | Deploy certificates"
  copy:
    src: "{{ cert_dir }}/{{ item }}"
    dest: "{{ drainer_cert_dir }}/{{ item }}"
    mode: "0600"
    backup: true
  with_items:
    - ca.pem
    - drainer-server-{{ drainer_host }}-key.pem
    - drainer-server-{{ drainer_host }}.pem
  when: sync_certs|default(false)

================================================
FILE: roles/drainer/tasks/main.yml
================================================
---
# tasks file for drainer

# initial_commit_ts must be defined and non-empty.
- name: Preflight check - ensure initial_commit_ts variable is set
  fail:
    msg: "Please set initial_commit_ts variable for drainer server in inventory.ini."
  when: initial_commit_ts is undefined or initial_commit_ts == ""

# A per-host config file must exist on the control machine.
- name: "check if the customized config file `{{ playbook_dir }}/conf/{{ inventory_hostname }}_drainer.toml` existed"
  delegate_to: localhost
  stat:
    path: "{{ playbook_dir }}/conf/{{ inventory_hostname }}_drainer.toml"
  register: drainer_customized_conf_st

- name: "Preflight check - ensure that the customized config file `{{ playbook_dir }}/conf/{{ inventory_hostname }}_drainer.toml` exists"
  delegate_to: localhost
  fail:
    msg: 'You need to create customized config file `{{ playbook_dir }}/conf/{{ inventory_hostname }}_drainer.toml`.'
  when: not drainer_customized_conf_st.stat.exists

- name: create deploy directories
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ drainer_data_dir }}"
    - "{{ drainer_log_dir }}"
    - "{{ status_dir }}"

# - include_tasks: check_certs.yml
#   when: enable_tls|default(false)
#
# - include_tasks: gen_certs.yml
#   when: enable_tls|default(false)
#
# - include_tasks: install_certs.yml
#   when: enable_tls|default(false)

# Install the per-host config; when copy made a backup of the previous file,
# the next task moves that backup into backup_dir.
- name: create configuration file
  copy:
    src: "{{ playbook_dir }}/conf/{{ inventory_hostname }}_drainer.toml"
    dest: "{{ deploy_dir }}/conf/drainer.toml"
    mode: "0644"
    backup: true
  register: drainer_conf_st

- name: backup conf file
  command: mv "{{ drainer_conf_st.backup_file }}" "{{ backup_dir }}"
  when: drainer_conf_st.changed and drainer_conf_st.backup_file is defined

- include_tasks: "binary_deployment.yml"

# Prepend "<drainer_port>/tcp" to the firewalld_ports list.
- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [drainer_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/drainer/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: drainer
    service_name: drainer-{{ drainer_port }}

================================================
FILE: roles/drainer/tasks/systemd_deployment.yml
================================================
---

# Hand the drainer service over to the shared `systemd` role.
- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: drainer
    service_name: drainer-{{ drainer_port }}
    disable_send_sigkill: true

================================================
FILE: roles/drainer/templates/run_drainer_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

DEPLOY_DIR={{ deploy_dir }}

cd "${DEPLOY_DIR}" || exit 1

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

{% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%}

{# This template assigns pd_scheme only when TLS is enabled. The PD URL below
   therefore falls back to 'http' when no value is defined, while a pd_scheme
   supplied from outside the template is still honoured. -#}
{% if enable_tls|default(false) %}
  {% set pd_scheme = 'https' -%}
{% endif %}

{# Collect one "<scheme>://<ip>:<client port>" URL per host in pd_servers. -#}
{% set all_pd = [] -%}
{% set pd_hosts = groups.pd_servers %}
{% for host in pd_hosts -%}
  {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set pd_port = hostvars[host].pd_client_port -%}
  {% set _ = all_pd.append("%s://%s:%s" % (pd_scheme | default('http'), pd_ip, pd_port)) -%}
{% endfor -%}

exec bin/drainer \
    --addr="{{ my_ip }}:{{ drainer_port }}" \
    --pd-urls="{{ all_pd | join(',') }}" \
    --data-dir="{{ drainer_data_dir }}" \
    --log-file="{{ drainer_log_dir }}/{{ drainer_log_filename }}" \
    --config=conf/drainer.toml \
    --initial-commit-ts="{{ initial_commit_ts }}" 2>> "{{ drainer_log_dir }}/{{ drainer_stderr_filename }}"

================================================
FILE: roles/drainer/vars/default.yml
================================================
---
# default configuration file for drainer in yaml format

security:
  # Path of file that contains list of trusted SSL CAs for connection with cluster components.
  ssl-ca: ""
  # Path of file that contains X509 certificate in PEM format for connection with cluster components.
  ssl-cert: ""
  # Path of file that contains X509 key in PEM format for connection with cluster components.
  ssl-key: ""

================================================
FILE: roles/firewalld/defaults/main.yml
================================================
---

# Other roles prepend "<port>/tcp" entries to this list via set_fact.
firewalld_ports: []

================================================
FILE: roles/firewalld/handlers/main.yml
================================================
---
# Handlers for firewalld

- name: reload firewalld
  service: name=firewalld state=reloaded

================================================
FILE: roles/firewalld/tasks/main.yml
================================================
---
# Tasks to configure firewalld rules

- name: All enabled ports
  debug: var=firewalld_ports

# need root
# `|| exit 0` keeps the command successful when firewalld is not running;
# only the captured stdout is inspected by the next task.
- name: determine if firewalld is running
  command: bash -c 'firewall-cmd --state || exit 0'
  register: firewalld_running
  ignore_errors: true
  changed_when: false

- name: enable firewalld ports
  # shell: firewall-cmd --zone=public --add-port={{ item }} --permanent
  firewalld:
    port: '{{ item }}'
    permanent: true
    state: enabled
  when: firewalld_running.stdout.strip() == "running"
  with_items: "{{ firewalld_ports }}"
  register: firewalld

# Reload only when the previous task reported a change.
- name: reload firewalld
  # shell: firewall-cmd --reload
  service: name=firewalld state=reloaded
  when: firewalld.changed is defined and firewalld.changed

================================================
FILE: roles/grafana/defaults/main.yml
================================================
---

grafana_log_dir: "{{ deploy_dir }}/log"
grafana_log_filename: "grafana.log"

grafana_data_dir: "{{ deploy_dir }}/data.grafana"
grafana_dashboards_dir: "{{ deploy_dir }}/opt/grafana/dashboards"
grafana_plugins_dir: "{{ deploy_dir }}/opt/grafana/plugins"

# NOTE(review): default credentials; they are rendered into grafana.ini as-is.
grafana_admin_user: "admin"
grafana_admin_password: "admin"

grafana_api_keys:
  - name: "grafana_apikey"
    role: "Admin"

# When true, meta/main.yml skips the common_dir dependency and
# tasks/main.yml skips tasks.yml.
grafana_exec_vars_only: false

grafana_version: 6.1.6

# docker settings
grafana_tag: 6.1.6

================================================
FILE: roles/grafana/meta/main.yml
================================================
---

dependencies:
  - { role: 'common_dir', when: 'grafana_exec_vars_only == false' }

================================================
FILE: roles/grafana/tasks/binary_deployment.yml
================================================
---

- name: create binary deploy directories (1/2)
  file: path="{{ item }}" state=directory mode=0755
  with_items:
    - "{{ deploy_dir }}/opt"

- name: deploy grafana binary
  unarchive: >
    src={{ downloads_dir }}/grafana-{{ grafana_version }}.tar.gz
    dest={{ deploy_dir }}/opt/

# Upgrade path: `removes=` makes this run only when a grafana-server binary is
# already present; the old directory is dropped before the new one is moved in.
- name: rename grafana deploy dir
  shell: >
    warn=no
    removes="{{ deploy_dir }}/opt/grafana/bin/grafana-server"
    rm -rf {{ deploy_dir }}/opt/grafana && mv {{ deploy_dir }}/opt/grafana-{{ grafana_version }} "{{ deploy_dir }}/opt/grafana"

# Fresh install path: `creates=` makes this run only when no grafana-server
# binary exists yet, i.e. when the task above did not run.
- name: rename grafana deploy dir
  shell: >
    creates="{{ deploy_dir }}/opt/grafana/bin/grafana-server"
    mv {{ deploy_dir }}/opt/grafana-{{ grafana_version }} "{{ deploy_dir }}/opt/grafana"

- name: create binary deploy directories (2/2)
  file: path="{{ item }}" state=directory mode=0755
  with_items:
    - "{{ grafana_dashboards_dir }}"
    - "{{ grafana_plugins_dir }}"

- name: create grafana configuration file
  template: >
    src=grafana.ini.j2
    dest={{ deploy_dir }}/opt/grafana/conf/grafana.ini
    mode=0644

# Renders run_<role>_binary.sh.j2 to scripts/run_<role>.sh.
- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/grafana/tasks/docker_deployment.yml
================================================
---

- name: deploy grafana image
  copy: src="{{ downloads_dir }}/grafana.tar" dest="{{ deploy_dir }}/images" mode=0755

# Renders run_<role>_docker.sh.j2 to scripts/run_<role>.sh.
- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_docker.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run

# NOTE(review): the archive is copied to "{{ deploy_dir }}/images" above but
# loaded from images_dir here - confirm both point at the same directory.
- name: load docker image from archive
  docker_image:
    state: present
    force: yes
    name: grafana/grafana
    tag: "{{ grafana_tag }}"
    load_path: "{{ images_dir }}/grafana.tar"

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/grafana/tasks/main.yml
================================================
---

- include_tasks: tasks.yml
  when: "grafana_exec_vars_only == false"

================================================
FILE: roles/grafana/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: grafana
    service_name: grafana-{{ grafana_port }}

================================================
FILE: roles/grafana/tasks/systemd_deployment.yml
================================================
---

- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: grafana
    service_name: grafana-{{ grafana_port }}

================================================
FILE: roles/grafana/tasks/tasks.yml
================================================
---

- name: create common deploy directories
  file: path="{{ item }}" state=directory mode=0755
  with_items:
    - "{{ grafana_log_dir }}"
    - "{{ grafana_data_dir }}"

- name: push data source file
  template: src=data_source.json.j2 dest={{ grafana_data_dir }}/data_source.json mode=0644

# Dispatches to binary_deployment.yml or docker_deployment.yml.
- include_tasks: "{{ deployment_method }}_deployment.yml"

# Prepend the grafana port to the list consumed by the firewalld role.
- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [grafana_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/grafana/templates/data_source.json.j2
================================================
{# When there are as many monitoring servers as grafana servers, pair them by
   position: this host's index in grafana_servers selects the monitoring
   server. Otherwise every grafana host uses the first monitoring server. -#}
{% if groups.monitoring_servers | length == groups.grafana_servers | length -%}
  {% set index = [] -%}
  {% for host in groups.grafana_servers -%}
    {% if inventory_hostname == hostvars[host].inventory_hostname -%}
      {% set _ = index.append(loop.index0) -%}
    {% endif -%}
  {% endfor -%}
  {% set metric_host = hostvars[groups.monitoring_servers[index.0]].ansible_host | default(hostvars[groups.monitoring_servers[index.0]].inventory_hostname) -%}
  {% set metric_port = hostvars[groups.monitoring_servers[index.0]].prometheus_port -%}
{% else -%}
  {% set metric_host = hostvars[groups.monitoring_servers[0]].ansible_host | default(hostvars[groups.monitoring_servers[0]].inventory_hostname) -%}
  {% set metric_port = hostvars[groups.monitoring_servers[0]].prometheus_port -%}
{% endif -%}
{
  "name":"{{ cluster_name }}",
  "type":"prometheus",
  "access":"proxy",
  "url":"http://{{ metric_host }}:{{ metric_port }}/",
  "basicAuth":false
}

================================================
FILE: roles/grafana/templates/grafana.ini.j2
================================================
##################### Grafana Configuration Example #####################
#
# Everything has defaults so you only need to uncomment things you want to
# change

# possible values : production, development
; app_mode = production

# instance name, defaults to HOSTNAME environment variable value or hostname if HOSTNAME var is empty
; instance_name = ${HOSTNAME}

#################################### Paths ####################################
[paths]
# Path to where grafana can store temp files, sessions, and the sqlite3 db (if that is used)
#
data = {{ grafana_data_dir }}
#
# Directory where grafana can store logs
#
logs = {{ grafana_log_dir }}
#
# Directory where grafana will automatically scan and look for plugins
#
plugins = {{ grafana_plugins_dir }}

#
#################################### Server ####################################
[server]
# Protocol (http or https)
;protocol = http

# The ip address to bind to, empty will bind to all interfaces
;http_addr =

# The http port to use
http_port = {{ grafana_port }}

# The public facing domain name used to access grafana from a browser
# (falls back to the inventory name when the host has no ansible_host)
domain = {{ ansible_host | default(inventory_hostname) }}

# Redirect to correct domain if host header does not match domain
# Prevents DNS rebinding attacks
;enforce_domain = false

# The full public facing url
;root_url = %(protocol)s://%(domain)s:%(http_port)s/

# Log web requests
;router_logging = false

# the path relative working path
;static_root_path = public

# enable gzip
;enable_gzip = false

# https certs & key file
;cert_file =
;cert_key =

#################################### Database ####################################
[database]
# Either "mysql", "postgres" or "sqlite3", it's your choice
;type = sqlite3
;host = 127.0.0.1:3306
;name = grafana
;user = root
;password =

# For "postgres" only, either "disable", "require" or "verify-full"
;ssl_mode = disable

# For "sqlite3" only, path relative to data_path setting
;path = grafana.db

#################################### Session ####################################
[session]
# Either "memory", "file", "redis", "mysql", "postgres", default is "file"
;provider = file

# Provider config options
# memory: not have any config yet
# file: session dir path, is relative to grafana data_path
# redis: config like redis server e.g. `addr=127.0.0.1:6379,pool_size=100,db=grafana`
# mysql: go-sql-driver/mysql dsn config string, e.g. `user:password@tcp(127.0.0.1:3306)/database_name`
# postgres: user=a password=b host=localhost port=5432 dbname=c sslmode=disable
;provider_config = sessions

# Session cookie name
;cookie_name = grafana_sess

# If you use session in https only, default is false
;cookie_secure = false

# Session life time, default is 86400
;session_life_time = 86400

#################################### Analytics ####################################
[analytics]
# Server reporting, sends usage counters to stats.grafana.org every 24 hours.
# No ip addresses are being tracked, only simple counters to track
# running instances, dashboard and error counts. It is very helpful to us.
# Change this option to false to disable reporting.
;reporting_enabled = true # Set to false to disable all checks to https://grafana.net # for new versions (grafana itself and plugins), check is used # in some UI views to notify that grafana or plugin update exists # This option does not cause any auto updates, nor send any information # only a GET request to http://grafana.net to get latest versions check_for_updates = true # Google Analytics universal tracking code, only enabled if you specify an id here ;google_analytics_ua_id = #################################### Security #################################### [security] # default admin user, created on startup admin_user = {{ grafana_admin_user }} # default admin password, can be changed before first start of grafana, or in profile settings admin_password = {{ grafana_admin_password }} # used for signing ;secret_key = SW2YcwTIb9zpOOhoPsMm # Auto-login remember days ;login_remember_days = 7 ;cookie_username = grafana_user ;cookie_remember_name = grafana_remember # disable gravatar profile images ;disable_gravatar = false # data source proxy whitelist (ip_or_domain:port separated by spaces) ;data_source_proxy_whitelist = [snapshots] # snapshot sharing options ;external_enabled = true ;external_snapshot_url = https://snapshots-origin.raintank.io ;external_snapshot_name = Publish to snapshot.raintank.io #################################### Users #################################### [users] # disable user signup / registration ;allow_sign_up = true # Allow non admin users to create organizations ;allow_org_create = true # Set to true to automatically assign new users to the default organization (id 1) ;auto_assign_org = true # Default role new users will be automatically assigned (if disabled above is set to true) ;auto_assign_org_role = Viewer # Background text for the user field on the login page ;login_hint = email or username # Default UI theme ("dark" or "light") ;default_theme = dark #################################### Anonymous Auth ##########################
[auth.anonymous]
# enable anonymous access
;enabled = false

# specify organization name that should be used for unauthenticated users
;org_name = Main Org.

# specify role for unauthenticated users
;org_role = Viewer

#################################### Basic Auth ##########################
[auth.basic]
;enabled = true

#################################### Auth LDAP ##########################
[auth.ldap]
;enabled = false
;config_file = /etc/grafana/ldap.toml

#################################### SMTP / Emailing ##########################
[smtp]
;enabled = false
;host = localhost:25
;user =
;password =
;cert_file =
;key_file =
;skip_verify = false
;from_address = admin@grafana.localhost

[emails]
;welcome_email_on_sign_up = false

#################################### Logging ##########################
[log]
# Either "console", "file", "syslog". Default is console and file
# Use space to separate multiple modes, e.g. "console file"
mode = file

# Either "trace", "debug", "info", "warn", "error", "critical", default is "info"
;level = info

# For "console" mode only
[log.console]
;level =

# log line format, valid options are text, console and json
;format = console

# For "file" mode only
[log.file]
level = info

# log line format, valid options are text, console and json
format = text

# This enables automated log rotate(switch of following options), default is true
;log_rotate = true

# Max line number of single file, default is 1000000
;max_lines = 1000000

# Max size shift of single file, default is 28 means 1 << 28, 256MB
;max_size_shift = 28

# Segment log daily, default is true
;daily_rotate = true

# Expired days of log file(delete after max days), default is 7
;max_days = 7

[log.syslog]
;level =

# log line format, valid options are text, console and json
;format = text

# Syslog network type and address. This can be udp, tcp, or unix. If left blank, the default unix endpoints will be used.
;network =
;address =

# Syslog facility. user, daemon and local0 through local7 are valid.
;facility =

# Syslog tag. By default, the process' argv[0] is used.
;tag =

#################################### AMQP Event Publisher ##########################
[event_publisher]
;enabled = false
;rabbitmq_url = amqp://localhost/
;exchange = grafana_events

;#################################### Dashboard JSON files ##########################
[dashboards.json]
enabled = false
path = {{ grafana_dashboards_dir }}

#################################### Internal Grafana Metrics ##########################
# Metrics available at HTTP API Url /api/metrics
[metrics]
# Disable / Enable internal metrics
;enabled = true

# Publish interval
;interval_seconds = 10

# Send internal metrics to Graphite
; [metrics.graphite]
; address = localhost:2003
; prefix = prod.grafana.%(instance_name)s.

#################################### Grafana.net ##########################
# URL used to import dashboards directly from Grafana.net
[grafana_net]
url = https://grafana.net

================================================
FILE: roles/grafana/templates/run_grafana_binary.sh.j2
================================================
#!/bin/bash
# Launch script for the binary grafana deployment, rendered by Ansible.
# Abort on the first failing command and raise the open-file limit.
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!
DEPLOY_DIR={{ deploy_dir }}

# The binary path below is relative, so the script must run from DEPLOY_DIR.
cd "${DEPLOY_DIR}" || exit 1

# Replace this shell with grafana-server, with LANG set for that process only.
LANG=en_US.UTF-8 \
exec opt/grafana/bin/grafana-server \
    --homepath="{{ deploy_dir }}/opt/grafana" \
    --config="{{ deploy_dir }}/opt/grafana/conf/grafana.ini"

================================================
FILE: roles/grafana/templates/run_grafana_docker.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!
DEPLOY_DIR={{ deploy_dir }}

cd "${DEPLOY_DIR}" || exit 1

# Publish container port 3000 on the host's grafana_port and share the host
# timezone file read-only.
exec docker run -p {{ grafana_port }}:3000 \
    -v /etc/localtime:/etc/localtime:ro \
    --name="grafana-{{ grafana_port }}" \
    grafana/grafana:{{ grafana_tag }}

================================================
FILE: roles/kafka_exporter/defaults/main.yml
================================================
---
# default configuration for kafka_exporter

kafka_exporter_log_level: "info"
kafka_exporter_log_dir: "{{ deploy_dir }}/log"
kafka_exporter_log_filename: "kafka_exporter.log"

kafka_version: "1.0.0"

================================================
FILE: roles/kafka_exporter/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/kafka_exporter/tasks/binary_deployment.yml
================================================
---

- name: create deploy directories
  file: path="{{ item }}" state=directory mode=0755
  with_items:
    - "{{ kafka_exporter_log_dir }}"

- name: deploy kafka_exporter binary
  copy: src="{{ resources_dir }}/bin/kafka_exporter" dest="{{ deploy_dir }}/bin/" mode=0755

# Renders run_<role>_binary.sh.j2 to scripts/run_<role>.sh.
- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/kafka_exporter/tasks/main.yml
================================================
---

- include_tasks: binary_deployment.yml

# Prepend the exporter port to the list consumed by the firewalld role.
- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [kafka_exporter_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/kafka_exporter/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: kafka_exporter
    service_name: kafka_exporter-{{ kafka_exporter_port }}

================================================
FILE: roles/kafka_exporter/tasks/systemd_deployment.yml
================================================
---

- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: kafka_exporter
    service_name: kafka_exporter-{{ kafka_exporter_port }}

================================================
FILE: roles/kafka_exporter/templates/run_kafka_exporter_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

DEPLOY_DIR={{ deploy_dir }}
cd "${DEPLOY_DIR}" || exit 1

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

# Send stdout through tee so it is appended to the log file, then point
# stderr at the same stream.
exec > >(tee -i -a "{{ kafka_exporter_log_dir }}/{{ kafka_exporter_log_filename }}")
exec 2>&1

# One --kafka.server flag is emitted per comma-separated entry in kafka_addrs.
exec bin/kafka_exporter \
    --web.listen-address=":{{ kafka_exporter_port }}" \
    --kafka.version="{{ kafka_version }}" \
{% for kafka_addr in kafka_addrs.split(',') %}
    --kafka.server={{ kafka_addr }} \
{% endfor %}
    --log.level="{{ kafka_exporter_log_level }}"

================================================
FILE: roles/local/tasks/binary_deployment.yml
================================================
---

# The download tasks in this file share one shape: fetch into downloads_dir,
# verify the checksum when the item carries one, and retry up to 4 times
# until get_url reports 'OK' or 'file already exists'.
# NOTE(review): validate_certs is disabled on all of them, so the checksum is
# the only integrity check.
- name: download other binary
  get_url:
    url: "{{ item.url }}"
    dest: "{{ downloads_dir }}/{{ item.name }}-{{ item.version }}.tar.gz"
    checksum: "{{ item.checksum | default(omit) }}"
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  with_items: "{{ third_party_packages }}"
  when:
    - has_outbound_network
    - not under_outbound

# Same download, taken from the alternative package list when under_outbound
# is set.
- name: download other binary under outbound
  get_url:
    url: "{{ item.url }}"
    dest: "{{ downloads_dir }}/{{ item.name }}-{{ item.version }}.tar.gz"
    checksum: "{{ item.checksum | default(omit) }}"
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  with_items: "{{ third_party_packages_under_outbound }}"
  when:
    - has_outbound_network
    - under_outbound

# TiSpark items are saved under their own name, without a version suffix.
- name: download TiSpark packages
  get_url:
    url: "{{ item.url }}"
    dest: "{{ downloads_dir }}/{{ item.name }}"
    checksum: "{{ item.checksum | default(omit) }}"
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  with_items: "{{ tispark_packages }}"
  when:
    - has_outbound_network
    - not deploy_without_tidb|default(false)

# TiFlash is only fetched on amd64.
- name: download TiFlash packages
  get_url:
    url: "{{ item.url }}"
    dest: "{{ downloads_dir }}/{{ item.name }}-{{ item.version }}.tar.gz"
    checksum: "{{ item.checksum | default(omit) }}"
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  with_items: "{{ tiflash_packages }}"
  when:
    - has_outbound_network
    - cpu_architecture == 'amd64'

# Unpack the downloaded archives inside downloads_dir.
- name: unarchive third party binary
  shell: ls -1 {{ item.name }}-{{ item.version }}.tar.gz | xargs -n1 tar xzf
  args:
    chdir: "{{ downloads_dir }}"
    warn: no
  with_items: "{{ third_party_packages }}"

- name: unarchive tispark
  shell: tar xzf tispark-latest.tar.gz
  args:
    chdir: "{{ downloads_dir }}"
    warn: no
  when: not deploy_without_tidb|default(false)

- name: unarchive tispark-sample-data
  shell: ls -1 tispark-sample-data.tar.gz | xargs -n1 tar xzf
  args:
    chdir: "{{ downloads_dir }}"
    warn: no
  when: not deploy_without_tidb|default(false)

- name: unarchive tiflash
  shell: ls -1 {{ item.name }}-{{ item.version }}.tar.gz | xargs tar xzf
  args:
    chdir: "{{ downloads_dir }}"
    warn: no
  with_items: "{{ tiflash_packages }}"
  when:
    - cpu_architecture == 'amd64'

# Copy the unpacked artifacts into resources_dir/bin.
- name: cp monitoring binary
  shell: >
    cp -v {{ downloads_dir }}/{{ item }}-*/{{ item }} "{{ resources_dir }}/bin/{{ item }}"
  with_items:
    - alertmanager
    - prometheus
    - node_exporter
    - pushgateway
    - blackbox_exporter

- name: cp tispark
  shell: >
    cp -v {{ downloads_dir }}/assembly/target/tispark-assembly-*-SNAPSHOT.jar "{{ resources_dir }}/bin/tispark-assembly-SNAPSHOT.jar"
  when: not deploy_without_tidb|default(false)

- name: cp tispark-sample-data
  shell: >
    cp -rfv {{ downloads_dir }}/tispark-sample-data "{{ resources_dir }}/bin/"
  when: not deploy_without_tidb|default(false)

- name: cp tiflash directory
  shell: >
    cp -rfv {{ downloads_dir }}/{{ item.name }}-{{ item.version }}-linux-amd64 "{{ resources_dir }}/bin/tiflash"
  with_items: "{{ tiflash_packages }}"
  when:
    - cpu_architecture == 'amd64'

================================================
FILE: roles/local/tasks/docker_deployment.yml
================================================
---

# Each image is written to downloads_dir as <service>.tar via archive_path.
- name: download tidb docker images
  docker_image:
    name: "{{ item.name }}"
    tag: "{{ item.tag }}"
    archive_path: "{{ downloads_dir }}/{{ item.service }}.tar"
  with_items: "{{ tidb_images }}"
  when: has_outbound_network

- name: download third party docker images
  docker_image:
    name: "{{ item.name }}"
    tag: "{{ item.tag }}"
    archive_path: "{{ downloads_dir }}/{{ item.service }}.tar"
  with_items: "{{ third_party_images }}"
  when: has_outbound_network

================================================
FILE: roles/local/tasks/main.yml
================================================
---

- name: Stop if ansible version is too low, make sure that the Ansible version is Ansible 2.4.2 or later, otherwise a compatibility issue occurs.
  assert:
    that:
      - ansible_version.full|version_compare('2.4.2', '>=')
      # - ansible_version.full is version('2.5.0', '>=')

- name: create downloads and resources directories
  file: path="{{ item }}" state=directory mode=0755
  with_items:
    - "{{ downloads_dir }}"
    - "{{ resources_dir }}"
    - "{{ resources_dir }}/bin"

- name: create cert directory
  file: path="{{ cert_dir }}" state=directory mode=0755
  when: enable_tls|default(false)

# Render the package lists into conf/ and load them back as variables.
- name: create packages.yml
  template: src=common_packages.yml.j2 dest={{ playbook_dir }}/conf/common_packages.yml

- name: create specific deployment method packages.yml
  template: src={{ deployment_method }}_packages.yml.j2 dest={{ playbook_dir }}/conf/{{ deployment_method }}_packages.yml

- include_vars: file={{ playbook_dir }}/conf/common_packages.yml

- include_vars: file={{ playbook_dir }}/conf/{{ deployment_method }}_packages.yml

# preflight checks
# The probe prints curl's exit status, so stdout is '0' when the site answers.
- name: detect outbound network[1]
  shell: >
    warn=no
    curl -s --connect-timeout 10 www.baidu.com 2>/dev/null >/dev/null; echo $?
  changed_when: false
  register: outbound_network_st

- name: set outbound network fact[1]
  set_fact: has_outbound_network={{ outbound_network_st.stdout.strip() == '0' }}

- fail:
    msg: "The Control Machine must have access to the Internet in order to download TiDB and related packages."
  when: not has_outbound_network

# Second probe: under_outbound becomes true when google.com is NOT reachable.
- name: detect outbound network[2]
  shell: >
    warn=no
    curl -s --connect-timeout 10 google.com 2>/dev/null >/dev/null; echo $?
  changed_when: false
  register: outbound_st

- name: set outbound network fact[2]
  set_fact: under_outbound={{ outbound_st.stdout.strip() != '0' }}

# do actual downloading
# Each download below retries up to 4 times until get_url reports 'OK' or
# 'file already exists'.
# NOTE(review): validate_certs is disabled on all of them.
- name: download tidb binary
  get_url:
    url: "{{ item.url }}"
    dest: "{{ downloads_dir }}/{{ item.name }}-{{ item.version }}.tar.gz"
    checksum: "{{ item.checksum | default(omit) }}"
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  with_items: "{{ tidb_packages }}"
  when: has_outbound_network

- name: download tidb toolkit binary
  get_url:
    url: "{{ item.url }}"
    dest: "{{ downloads_dir }}/{{ item.name }}-{{ item.version }}.tar.gz"
    checksum: "{{ item.checksum | default(omit) }}"
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  with_items: "{{ tidb_toolkit_packages }}"
  when: has_outbound_network

- name: download common binary
  get_url:
    url: "{{ item.url }}"
    dest: "{{ downloads_dir }}/{{ item.name }}-{{ item.version }}.tar.gz"
    checksum: "{{ item.checksum | default(omit) }}"
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  with_items: "{{ common_packages }}"
  when: has_outbound_network

- name: download diagnosis tools
  get_url:
    url: "{{ item.url }}"
    dest: "{{ downloads_dir }}/{{ item.name }}-{{ item.version }}.tar.gz"
    checksum: "{{ item.checksum | default(omit) }}"
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  with_items: "{{ diagnosis_packages }}"
  when: has_outbound_network

# NOTE(review): the two cfssl downloads install executables with neither a
# checksum nor certificate validation.
- name: download cfssl binary
  get_url:
    url: https://pkg.cfssl.org/R1.2/cfssl_linux-amd64
    dest: "{{ resources_dir }}/bin/cfssl"
    mode: 0755
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  when:
    - has_outbound_network
    - enable_tls|default(false)

- name: download cfssljson binary
  get_url:
    url: https://pkg.cfssl.org/R1.2/cfssljson_linux-amd64
    dest: "{{ resources_dir }}/bin/cfssljson"
    mode: 0755
    force: yes
    validate_certs: no
  register: get_url_result
  until: "'OK' in get_url_result.msg or 'file already exists' in get_url_result.msg"
  retries: 4
  delay: "{{ retry_stagger | random + 3 }}"
  when:
    - has_outbound_network
    - enable_tls|default(false)

# Dispatches to binary_deployment.yml or docker_deployment.yml.
- include_tasks: "{{ deployment_method }}_deployment.yml"

# Unpack the downloaded archives inside downloads_dir.
- name: unarchive tidb binary
  shell: ls -1 {{ item.name }}-{{ item.version }}.tar.gz | xargs -n1 tar xzf
  args:
    chdir: "{{ downloads_dir }}"
    warn: no
  with_items: "{{ tidb_packages }}"

- name: unarchive tidb toolkit binary
  shell: ls -1 {{ item.name }}-{{ item.version }}.tar.gz | xargs -n1 tar xzf
  args:
    chdir: "{{ downloads_dir }}"
    warn: no
  with_items: "{{ tidb_toolkit_packages }}"

- name: unarchive common binary
  shell: ls -1 {{ item.name }}-{{ item.version }}.tar.gz | xargs -n1 tar xzf
  args:
    chdir: "{{ downloads_dir }}"
    warn: no
  with_items: "{{ common_packages }}"

# Copy the unpacked binaries into resources_dir/bin.
- name: cp tidb binary
  shell: >
    cp -v {{ downloads_dir }}/{{ item.name }}-{{ item.version}}*/bin/* "{{ resources_dir }}/bin/"
  with_items: "{{ tidb_packages }}"

- name: cp tidb toolkit binary
  shell: >
    cp -v {{ downloads_dir }}/{{ item.name }}-{{ item.version}}*/bin/* "{{ resources_dir }}/bin/"
  with_items: "{{ tidb_toolkit_packages }}"

- name: cp fio binary
  shell: >
    cp -v {{ downloads_dir }}/fio-*/{{ item }} "{{ resources_dir }}/bin/"
  with_items:
    - fio

- name: cp kafka_exporter binary
  shell: >
    cp -v {{ downloads_dir }}/kafka_exporter-*/{{ item }} "{{ resources_dir }}/bin/"
  with_items:
    - kafka_exporter

- name: cp daemontools binary
  shell: >
    cp -v {{ downloads_dir }}/daemontools-*/bin/{{ item }} "{{ resources_dir }}/bin/"
  with_items:
    - supervise
    - svstat
    - svc
  when: process_supervision == 'supervise'

# Despite the task name this is a move: the versioned tarball is renamed.
- name: cp tidb-insight tarball
  shell: mv {{ downloads_dir }}/tidb-insight-v*.tar.gz {{ downloads_dir }}/tidb-insight.tar.gz

# Remove every directory directly under downloads_dir; plain files stay.
- name: clean up download dir
  shell: >
    cd "{{ downloads_dir }}" && find . -mindepth 1 -maxdepth 1 -type d -exec rm -rf {} \;

================================================
FILE: roles/local/templates/binary_packages.yml.j2
================================================
---

{% if cpu_architecture == 'amd64' -%}
third_party_packages:
  - name: prometheus
    version: 2.8.1
    url: "https://github.com/prometheus/prometheus/releases/download/v2.8.1/prometheus-2.8.1.linux-amd64.tar.gz"
    checksum: "sha256:8acf79c9f1bb79c58df557ff3f4824fd1c1220940d70f63aedf60f2e36b8d8d7"
  - name: alertmanager
    version: 0.17.0
    url: "https://github.com/prometheus/alertmanager/releases/download/v0.17.0/alertmanager-0.17.0.linux-amd64.tar.gz"
    checksum: "sha256:7c8d2cfeb021c80881ae9904d959131091b8785b6fda9800f84ddef148fe0a4f"
  - name: node_exporter
    version: 0.17.0
    url: "https://github.com/prometheus/node_exporter/releases/download/v0.17.0/node_exporter-0.17.0.linux-amd64.tar.gz"
    checksum: "sha256:d2e00d805dbfdc67e7291ce2d2ff151f758dd7401dd993411ff3818d0e231489"
  - name: blackbox_exporter
    version: 0.12.0
    url: "https://github.com/prometheus/blackbox_exporter/releases/download/v0.12.0/blackbox_exporter-0.12.0.linux-amd64.tar.gz"
    checksum: "sha256:c5d8ba7d91101524fa7c3f5e17256d467d44d5e1d243e251fd795e0ab4a83605"
  - name: pushgateway
    version: 0.7.0
    url: "https://github.com/prometheus/pushgateway/releases/download/v0.7.0/pushgateway-0.7.0.linux-amd64.tar.gz"
    checksum: "sha256:902849c94dc275f157899f7fee1b2f23efbd3bbdb6c3a3c42e503f4439f74ed2"
  - name: grafana
    version: 6.1.6
    url: "https://dl.grafana.com/oss/release/grafana-6.1.6.linux-amd64.tar.gz"
    checksum:
"sha256:93c81a495e274024d1dd7dcd033033680a6138d7cfd72e2447dd80eaab7ce2b5" third_party_packages_under_outbound: - name: prometheus version: 2.8.1 url: "https://download.pingcap.org/prometheus-2.8.1.linux-amd64.tar.gz" checksum: "sha256:8acf79c9f1bb79c58df557ff3f4824fd1c1220940d70f63aedf60f2e36b8d8d7" - name: alertmanager version: 0.17.0 url: "http://download.pingcap.org/alertmanager-0.17.0.linux-amd64.tar.gz" checksum: "sha256:7c8d2cfeb021c80881ae9904d959131091b8785b6fda9800f84ddef148fe0a4f" - name: node_exporter version: 0.17.0 url: "http://download.pingcap.org/node_exporter-0.17.0.linux-amd64.tar.gz" checksum: "sha256:d2e00d805dbfdc67e7291ce2d2ff151f758dd7401dd993411ff3818d0e231489" - name: pushgateway version: 0.7.0 url: "http://download.pingcap.org/pushgateway-0.7.0.linux-amd64.tar.gz" checksum: "sha256:902849c94dc275f157899f7fee1b2f23efbd3bbdb6c3a3c42e503f4439f74ed2" - name: grafana version: 6.1.6 url: "https://download.pingcap.org/grafana-6.1.6.linux-amd64.tar.gz" checksum: "sha256:93c81a495e274024d1dd7dcd033033680a6138d7cfd72e2447dd80eaab7ce2b5" - name: blackbox_exporter version: 0.12.0 url: "http://download.pingcap.org/blackbox_exporter-0.12.0.linux-amd64.tar.gz" checksum: "sha256:c5d8ba7d91101524fa7c3f5e17256d467d44d5e1d243e251fd795e0ab4a83605" {% if not deploy_without_tidb|default(false) %} tispark_packages: - name: spark-2.4.3-bin-hadoop2.7.tgz version: 2.4.3 url: http://download.pingcap.org/spark-2.4.3-bin-hadoop2.7.tgz checksum: "sha256:80a4c564ceff0d9aff82b7df610b1d34e777b45042e21e2d41f3e497bb1fa5d8" - name: tispark-latest.tar.gz version: latest url: http://download.pingcap.org/tispark-assembly-latest-linux-amd64.tar.gz - name: tispark-sample-data.tar.gz version: latest url: http://download.pingcap.org/tispark-sample-data.tar.gz checksum: "sha256:bd0368a9d8663a4a8de89e39cc4cc1d91c718faf36d4bc7e1f8482c34d5bb8db" {% endif %} tiflash_packages: - name: tiflash version: {{ tidb_version }} url: http://download.pingcap.org/tiflash-{{ tidb_version 
}}-linux-amd64.tar.gz {% elif cpu_architecture == 'arm64' -%} third_party_packages: - name: prometheus version: 2.8.1 url: "https://github.com/prometheus/prometheus/releases/download/v2.8.1/prometheus-2.8.1.linux-arm64.tar.gz" - name: alertmanager version: 0.17.0 url: "https://github.com/prometheus/alertmanager/releases/download/v0.17.0/alertmanager-0.17.0.linux-arm64.tar.gz" - name: node_exporter version: 0.17.0 url: "https://github.com/prometheus/node_exporter/releases/download/v0.17.0/node_exporter-0.17.0.linux-arm64.tar.gz" - name: blackbox_exporter version: 0.12.0 url: "https://github.com/prometheus/blackbox_exporter/releases/download/v0.12.0/blackbox_exporter-0.12.0.linux-arm64.tar.gz" - name: pushgateway version: 0.7.0 url: "https://github.com/prometheus/pushgateway/releases/download/v0.7.0/pushgateway-0.7.0.linux-arm64.tar.gz" - name: grafana version: 6.1.6 url: "https://dl.grafana.com/oss/release/grafana-6.1.6.linux-arm64.tar.gz" third_party_packages_under_outbound: - name: prometheus version: 2.8.1 url: "https://download.pingcap.org/prometheus-2.8.1.linux-arm64.tar.gz" - name: alertmanager version: 0.17.0 url: "http://download.pingcap.org/alertmanager-0.17.0.linux-arm64.tar.gz" - name: node_exporter version: 0.17.0 url: "http://download.pingcap.org/node_exporter-0.17.0.linux-arm64.tar.gz" - name: pushgateway version: 0.7.0 url: "http://download.pingcap.org/pushgateway-0.7.0.linux-arm64.tar.gz" - name: grafana version: 6.1.6 url: "https://download.pingcap.org/grafana-6.1.6.linux-arm64.tar.gz" - name: blackbox_exporter version: 0.12.0 url: "http://download.pingcap.org/blackbox_exporter-0.12.0.linux-arm64.tar.gz" {% if not deploy_without_tidb|default(false) %} tispark_packages: - name: spark-2.4.3-bin-hadoop2.7.tgz version: 2.4.3 url: http://download.pingcap.org/spark-2.4.3-bin-hadoop2.7.tgz checksum: "sha256:80a4c564ceff0d9aff82b7df610b1d34e777b45042e21e2d41f3e497bb1fa5d8" - name: tispark-latest.tar.gz version: latest url: 
http://download.pingcap.org/tispark-assembly-latest-linux-amd64.tar.gz - name: tispark-sample-data.tar.gz version: latest url: http://download.pingcap.org/tispark-sample-data.tar.gz {% endif %} {% endif -%} ================================================ FILE: roles/local/templates/common_packages.yml.j2 ================================================ --- {% if cpu_architecture == 'amd64' -%} tidb_packages: - name: tidb version: {{ tidb_version }} url: http://download.pingcap.org/tidb-{{ tidb_version }}-linux-amd64.tar.gz tidb_toolkit_packages: - name: tidb-toolkit version: {{ tidb_version }} url: http://download.pingcap.org/tidb-toolkit-{{ tidb_version }}-linux-amd64.tar.gz common_packages: - name: fio version: 3.8 url: "http://download.pingcap.org/fio-3.8.tar.gz" checksum: "sha256:15739abde7e74b59ac59df57f129b14fc5cd59e1e2eca2ce37b41f8c289c3d58" - name: kafka_exporter version: 1.1.0 url: http://download.pingcap.org/kafka_exporter-1.1.0.linux-amd64.tar.gz checksum: "sha256:6431b9b8f65a7d40c0ef1ea0982cb149931e2896224f373412ae5f93c225d72b" {% if process_supervision == 'supervise' %} - name: daemontools version: 0.53 url: http://download.pingcap.org/daemontools-0.53-linux-amd64.tar.gz checksum: "sha256:a4abd491cf185aef5644be5a4e1ed52c8f458802178d4c0efcc8178a5ca67fb7" {% endif %} diagnosis_packages: - name: tidb-insight version: v0.2.5-1-g99b8fea url: http://download.pingcap.org/tidb-insight-v0.2.5-1-g99b8fea.tar.gz checksum: "sha256:26034435d1b088529c300d5a8145758e68a6ef8000e3eeb6ce027a3ce56ebe45" {% elif cpu_architecture == 'arm64' -%} tidb_packages: - name: tidb version: {{ tidb_version }} url: http://download.pingcap.org/tidb-{{ tidb_version }}-linux-arm64.tar.gz tidb_toolkit_packages: - name: tidb-toolkit version: {{ tidb_version }} url: http://download.pingcap.org/tidb-toolkit-{{ tidb_version }}-linux-arm64.tar.gz common_packages: - name: fio version: 3.8 url: "http://download.pingcap.org/fio-3.8-linux-arm64.tar.gz" checksum: 
"sha256:8d086512b26d19229d6b1631db749880ed5a581c42e3bbb7e8d6a2f7155f4c9c" - name: kafka_exporter version: 1.1.0 url: http://download.pingcap.org/kafka_exporter-1.1.0.linux-arm64.tar.gz checksum: "sha256:68a130c00dbd13a530b1ea6b7661427b6af32716ee12c02bab52fdd9e280aec0" {% if process_supervision == 'supervise' %} - name: daemontools version: 0.53 url: http://download.pingcap.org/daemontools-0.53-linux-arm64.tar.gz checksum: "sha256:39884c7b714b1eff52e2acce0378c8200eb221174796fa54af6a1567f582938b" {% endif %} diagnosis_packages: - name: tidb-insight version: v0.2.5-1-g99b8fea url: http://download.pingcap.org/tidb-insight-v0.2.5-4-linux-arm64.tar.gz {% endif -%} ================================================ FILE: roles/local/templates/docker_packages.yml.j2 ================================================ --- tidb_images: - name: pingcap/tidb tag: {{ tidb_version }} service: tidb - name: pingcap/tikv tag: {{ tidb_version }} service: tikv - name: pingcap/pd tag: {{ tidb_version }} service: pd - name: pingcap/tidb-binlog {% if tidb_version == 'rc2.2' %} tag: rc2.2 {% else %} tag: latest {% endif %} service: tidb-binlog - name: pingcap/tidb-tools tag: latest service: tidb-tools third_party_images: - name: prom/prometheus tag: v2.2.1 service: prometheus - name: prom/alertmanager tag: v0.14.0 service: alertmanager - name: prom/node-exporter tag: v0.15.2 service: node-exporter - name: prom/blackbox-exporter tag: v0.12.0 service: blackbox-exporter - name: prom/pushgateway tag: v0.4.0 service: pushgateway - name: grafana/grafana tag: 4.6.3 service: grafana ================================================ FILE: roles/machine_benchmark/defaults/main.yml ================================================ --- fio_deploy_dir: "{{ tikv_data_dir }}/fio" # fio randread iops min_ssd_randread_iops: 40000 # fio mixed randread and sequential write min_ssd_mix_randread_iops: 10000 min_ssd_mix_write_iops: 10000 # fio mixed randread and sequential write lat max_ssd_mix_randread_lat: 250000 
# Ceiling for the write latency of the mixed fio latency test; the latency
# preflight check compares the measured value against it (both printed in ns).
max_ssd_mix_write_lat: 30000

# fio test file size
benchmark_size: 10G

================================================
FILE: roles/machine_benchmark/tasks/fio_randread.yml
================================================
---
# Random-read benchmark of the disk that holds tikv_data_dir.
# Flow: run fio -> delete the test file -> extract iops and a summary from the
# JSON report with parse_fio_output.py -> print both -> fail when the measured
# iops is below min_ssd_randread_iops.

- name: fio randread benchmark on tikv_data_dir disk
  shell: >-
    cd {{ fio_deploy_dir }} && ./fio -ioengine=psync -bs=32k -fdatasync=1
    -thread -rw=randread -size={{ benchmark_size }}
    -filename=fio_randread_test.txt -name='fio randread test'
    -iodepth=4 -runtime=60 -numjobs=4 -group_reporting
    --output-format=json --output=fio_randread_result.json
  register: fio_randread

# The data file is only needed while fio runs; the JSON report is kept.
- name: clean fio randread benchmark temporary file
  file:
    state: absent
    path: '{{ fio_deploy_dir }}/fio_randread_test.txt'

- name: get fio randread iops
  shell: python parse_fio_output.py --target='fio_randread_result.json' --read-iops
  args:
    chdir: '{{ fio_deploy_dir }}/'
  register: disk_randread_iops

- name: get fio randread summary
  shell: python parse_fio_output.py --target='fio_randread_result.json' --summary
  args:
    chdir: '{{ fio_deploy_dir }}/'
  register: disk_randread_smmary

# The command line is the same on every host, so it is printed only once.
- name: fio randread benchmark command
  run_once: true
  debug:
    msg: 'fio randread benchmark command: {{ fio_randread.cmd }}.'

- name: fio randread benchmark summary
  debug:
    msg: 'fio randread benchmark summary: {{ disk_randread_smmary.stdout }}.'

- name: Preflight check - Does fio randread iops of tikv_data_dir disk meet requirement
  when: "disk_randread_iops.stdout|int < min_ssd_randread_iops|int"
  fail:
    msg: >-
      fio: randread iops of tikv_data_dir disk is too low:
      {{ disk_randread_iops.stdout }} < {{ min_ssd_randread_iops }},
      it is strongly recommended to use SSD disks for TiKV and PD,
      or there might be performance issues.

================================================
FILE: roles/machine_benchmark/tasks/fio_randread_write.yml
================================================
---
# Mixed benchmark: random reads together with sequential writes
# (-rw=randrw -percentage_random=100,0) on the tikv_data_dir disk.

- name: fio mixed randread and sequential write benchmark on tikv_data_dir disk
  shell: >-
    cd {{ fio_deploy_dir }} && ./fio -ioengine=psync -bs=32k -fdatasync=1
    -thread -rw=randrw -percentage_random=100,0 -size={{ benchmark_size }}
    -filename=fio_randread_write_test.txt
    -name='fio mixed randread and sequential write test'
    -iodepth=4 -runtime=60 -numjobs=4 -group_reporting
    --output-format=json --output=fio_randread_write_test.json
  register: fio_randread_write

- name: clean fio mixed randread and sequential write benchmark temporary file
  file:
    state: absent
    path: '{{ fio_deploy_dir }}/fio_randread_write_test.txt'

- name: get fio mixed test randread iops
  shell: python parse_fio_output.py --target='fio_randread_write_test.json' --read-iops
  args:
    chdir: '{{ fio_deploy_dir }}/'
  register: disk_mix_randread_iops

- name: get fio mixed test write iops
  shell: python parse_fio_output.py --target='fio_randread_write_test.json' --write-iops
  args:
    chdir: '{{ fio_deploy_dir }}/'
  register: disk_mix_write_iops

- name: get fio mixed randread and sequential write summary
  shell: python parse_fio_output.py --target='fio_randread_write_test.json' --summary
  args:
    chdir: '{{ fio_deploy_dir }}/'
  register: disk_mix_randread_write_smmary

# The command line is the same on every host, so it is printed only once.
- name: fio mixed randread and sequential write benchmark command
  run_once: true
  debug:
    msg: 'fio mixed randread and sequential write benchmark command: {{ fio_randread_write.cmd }}.'

- name: fio mixed randread and sequential write benchmark summary
  debug:
    msg: 'fio mixed randread and sequential write benchmark summary: {{ disk_mix_randread_write_smmary.stdout }}.'
# IOPS gates for the mixed benchmark: each task aborts the play when the value
# extracted from the fio JSON report is below the configured minimum.
- name: Preflight check - Does fio mixed randread and sequential write iops of tikv_data_dir disk meet requirement - randread
  fail:
    msg: 'fio mixed randread and sequential write test: randread iops of tikv_data_dir disk is too low: {{ disk_mix_randread_iops.stdout }} < {{ min_ssd_mix_randread_iops }}, it is strongly recommended to use SSD disks for TiKV and PD, or there might be performance issues.'
  when: disk_mix_randread_iops.stdout|int < min_ssd_mix_randread_iops|int

- name: Preflight check - Does fio mixed randread and sequential write iops of tikv_data_dir disk meet requirement - sequential write
  fail:
    msg: 'fio mixed randread and sequential write test: sequential write iops of tikv_data_dir disk is too low: {{ disk_mix_write_iops.stdout }} < {{ min_ssd_mix_write_iops }}, it is strongly recommended to use SSD disks for TiKV and PD, or there might be performance issues.'
  when: disk_mix_write_iops.stdout|int < min_ssd_mix_write_iops|int

================================================
FILE: roles/machine_benchmark/tasks/fio_randread_write_latency.yml
================================================
---
# Latency variant of the mixed benchmark: same workload as the IOPS test but
# with a single job and -iodepth=1, then checked against the max_ssd_mix_*_lat
# ceilings.

- name: fio mixed randread and sequential write benchmark for latency on tikv_data_dir disk
  shell: "cd {{ fio_deploy_dir }} && ./fio -ioengine=psync -bs=32k -fdatasync=1 -thread -rw=randrw -percentage_random=100,0 -size={{ benchmark_size }} -filename=fio_randread_write_latency_test.txt -name='fio mixed randread and sequential write test' -iodepth=1 -runtime=60 -numjobs=1 -group_reporting --output-format=json --output=fio_randread_write_latency_test.json"
  register: fio_randread_write_latency

# The data file is only needed while fio runs; the JSON report is kept.
- name: clean fio mixed randread and sequential write benchmark for latency temporary file
  file:
    path: "{{ fio_deploy_dir }}/fio_randread_write_latency_test.txt"
    state: absent

- name: get fio mixed test randread latency
  shell: "python parse_fio_output.py --target='fio_randread_write_latency_test.json' --read-lat"
  register: disk_mix_randread_lat
  args:
    chdir: "{{ fio_deploy_dir }}/"

- name: get fio mixed test write latency
  shell: "python parse_fio_output.py --target='fio_randread_write_latency_test.json' --write-lat"
  register: disk_mix_write_lat
  args:
    chdir: "{{ fio_deploy_dir }}/"

- name: get fio mixed randread and sequential write for latency summary
  shell: "python parse_fio_output.py --target='fio_randread_write_latency_test.json' --summary"
  register: disk_mix_randread_write_latency_smmary
  args:
    chdir: "{{ fio_deploy_dir }}/"

# The command line is the same on every host, so it is printed only once.
- name: fio mixed randread and sequential write benchmark for latency command
  debug:
    msg: "fio mixed randread and sequential write benchmark for latency command: {{ fio_randread_write_latency.cmd }}."
  run_once: true

# Labelled "for latency" so this output can be told apart from the summary
# printed by the IOPS run of the same workload.
- name: fio mixed randread and sequential write benchmark for latency summary
  debug:
    msg: "fio mixed randread and sequential write benchmark for latency summary: {{ disk_mix_randread_write_latency_smmary.stdout }}."

# Latency gates fail when the measured value EXCEEDS the ceiling, so the
# message reports the latency as too high.
- name: Preflight check - Does fio mixed randread and sequential write latency of tikv_data_dir disk meet requirement - randread
  fail:
    msg: 'fio mixed randread and sequential write test: randread latency of tikv_data_dir disk is too high: {{ disk_mix_randread_lat.stdout }} ns > {{ max_ssd_mix_randread_lat }} ns, it is strongly recommended to use SSD disks for TiKV and PD, or there might be performance issues.'
  when: disk_mix_randread_lat.stdout|int > max_ssd_mix_randread_lat|int

- name: Preflight check - Does fio mixed randread and sequential write latency of tikv_data_dir disk meet requirement - sequential write
  fail:
    msg: 'fio mixed randread and sequential write test: sequential write latency of tikv_data_dir disk is too high: {{ disk_mix_write_lat.stdout }} ns > {{ max_ssd_mix_write_lat }} ns, it is strongly recommended to use SSD disks for TiKV and PD, or there might be performance issues.'
  when: disk_mix_write_lat.stdout|int > max_ssd_mix_write_lat|int

================================================
FILE: roles/machine_benchmark/tasks/main.yml
================================================
---
# Entry point of the machine_benchmark role: prepare the directories, ship fio
# and the report parser, then run the three fio benchmarks in order.

- name: create fio and tikv data directories
  file:
    path: '{{ item }}'
    state: directory
    # Quoted so the mode is passed as an octal string rather than a YAML integer.
    mode: "0755"
    owner: '{{ deploy_user }}'
    group: '{{ deploy_user }}'
  become: true
  with_items:
    - "{{ tikv_data_dir }}"
    - "{{ fio_deploy_dir }}"

- name: deploy fio binary
  copy:
    src: "{{ resources_dir }}/bin/fio"
    dest: "{{ fio_deploy_dir }}/"
    mode: "0755"

- name: deploy parse_fio_output.py script
  copy:
    src: "{{ script_dir }}/check/parse_fio_output.py"
    dest: "{{ fio_deploy_dir }}/parse_fio_output.py"
    mode: "0755"

- include_tasks: fio_randread.yml

- include_tasks: fio_randread_write.yml

- include_tasks: fio_randread_write_latency.yml

================================================
FILE: roles/node_exporter/defaults/main.yml
================================================
---
# default configuration for node_exporter

node_exporter_log_level: info
node_exporter_log_filename: "node_exporter.log"

# Image tag used by the docker deployment of node_exporter.
node_exporter_tag: v0.15.2

================================================
FILE: roles/node_exporter/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/node_exporter/tasks/binary_deployment.yml
================================================
---
# Binary deployment of node_exporter: log directory, binary, run script, then
# hand over to the configured process supervisor.

- name: create deploy directories
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ node_exporter_log_dir }}"

- name: deploy node_exporter binary
  copy:
    src: "{{ resources_dir }}/bin/node_exporter"
    dest: "{{ deploy_dir }}/bin"
    mode: "0755"

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}

- include_tasks: "{{ process_supervision }}_deployment.yml"
================================================
FILE: roles/node_exporter/tasks/docker_deployment.yml
================================================
---
# Docker deployment of node_exporter: ship the image archive, render the run
# script, load the image, then hand over to the process supervisor.

- name: deploy node-exporter image
  copy:
    src: '{{ downloads_dir }}/node-exporter.tar'
    dest: '{{ deploy_dir }}/images'
    mode: '0755'

- name: create run script
  with_items:
    - run
  template:
    src: '{{ item }}_{{ role_name }}_docker.sh.j2'
    dest: '{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh'
    mode: '0755'
    backup: true

- name: load docker image from archive
  docker_image:
    name: prom/node-exporter
    tag: '{{ node_exporter_tag }}'
    load_path: '{{ images_dir }}/node-exporter.tar'
    state: present
    force: true

- include_tasks: '{{ process_supervision }}_deployment.yml'

================================================
FILE: roles/node_exporter/tasks/main.yml
================================================
---

- include_tasks: '{{ deployment_method }}_deployment.yml'

# Prepends the node_exporter port to the firewalld_ports list.
- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [node_exporter_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/node_exporter/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  vars:
    this_role_name: node_exporter
    service_name: 'node_exporter-{{ node_exporter_port }}'
  include_role:
    name: supervise

================================================
FILE: roles/node_exporter/tasks/systemd_deployment.yml
================================================
---

- name: deploy systemd
  vars:
    this_role_name: node_exporter
    service_name: 'node_exporter-{{ node_exporter_port }}'
  include_role:
    name: systemd

================================================
FILE: roles/node_exporter/templates/run_node_exporter_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
# All your edit might be overwritten!
DEPLOY_DIR={{ deploy_dir }} cd "${DEPLOY_DIR}" || exit 1 exec > >(tee -i -a "{{ node_exporter_log_dir }}/{{ node_exporter_log_filename }}") exec 2>&1 exec bin/node_exporter --web.listen-address=":{{ node_exporter_port }}" \ --collector.tcpstat \ --collector.systemd \ --collector.mountstats \ --collector.meminfo_numa \ --collector.interrupts \ --collector.buddyinfo \ --collector.vmstat.fields="^.*" \ --log.level="{{ node_exporter_log_level }}" ================================================ FILE: roles/node_exporter/templates/run_node_exporter_docker.sh.j2 ================================================ #!/bin/bash set -e ulimit -n 1000000 # WARNING: This file was auto-generated. Do not edit! # All your edit might be overwritten! DEPLOY_DIR={{ deploy_dir }} cd "${DEPLOY_DIR}" || exit 1 exec docker run \ --net="host" \ --pid="host" \ --name="node_exporter-{{ node_exporter_port }}" \ prom/node-exporter:{{ node_exporter_tag }} ================================================ FILE: roles/ops/tasks/main.yml ================================================ --- - name: create check_tikv.sh script template: src: "check_tikv.sh.j2" dest: "{{ playbook_dir }}/scripts/check_tikv.sh" mode: "0755" - name: create pd-ctl.sh script template: src: "pd-ctl.sh.j2" dest: "{{ playbook_dir }}/scripts/pd-ctl.sh" mode: "0755" ================================================ FILE: roles/ops/templates/check_tikv.sh.j2 ================================================ #!/bin/bash {% if enable_tls|default(false) %} {{ resources_dir }}/bin/pd-ctl store -d -u https://{{ groups.pd_servers[0] }}:{{ hostvars[groups.pd_servers[0]].pd_client_port }} --cacert {{ cert_dir }}/ca.pem --cert {{ cert_dir }}/client.pem --key {{ cert_dir }}/client-key.pem | egrep '(id|address|state_name)' | awk '{if(NR%3!=0)ORS=" "; else ORS="\n"}1' | sed 's/^[ \t]*//g' {%- else -%} {{ resources_dir }}/bin/pd-ctl store -d -u http://{{ groups.pd_servers[0] }}:{{ hostvars[groups.pd_servers[0]].pd_client_port }} | egrep 
'(id|address|state_name)' | awk '{if(NR%3!=0)ORS=" "; else ORS="\n"}1' | sed 's/^[ \t]*//g' {% endif %} ================================================ FILE: roles/ops/templates/pd-ctl.sh.j2 ================================================ #!/bin/bash {% if enable_tls|default(false) %} {{ resources_dir }}/bin/pd-ctl -u https://{{ groups.pd_servers[0] }}:{{ hostvars[groups.pd_servers[0]].pd_client_port }} --cacert {{ cert_dir }}/ca.pem --cert {{ cert_dir }}/client.pem --key {{ cert_dir }}/client-key.pem {%- else -%} {{ resources_dir }}/bin/pd-ctl -u http://{{ groups.pd_servers[0] }}:{{ hostvars[groups.pd_servers[0]].pd_client_port }} -i {% endif %} ================================================ FILE: roles/pd/defaults/main.yml ================================================ --- pd_client_port: 2379 pd_peer_port: 2380 pd_name_prefix: pd pd_scheme: http pd_data_dir: "{{ deploy_dir }}/data.pd" pd_log_dir: "{{ deploy_dir }}/log" pd_log_filename: "pd.log" pd_stderr_filename: "pd_stderr.log" pd_conf_dir: "{{ deploy_dir }}/conf" location_labels: [] # docker settings pd_docker_log_dir: "{{ pd_log_dir }}/pd" ================================================ FILE: roles/pd/files/make-ssl.sh ================================================ #!/bin/bash # Author: Smana smainklh@gmail.com # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
set -o errexit set -o pipefail usage() { cat << EOF Create self signed certificates Usage : $(basename $0) [-d ] -h | --help : Show this message -d | --ssldir : Directory where the certificates will be located Environmental variables HOSTS and CN should be set to generate keys for each host. EOF } # Options parsing while (($#)); do case "$1" in -h | --help) usage; exit 0;; -d | --ssldir) SSLDIR="${2}"; shift 2;; *) usage echo "ERROR : Unknown option" exit 3 ;; esac done if [ -z ${SSLDIR} ]; then echo "ERROR: the directory where the certificates will be located is missing. option -d" exit 1 fi tmpdir=$(mktemp -d /tmp/tidb_cacert.XXXXXX) trap 'rm -rf "${tmpdir}"' EXIT cd "${tmpdir}" mkdir -p "${SSLDIR}" if [ -e "$SSLDIR/ca-config.json" ]; then # Reuse existing CA cp $SSLDIR/{ca-config.json,ca-csr.json} . else echo "ERROR: ca-config.json and ca-csr.json is missing in $SSLDIR." exit 1 fi # Root CA if [ -e "$SSLDIR/ca-key.pem" ]; then # Reuse existing CA cp $SSLDIR/{ca.pem,ca-key.pem} . else cfssl gencert -initca ca-csr.json | cfssljson -bare ca - > /dev/null 2>&1 fi # client cert if [ ! 
-e "$SSLDIR/client-key.pem" ]; then echo '{"CN":"client","hosts":[""],"key":{"algo":"rsa","size":2048}}' | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=client -hostname="" - | cfssljson -bare client > /dev/null 2>&1 fi gen_key_and_cert() { local host=$1 local cn=$2 local name=$3 echo "{\"CN\":\"${cn}\",\"hosts\":[\"\"],\"key\":{\"algo\":\"rsa\",\"size\":2048}}" | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=server -hostname="${host},127.0.0.1" - | cfssljson -bare ${name} > /dev/null 2>&1 } # Nodes if [ -n "$HOSTS" ]; then for host in $HOSTS; do gen_key_and_cert "${host}" "${CN}" "${CN}-${host}" done fi # Install certs mv *.pem ${SSLDIR}/ ================================================ FILE: roles/pd/meta/main.yml ================================================ --- dependencies: - role: common_dir ================================================ FILE: roles/pd/tasks/binary_deployment.yml ================================================ --- - name: deploy binary copy: src="{{ resources_dir }}/bin/pd-server" dest="{{ deploy_dir }}/bin/" mode=0755 backup=yes register: pd_binary - name: backup binary file command: mv "{{ pd_binary.backup_file }}" "{{ backup_dir }}" when: pd_binary.changed and pd_binary.backup_file is defined - name: create startup script template: src: "{{ item }}_{{ role_name }}_binary.sh.j2" dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh" mode: "0755" backup: yes with_items: - run vars: role_status_dir: status/{{ role_name }} - include_tasks: "{{ process_supervision }}_deployment.yml" ================================================ FILE: roles/pd/tasks/check_certs.yml ================================================ --- - name: "Check_certs | check if the certs have already been generated on control machine" find: paths: "{{ cert_dir }}" patterns: "*.pem" get_checksum: true delegate_to: localhost register: cert_control_node run_once: true - debug: var: cert_control_node - 
name: "Check_certs | Set default value for 'sync_certs', 'gen_certs' to false" set_fact: sync_certs: false gen_certs: false - set_fact: pd_host: "{{ hostvars[inventory_hostname].ansible_host | default(inventory_hostname) }}" - name: "Check certs | check if a cert already exists on node" stat: path: "{{ pd_cert_dir }}/{{ item }}" register: cert_pd_node with_items: - ca.pem - pd-server-{{ pd_host }}-key.pem - pd-server-{{ pd_host }}.pem - debug: var: cert_pd_node - name: "Check_certs | Set 'gen_certs' to true" set_fact: gen_certs: true when: not item in cert_control_node.files|map(attribute='path') | list delegate_to: localhost run_once: true with_items: >- ['{{cert_dir}}/ca.pem', {% set all_pd_hosts = groups['pd_servers']|unique|sort %} {% for host in all_pd_hosts %} {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} '{{cert_dir}}/pd-server-{{ pd_ip }}-key.pem' {% if not loop.last %}{{','}}{% endif %} {% endfor %}] - debug: var: gen_certs - name: "Check_certs | Set 'gen_node_certs' to true" set_fact: gen_node_certs: |- { {% set all_pd_hosts = groups['pd_servers']|unique|sort -%} {% set existing_certs = cert_control_node.files|map(attribute='path')|list|sort %} {% for host in all_pd_hosts -%} {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set host_cert = "%s/pd-server-%s-key.pem"|format(cert_dir, pd_ip) %} {% if host_cert in existing_certs -%} "{{ host }}": False, {% else -%} "{{ host }}": True, {% endif -%} {% endfor %} } run_once: true - debug: var: gen_node_certs - name: "Check_certs | Set pd_cert_key" set_fact: pd_cert_key_path: "{{ cert_dir }}/pd-server-{{ hostvars[inventory_hostname].pd_host }}-key.pem" - debug: var: pd_cert_key_path - name: "Check_certs | Set 'sync_certs' to true" set_fact: sync_certs: true when: gen_node_certs[inventory_hostname] or (not cert_pd_node.results[0].stat.exists|default(False)) or (not cert_pd_node.results[1].stat.exists|default(False)) or 
(cert_pd_node.results[1].stat.checksum|default('') != cert_control_node.files|selectattr("path","equalto",pd_cert_key_path)|map(attribute="checksum")|first|default('')) - debug: var: sync_certs ================================================ FILE: roles/pd/tasks/docker_deployment.yml ================================================ --- - name: create log directory file: path="{{ item }}" state=directory mode=0755 with_items: - "{{ pd_docker_log_dir }}" - name: deploy pd image copy: src="{{ downloads_dir }}/pd.tar" dest="{{ deploy_dir }}/images" mode=0755 - name: create run script template: src: "{{ item }}_{{ role_name }}_docker.sh.j2" dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh" mode: "0755" backup: yes with_items: - run - name: load docker image from archive docker_image: state: present force: yes name: pingcap/pd tag: "{{ tidb_version }}" load_path: "{{ images_dir }}/pd.tar" - include_tasks: "{{ process_supervision }}_deployment.yml" ================================================ FILE: roles/pd/tasks/gen_certs.yml ================================================ --- - name: Gen_certs | copy certs generation script copy: src: "make-ssl.sh" dest: "{{ script_dir }}/make-ssl.sh" mode: 0700 run_once: yes delegate_to: localhost when: gen_certs|default(false) - name: Gen_certs | run cert generation script command: "{{ script_dir }}/make-ssl.sh -d {{ cert_dir }}" environment: - HOSTS: "{% for h in groups['pd_servers'] %} {% if gen_node_certs[h]|default(true) %} {{ hostvars[h].ansible_host | default(hostvars[h].inventory_hostname) }} {% endif %} {% endfor %}" - PATH: "{{ ansible_env.PATH }}:{{ binary_dir }}" - CN: "pd-server" run_once: yes delegate_to: localhost when: gen_certs|default(false) ================================================ FILE: roles/pd/tasks/install_certs.yml ================================================ --- - name: "Deploy_certs | Make sure the certificate directory exits" file: path: "{{ pd_cert_dir }}" state: directory mode: 
0700 - name: "Deploy_certs | Deploy certificates" copy: src: "{{ cert_dir }}/{{ item }}" dest: "{{ pd_cert_dir }}/{{ item }}" mode: 0600 backup: yes with_items: - ca.pem - pd-server-{{ pd_host }}-key.pem - pd-server-{{ pd_host }}.pem when: sync_certs|default(false) ================================================ FILE: roles/pd/tasks/main.yml ================================================ --- # tasks file for pd - name: create deploy directories file: path={{ item }} state=directory mode=0755 with_items: - "{{ pd_log_dir }}" - "{{ pd_conf_dir }}" - "{{ pd_data_dir }}" - include_tasks: check_certs.yml when: enable_tls|default(false) - include_tasks: gen_certs.yml when: enable_tls|default(false) - include_tasks: install_certs.yml when: enable_tls|default(false) - name: "load customized config: tidb-ansible/conf/pd.yml" include_vars: file={{ playbook_dir }}/conf/pd.yml name=pd_conf_custom - name: load default config include_vars: file=default.yml name=pd_conf_default - name: generate dynamic config set_fact: pd_conf_generated: replication: location-labels: "{{ location_labels }}" security: cacert-path: >- {%- if enable_tls|default(false) -%}{{ pd_cert_dir }}/ca.pem{%- else -%}{%- endif -%} cert-path: >- {%- if enable_tls|default(false) -%}{{ pd_cert_dir }}/pd-server-{{ pd_host }}.pem{%- else -%}{%- endif -%} key-path: >- {%- if enable_tls|default(false) -%}{{ pd_cert_dir }}/pd-server-{{ pd_host }}-key.pem{%- else -%}{%- endif -%} - name: generate final config set_fact: pd_conf: "{{ pd_conf_custom | with_default_dicts(pd_conf_generated, pd_conf_default) | update_default_dicts }}" - debug: var=pd_conf - name: create configuration file template: src=pd.toml.j2 dest={{ deploy_dir }}/conf/pd.toml mode=0644 backup=yes register: pd_conf_st - name: backup conf file command: mv "{{ pd_conf_st.backup_file }}" "{{ backup_dir }}" when: pd_conf_st.changed and pd_conf_st.backup_file is defined - include_tasks: "{{ deployment_method }}_deployment.yml" - name: prepare firewalld 
white list set_fact: firewalld_ports: "{{ [pd_peer_port ~ '/tcp', pd_client_port ~ '/tcp'] + firewalld_ports }}"
================================================
FILE: roles/pd/tasks/supervise_deployment.yml
================================================
---

# Hands PD over to the shared "supervise" role; the service is named after
# the PD client port.
- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: pd
    service_name: pd-{{ pd_client_port }}

================================================
FILE: roles/pd/tasks/systemd_deployment.yml
================================================
---

# Hands PD over to the shared "systemd" role; the service is named after
# the PD client port.
- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: pd
    service_name: pd-{{ pd_client_port }}

================================================
FILE: roles/pd/templates/pd.toml.j2
================================================
# PD Configuration

initial-cluster-state = "new"

{% for item, value in pd_conf.global | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[security]
{% for item, value in pd_conf.security | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[log]
{% for item, value in pd_conf.log | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json}}
{% else %}
[log.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[metric]
{% for item, value in pd_conf.metric | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[schedule]
{% for item, value in pd_conf.schedule | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[replication]
{# default([]) must run before length: when the inventory has no tiflash_servers group the lookup is undefined, and a default() placed after length would never be reached. #}
{% if groups.tiflash_servers | default([]) | length > 0 -%}
enable-placement-rules = true
{% endif -%}
{% for item, value in pd_conf.replication | dictsort -%}
{{ item }} = {{ value | to_json}}
{% endfor %}

[dashboard]
{% for item, value in pd_conf.dashboard | dictsort -%}
{{ item }} = {{ value | to_json}}
{% endfor %}

================================================ FILE: roles/pd/templates/run_pd_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
# All your edit might be overwritten!
DEPLOY_DIR={{ deploy_dir }}

cd "${DEPLOY_DIR}" || exit 1

{% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%}
{% set my_separator = "_" %}
{% set my_hostname = hostvars[inventory_hostname]['ansible_hostname'] | default(hostvars[inventory_hostname].inventory_hostname) -%}
{% set my_peer_id = my_separator ~ my_hostname %}
{# pd_scheme is used in every URL below, so it needs a value on both paths: https when TLS is enabled, plain http otherwise. #}
{% if enable_tls|default(false) %}
{% set pd_scheme = 'https' -%}
{% else %}
{% set pd_scheme = 'http' -%}
{% endif %}
{# Build the "name=scheme://ip:peer_port" entry for each PD host; the list is joined into --initial-cluster. #}
{% set all_pd = [] -%}
{% for host in groups.pd_servers -%}
  {% set other_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set other_port = hostvars[host]['pd_peer_port'] -%}
  {% set other_pd_name_surfix = hostvars[host]['ansible_hostname'] | default(hostvars[host].inventory_hostname) -%}
  {% set other_pd_name = pd_name_prefix ~ my_separator ~ other_pd_name_surfix -%}
  {% set _ = all_pd.append("%s=%s://%s:%s" % (other_pd_name, pd_scheme, other_ip, other_port)) -%}
{% endfor -%}

exec bin/pd-server \
    --name="{{ pd_name_prefix }}{{ my_peer_id }}" \
    --client-urls="{{ pd_scheme }}://{{ my_ip }}:{{ pd_client_port }}" \
    --advertise-client-urls="{{ pd_scheme }}://{{ my_ip }}:{{ pd_client_port }}" \
    --peer-urls="{{ pd_scheme }}://{{ my_ip }}:{{ pd_peer_port }}" \
    --advertise-peer-urls="{{ pd_scheme }}://{{ my_ip }}:{{ pd_peer_port }}" \
    --data-dir="{{ pd_data_dir }}" \
    --initial-cluster="{{ all_pd | join(',') }}" \
    --config=conf/pd.toml \
    --log-file="{{ pd_log_dir }}/{{ pd_log_filename }}" 2>> "{{ pd_log_dir }}/{{ pd_stderr_filename }}"

================================================ FILE: roles/pd/templates/run_pd_docker.sh.j2 ================================================ #!/bin/bash set -e ulimit -n 1000000 # WARNING: This file was auto-generated. Do not edit! # All your edit might be overwritten!
DEPLOY_DIR={{ deploy_dir }} cd "${DEPLOY_DIR}" || exit 1 {% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%} {% set my_peer_id = groups.pd_servers.index(inventory_hostname) + 1 -%} {% set all_pd = [] -%} {% for host in groups.pd_servers -%} {% set other_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set other_port = hostvars[host]['pd_peer_port'] -%} {% set other_pd_name_surfix = groups.pd_servers.index(host) + 1 -%} {% set other_pd_name = pd_name_prefix ~ other_pd_name_surfix -%} {% set _ = all_pd.append("%s=http://%s:%s" % (other_pd_name, other_ip, other_port)) -%} {% endfor -%} exec docker run \ -p {{ pd_client_port }}:2379 \ -p {{ pd_peer_port }}:2380 \ -v /etc/localtime:/etc/localtime:ro \ -v "{{ pd_conf_dir }}/pd.toml:/etc/pd.toml:ro" \ -v "{{ pd_data_dir }}:/data" \ -v "{{ pd_docker_log_dir }}:/var/log" \ -u `id -u {{ deploy_user }}` \ --ulimit nofile=1000000:1000000 \ --hostname "{{ pd_name_prefix }}{{ my_peer_id }}" \ --name "{{ pd_name_prefix }}" \ pingcap/pd:{{ tidb_version }} \ --name="{{ pd_name_prefix }}{{ my_peer_id }}" \ --client-urls=http://0.0.0.0:2379 \ --advertise-client-urls=http://{{my_ip}}:2379 \ --peer-urls=http://0.0.0.0:2380 \ --advertise-peer-urls=http://{{my_ip}}:2380 \ --initial-cluster="{{ all_pd | join(',') }}" \ --data-dir=/data \ --log-file="/var/log/{{ pd_log_filename }}" \ --config=/etc/pd.toml ================================================ FILE: roles/pd/vars/default.yml ================================================ --- # default configuration file for pd in yaml format global: # name: "pd" # data-dir: "default.pd" # # client-urls: "http://127.0.0.1:2379" # # if not set, use ${client-urls} # advertise-client-urls: "" # # peer-urls: "http://127.0.0.1:2380" # # if not set, use ${peer-urls} # advertise-peer-urls: "" # # initial-cluster: "pd=http://127.0.0.1:2380" # initial-cluster-state: "new" lease: 3 tso-save-interval: 
"3s" security: log: level: "info" # file logging file: #filename: "" # max log file size in MB #max-size: 300 # max log file keep days #max-days: 28 # maximum number of old log files to retain #max-backups: 7 # rotate log by day #log-rotate: true metric: schedule: split-merge-interval: "1h" max-snapshot-count: 3 max-pending-peer-count: 16 max-store-down-time: "30m" leader-schedule-limit: 4 region-schedule-limit: 64 replica-schedule-limit: 64 merge-schedule-limit: 8 enable-one-way-merge: false replication: # The number of replicas for each region. max-replicas: 3 # The label keys specified the location of a store. # The placement priorities is implied by the order of label keys. # For example, ["zone", "rack"] means that we should place replicas to # different zones first, then to different racks if we don't have enough zones. location-labels: [] dashboard: public-path-prefix: "/dashboard" internal-proxy: false enable-telemetry: true ================================================ FILE: roles/perf_tools/tasks/main.yml ================================================ --- # tasks file for perf-tools - name: Deploy iosnoop and funcslower copy: src: "{{ playbook_dir }}/scripts/{{ item }}" dest: "{{ deploy_dir }}/scripts/" mode: 0755 backup: yes with_items: - iosnoop - funcslower ================================================ FILE: roles/pre-ansible/defaults/main.yml ================================================ # The directory where binaries are stored on Ansible # # managed systems. bin_dir: /usr/bin # The directory where scripts used for bootstrapping CoreOS # reside. 
bootstrap_script_dir: /opt pypy_version: 5.6.0 # https://bitbucket.org/pypy/pypy/downloads/pypy2-v5.6.0-linux64.tar.bz2 # http://download.pingcap.org/pypy2-v5.6.0-linux64.tar.bz2 pypy_download_url_base: "https://bitbucket.org/pypy/pypy/downloads" pypy_download_url: "{{ pypy_download_url_base }}/pypy2-v{{ pypy_version }}-linux64.tar.bz2" pypy_download_url_under_outbound: "http://download.pingcap.org/pypy2-v5.6.0-linux64.tar.bz2" ================================================ FILE: roles/pre-ansible/tasks/coreos.yml ================================================ --- - name: CoreOS | Check python executable linkage and mark .bootstrapped shell: "{{ bootstrap_script_dir }}/pypy/bin/python --version" register: pypy_st - name: CoreOS - detect outbound network[1] shell: curl -s --connect-timeout 4 baidu.com 2>/dev/null >/dev/null; echo $? changed_when: false register: outbound_network_st - name: CoreOS - set outbound network fact[1] set_fact: has_outbound_network={{ outbound_network_st.stdout.strip() == '0' }} - name: CoreOS - detect outbound network[2] shell: curl -s --connect-timeout 2 google.com 2>/dev/null >/dev/null; echo $? 
changed_when: false register: outbound_st - name: CoreOS - set outbound network fact[2] set_fact: under_outbound={{ outbound_st.stdout.strip() != '0' }} - name: CoreOS - use mirror if detect outbound network set_fact: pypy_download_url: "{{ pypy_download_url_under_outbound }}" - name: CoreOS | Create bootstrap directory shell: sudo mkdir -p {{ bootstrap_script_dir }} - name: CoreOS | deploy pypy - shell: > http_proxy="{{ http_proxy|default('') }}" https_proxy="{{ https_proxy|default('') }}" no_proxy="{{ no_proxy|default('') }}" wget -O /tmp/pypy2-v{{ pypy_version }}-linux64.tar.bz2 {{ pypy_download_url }} when: has_outbound_network - name: CoreOS | Extract pypy tar to tmp shell: tar -xjf /tmp/pypy2-v{{ pypy_version }}-linux64.tar.bz2 -C /tmp - name: CoreOS | Move pypy source to python bootstrap directory shell: sudo mv -n /tmp/pypy2-v{{ pypy_version }}-linux64 {{ bootstrap_script_dir }}/pypy - name: CoreOS | Make pypy lib directory and link ncurses .so shell: mkdir -p {{ bootstrap_script_dir }}/pypy/lib && ln -snf /lib64/libncurses.so.5.9 {{ bootstrap_script_dir }}/pypy/lib/libtinfo.so.5 - name: CoreOS | Add python exec script to bootstrap directory shell: > printf "%s\n%s" "#! 
/bin/bash" "LD_LIBRARY_PATH={{ bootstrap_script_dir }}/pypy/lib:\$LD_LIBRARY_PATH exec {{ bootstrap_script_dir }}/pypy/bin/pypy \"\$@\"" > {{ bootstrap_script_dir }}/pypy/bin/python - name: CoreOS | Add exec permission to python exec script shell: chmod +x {{ bootstrap_script_dir }}/pypy/bin/python - name: CoreOS | Check python executable linkage and mark .bootstrapped shell: "{{ bootstrap_script_dir }}/pypy/bin/python --version" register: pypy_st - name: CoreOS | "PATH=\$PATH:{{ bin_dir }}" > /etc/profile.d/python-path.sh - name: CoreOS | Change permissions and ownership for opt-path.sh to run as root shell: chmod 0755 /etc/profile.d/python-path.sh && chown root /etc/profile.d/python-path.sh
================================================
FILE: roles/pre-ansible/tasks/main.yml
================================================
---

# Abort early when the filesystem holding the current directory reports 100% use.
- name: disk space check - fail when disk is full
  shell: df -h . | tail -n1
  register: disk_space_st
  failed_when: " '100%' in disk_space_st.stdout "
  changed_when: false

# Debian GNU/Linux, Ubuntu, Fedora, CentOS, CoreOS
- name: Get distro name from /etc/os-release
  shell: "([ -f /etc/os-release ] && grep '^NAME=' /etc/os-release | sed s'/NAME=//' | tr -d \\\") || ([ -f /etc/redhat-release ] && cat /etc/redhat-release | cut '-d ' -f1)"
  register: distro_st
  failed_when: false
  changed_when: false

- name: set distro facts
  set_fact:
    distro: "{{ distro_st.stdout | trim }}"

# Probe for a python executable; failures are tolerated here and evaluated
# by the has_python tasks below.
- name: python check
  shell: python --version
  register: py_st
  failed_when: false
  changed_when: false

# Assume python is present, then flip the fact when the probe failed.
- name: set has_python facts
  set_fact:
    has_python: true

# The shell reports "command not found" on stderr, so matching that text in
# stdout never triggers; the non-zero return code is the reliable signal.
- name: set has_python facts
  set_fact:
    has_python: false
  when: py_st.rc != 0

- include_tasks: coreos.yml
  when: "'CoreOS' in distro and not has_python"

- include_tasks: root_tasks.yml
  when: "'CoreOS' not in distro"

================================================ FILE: roles/pre-ansible/tasks/root_tasks.yml ================================================ --- - name: Debian/Ubuntu - install python
shell: sudo apt-get -y install python when: - "'Ubuntu' in distro or 'Debian' in distro" - "not has_python" - name: Redhat/CentOS - install python shell: sudo yum -y install python when: - "'CentOS' in distro" - "not has_python" - name: Redhat/CentOS - Make sure ntp, ntpstat have been installed yum: name: "{{ item }}" state: present with_items: - ntp when: - "'CentOS' in distro" - enable_ntpd - name: Debian/Ubuntu - Make sure ntp, ntpstat have been installed apt: name: "{{ item }}" state: present with_items: - ntp - ntpstat when: - "'Ubuntu' in distro or 'Debian' in distro" - enable_ntpd ================================================ FILE: roles/prometheus/defaults/main.yml ================================================ --- # default configuration for prometheus prometheus_data_dir: "{{ deploy_dir }}/prometheus2.0.0.data.metrics" # How long to retain samples in the storage prometheus_storage_retention: "30d" prometheus_log_level: info prometheus_log_dir: "{{ deploy_dir }}/log" prometheus_log_filename: "prometheus.log" prometheus_extra_labels: - { label: "monitor", value: "prometheus" } alert_label: "promethues" prometheus_tag: v2.6.1 ================================================ FILE: roles/prometheus/files/binlog.rules.yml ================================================ groups: - name: alert.rules rules: - alert: binlog_pump_storage_error_count expr: changes(binlog_pump_storage_error_count[1m]) > 0 labels: env: ENV_LABELS_ENV level: emergency expr: changes(binlog_pump_storage_error_count[1m]) > 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' summary: binlog pump storage write some binlogs failed - alert: binlog_drainer_checkpoint_high_delay expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 for: 1m labels: env: ENV_LABELS_ENV level: critical expr: (time() - binlog_drainer_checkpoint_tso / 1000) > 3600 annotations: description: 'cluster: ENV_LABELS_ENV, instance: 
{{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' summary: binlog drainer checkpoint delay more than 1 hour
  # In every rule below, labels.expr is a textual copy of the rule's expr;
  # the two must stay identical so the label reflects the condition that fires.
  - alert: binlog_pump_write_binlog_rpc_duration_seconds_bucket
    expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: histogram_quantile(0.9, rate(binlog_pump_rpc_duration_seconds_bucket{method="WriteBinlog"}[5m])) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: binlog pump write binlog RPC latency is too high

  - alert: binlog_pump_storage_write_binlog_duration_time_bucket
    expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: histogram_quantile(0.9, rate(binlog_pump_storage_write_binlog_duration_time_bucket{type="batch"}[5m])) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: binlog pump write binlog to disk is too slow

  - alert: binlog_pump_storage_available_size_less_than_20G
    expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024
    for: 10s
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: binlog_pump_storage_storage_size_bytes{type="available"} < 20 * 1024 * 1024 * 1024
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: binlog pump storage available size less than 20G

  # labels.expr now names the same metric as expr (execute_duration_time),
  # matching the alert name.
  - alert: binlog_drainer_execute_duration_time_more_than_10s
    expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: histogram_quantile(0.9, rate(binlog_drainer_execute_duration_time_bucket[1m])) > 10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: binlog binlog drainer execute_duration_time_more_than_10s

  - alert: binlog_drainer_checkpoint_tso_no_change_for_1m
    expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: changes(binlog_drainer_checkpoint_tso[1m]) < 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: binlog drainer checkpoint tso no change for 1m

================================================
FILE: roles/prometheus/files/blacker.rules.yml
================================================
groups:
- name: alert.rules
  rules:
  # Each rule fires when probe_success for the given group has been 0 for 1m.
  - alert: TiDB_server_is_down
    expr: probe_success{group="tidb"} == 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: probe_success{group="tidb"} == 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}'
      value: '{{ $value }}'
      summary: TiDB server is down

  - alert: Pump_server_is_down
    expr: probe_success{group="pump"} == 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: probe_success{group="pump"} == 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}'
      value: '{{ $value }}'
      summary: Pump server is down

  - alert: Drainer_server_is_down
    expr: probe_success{group="drainer"} == 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: probe_success{group="drainer"} == 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}'
      value: '{{ $value }}'
      summary: Drainer server is down

  - alert: TiKV_server_is_down
    expr: probe_success{group="tikv"} == 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: probe_success{group="tikv"} == 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}'
      value: '{{ $value }}'
      summary: TiKV server is down

- alert: PD_server_is_down expr: probe_success{group="pd"} == 0 for: 1m labels: env:
ENV_LABELS_ENV level: emergency expr: probe_success{group="pd"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: PD server is down - alert: Node_exporter_server_is_down expr: probe_success{group="node_exporter"} == 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: probe_success{group="node_exporter"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: Node_exporter server is down - alert: Blackbox_exporter_server_is_down expr: probe_success{group="blackbox_exporter"} == 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: probe_success{group="blackbox_exporter"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: Blackbox_exporter server is down - alert: Grafana_server_is_down expr: probe_success{group="grafana"} == 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: probe_success{group="grafana"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: Grafana server is down - alert: Pushgateway_server_is_down expr: probe_success{group="pushgateway"} == 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: probe_success{group="pushgateway"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: Pushgateway server is down - alert: Kafka_exporter_is_down expr: probe_success{group="kafka_exporter"} == 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: probe_success{group="kafka_exporter"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: Kafka_exporter server is down - alert: Pushgateway_metrics_interface expr: probe_success{job="blackbox_exporter_http"} == 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency 
expr: probe_success{job="blackbox_exporter_http"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: Pushgateway metrics interface is down - alert: BLACKER_ping_latency_more_than_1s expr: max_over_time(probe_duration_seconds{job=~"blackbox_exporter.*_icmp"}[1m]) > 1 for: 1m labels: env: ENV_LABELS_ENV level: warning expr: max_over_time(probe_duration_seconds{job=~"blackbox_exporter.*_icmp"}[1m]) > 1 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' summary: blackbox_exporter ping latency more than 1s ================================================ FILE: roles/prometheus/files/bypass.rules.yml ================================================ groups: - name: alert.rules rules: ================================================ FILE: roles/prometheus/files/kafka.rules.yml ================================================ groups: - name: alert.rules rules: - alert: Kafka_is_down expr: probe_success{group="kafka"} == 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: probe_success{group="kafka"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: Kafka is down - alert: kafka_brokers expr: kafka_brokers < 3 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: kafka_brokers < 3 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' summary: One or more kafka brokers are down - alert: Zookeeper_is_down expr: probe_success{group="zookeeper"} == 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: probe_success{group="zookeeper"} == 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}' value: '{{ $value }}' summary: Zookeeper is down - alert: kafka_topic_partition_in_sync_replica expr: kafka_topic_partition_in_sync_replica != 3 
for: 1m labels: env: ENV_LABELS_ENV level: critical expr: kafka_topic_partition_in_sync_replica != 3 annotations: description: 'cluster: ENV_LABELS_ENV, topic: {{ $labels.topic }}, partition: {{ $labels.partition }}, values: {{ $value }}' value: '{{ $value }}' summary: kafka topic partition in sync replica is missing - alert: kafka_topic_partition_leader_change expr: changes(kafka_topic_partition_leader[10m]) >= 1 for: 1m labels: env: ENV_LABELS_ENV level: warning expr: changes(kafka_topic_partition_leader[10m]) >= 1 annotations: description: 'cluster: ENV_LABELS_ENV, topic: {{ $labels.topic }}, partition: {{ $labels.partition }}, values: {{ $value }}' value: '{{ $value }}' summary: kafka topic partition leader change ================================================ FILE: roles/prometheus/files/lightning.rules.yml ================================================ groups: - name: alert.rules rules: - alert: Lightning_import_failure_tables_count expr: sum ( lightning_tables{result="failure"} ) > 0 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: sum ( lightning_tables{result="failure"} ) > 0 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: Lightning failed to import a table ================================================ FILE: roles/prometheus/files/node.rules.yml ================================================ groups: - name: alert.rules rules: - alert: NODE_disk_used_more_than_80% expr: node_filesystem_avail_bytes{fstype=~"(ext.|xfs)", mountpoint!~"/boot"} / node_filesystem_size_bytes{fstype=~"(ext.|xfs)", mountpoint!~"/boot"} * 100 <= 20 for: 3m labels: env: ENV_LABELS_ENV level: emergency expr: node_filesystem_avail_bytes{fstype=~"(ext.|xfs)", mountpoint!~"/boot"} / node_filesystem_size_bytes{fstype=~"(ext.|xfs)", mountpoint!~"/boot"} * 100 <= 20 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ 
$value }}' summary: disk used more than 80%
  # In every rule below, labels.expr is a textual copy of the rule's expr;
  # the two must stay identical so the label reflects the condition that fires.
  - alert: NODE_disk_inode_more_than_80%
    expr: node_filesystem_files_free{fstype=~"(ext.|xfs)"} / node_filesystem_files{fstype=~"(ext.|xfs)"} * 100 < 20
    for: 3m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: node_filesystem_files_free{fstype=~"(ext.|xfs)"} / node_filesystem_files{fstype=~"(ext.|xfs)"} * 100 < 20
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: disk inode more than 80%

  - alert: NODE_disk_readonly
    expr: node_filesystem_readonly{fstype=~"(ext.|xfs)"} == 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: node_filesystem_readonly{fstype=~"(ext.|xfs)"} == 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}, device: {{ $labels.device }}, mountpoint: {{ $labels.mountpoint }}'
      value: '{{ $value }}'
      summary: disk readonly

  - alert: NODE_memory_used_more_than_80%
    expr: (((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) >= 80
    for: 3m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: (((node_memory_MemTotal_bytes-node_memory_MemFree_bytes-node_memory_Cached_bytes)/(node_memory_MemTotal_bytes)*100)) >= 80
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: memory used more than 80%

  - alert: NODE_node_overload
    expr: (node_load5 / count without (cpu, mode) (node_cpu_seconds_total{mode="system"})) > 1
    for: 3m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: (node_load5 / count without (cpu, mode) (node_cpu_seconds_total{mode="system"})) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: node overload

  # labels.expr previously carried a duplicated "irate(" with unbalanced
  # parentheses; it is now the same text as expr.
  - alert: NODE_cpu_used_more_than_80%
    expr: avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance) * 100 <= 20
    for: 3m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance) * 100 <= 20
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: cpu used more than 80%

  - alert: NODE_tcp_estab_num_more_than_50000
    expr: node_netstat_Tcp_CurrEstab > 50000
    for: 3m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: node_netstat_Tcp_CurrEstab > 50000
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: tcp establish connection more than 50000

  - alert: NODE_disk_read_latency_more_than_32ms
    expr: ( (rate(node_disk_read_time_seconds_total{device=~".+"}[5m]) / rate(node_disk_reads_completed_total{device=~".+"}[5m])) or (irate(node_disk_read_time_seconds_total{device=~".+"}[5m]) / irate(node_disk_reads_completed_total{device=~".+"}[5m])) ) * 1000 > 32
    for: 3m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: ( (rate(node_disk_read_time_seconds_total{device=~".+"}[5m]) / rate(node_disk_reads_completed_total{device=~".+"}[5m])) or (irate(node_disk_read_time_seconds_total{device=~".+"}[5m]) / irate(node_disk_reads_completed_total{device=~".+"}[5m])) ) * 1000 > 32
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: disk_read_latency_more_than_32ms

- alert: NODE_disk_write_latency_more_than_16ms expr: ( (rate(node_disk_write_time_seconds_total{device=~".+"}[5m]) / rate(node_disk_writes_completed_total{device=~".+"}[5m])) or (irate(node_disk_write_time_seconds_total{device=~".+"}[5m]) / irate(node_disk_writes_completed_total{device=~".+"}[5m])) ) * 1000 > 16 for: 3m labels: env: ENV_LABELS_ENV level: warning expr: ( (rate(node_disk_write_time_seconds_total{device=~".+"}[5m]) / rate(node_disk_writes_completed_total{device=~".+"}[5m])) or
(irate(node_disk_write_time_seconds_total{device=~".+"}[5m]) / irate(node_disk_writes_completed_total{device=~".+"}[5m])) ) * 1000 > 16 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' summary: disk_write_latency_more_than_16ms ================================================ FILE: roles/prometheus/files/pd.rules.yml ================================================ groups: - name: alert.rules rules: - alert: PD_cluster_offline_tikv_nums expr: (sum ( pd_cluster_status{type="store_down_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0) for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: (sum ( pd_cluster_status{type="store_down_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0) annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_cluster_offline_tikv_nums - alert: PD_etcd_write_disk_latency expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 for: 1m labels: env: ENV_LABELS_ENV level: critical expr: histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[1m])) by (instance,job,le) ) > 1 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_etcd_write_disk_latency - alert: PD_miss_peer_region_count expr: (sum(pd_regions_status{type="miss_peer_region_count"}) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 1m labels: env: ENV_LABELS_ENV level: critical expr: (sum(pd_regions_status{type="miss_peer_region_count"}) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0) annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: 
PD_miss_peer_region_count - alert: PD_cluster_lost_connect_tikv_nums expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0) for: 1m labels: env: ENV_LABELS_ENV level: warning expr: (sum ( pd_cluster_status{type="store_disconnected_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0) annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_cluster_lost_connect_tikv_nums - alert: PD_cluster_low_space expr: (sum ( pd_cluster_status{type="store_low_space_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0) for: 1m labels: env: ENV_LABELS_ENV level: warning expr: (sum ( pd_cluster_status{type="store_low_space_count"} ) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0) annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_cluster_low_space - alert: PD_etcd_network_peer_latency expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 for: 1m labels: env: ENV_LABELS_ENV level: warning expr: histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[1m])) by (To,instance,job,le) ) > 1 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_etcd_network_peer_latency - alert: PD_tidb_handle_requests_duration expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 for: 1m labels: env: ENV_LABELS_ENV level: warning expr: histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type="tso"}[1m])) by (instance,job,le) ) > 0.1 annotations: description: 'cluster: 
ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_tidb_handle_requests_duration - alert: PD_down_peer_region_nums expr: (sum(pd_regions_status{type="down_peer_region_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0) for: 1m labels: env: ENV_LABELS_ENV level: warning expr: (sum(pd_regions_status{type="down_peer_region_count"}) by (instance) > 0) and (sum(etcd_server_is_leader) by (instance) > 0) annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_down_peer_region_nums - alert: PD_incorrect_namespace_region_count expr: (sum(pd_regions_status{type="incorrect_namespace_region_count"}) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 1m labels: env: ENV_LABELS_ENV level: warning expr: (sum(pd_regions_status{type="incorrect_namespace_region_count"}) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0) annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_incorrect_namespace_region_count - alert: PD_pending_peer_region_count expr: (sum(pd_regions_status{type="pending_peer_region_count"}) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0) for: 1m labels: env: ENV_LABELS_ENV level: warning expr: (sum(pd_regions_status{type="pending_peer_region_count"}) by (instance) > 100) and (sum(etcd_server_is_leader) by (instance) > 0) annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_pending_peer_region_count - alert: PD_leader_change expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 for: 1m labels: env: ENV_LABELS_ENV level: warning expr: count( changes(pd_server_tso{type="save"}[10m]) > 0 ) >= 2 annotations: description: 'cluster: ENV_LABELS_ENV, instance: 
{{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: PD_leader_change

  # Fires when the summed storage_size is more than 80% of the summed
  # storage_capacity reported in pd_cluster_status.
  - alert: TiKV_space_used_more_than_80%
    expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: sum(pd_cluster_status{type="storage_size"}) / sum(pd_cluster_status{type="storage_capacity"}) * 100 > 80
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV_space_used_more_than_80%

  - alert: PD_system_time_slow
    expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: changes(pd_server_tso{type="system_time_slow"}[10m]) >= 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: PD_system_time_slow

  - alert: PD_no_store_for_making_replica
    expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: increase(pd_checker_event_count{type="replica_checker", name="no_target_store"}[1m]) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: PD_no_store_for_making_replica

================================================
FILE: roles/prometheus/files/tidb.rules.yml
================================================
# Prometheus alerting rules for TiDB servers.
#
# Conventions used by every rule in this file:
#   * `env: ENV_LABELS_ENV` is a placeholder; the prometheus role rewrites it
#     to the cluster name after copying the file.
#   * `labels.expr` repeats the rule expression as a plain label value.
#   * `level` is one of emergency / critical / warning.
groups:
- name: alert.rules
  rules:
  # ---- level: emergency ----
  - alert: TiDB_schema_error
    expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: increase(tidb_session_schema_lease_error_total{type="outdated"}[15m]) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB schema error

  # The summary previously read "TiDB tikvclient_backoff_count error", the same
  # text used by the tidb_tikvclient_backoff_seconds_count alert below. It now
  # names the metric this rule actually watches.
  - alert: TiDB_tikvclient_region_err_total
    expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: increase( tidb_tikvclient_region_err_total[10m] ) > 6000
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB tikvclient_region_err_total error

  - alert: TiDB_binlog_error_total
    expr: increase( tidb_server_critical_error_total[5m] ) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: increase( tidb_server_critical_error_total[5m] ) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB tidb binlog error total

  - alert: TiDB_domain_load_schema_total
    expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: increase( tidb_domain_load_schema_total{type="failed"}[10m] ) > 10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB domain_load_schema_total error

  # Fires when the keep-alive counter grew by fewer than 100 over 10 minutes.
  - alert: TiDB_monitor_keep_alive
    expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: increase(tidb_monitor_keep_alive_total{job="tidb"}[10m]) < 100
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB monitor_keep_alive error

  # ---- level: critical ----
  - alert: TiDB_server_panic_total
    expr: increase(tidb_server_panic_total[10m]) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: increase(tidb_server_panic_total[10m]) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB server panic total

  # ---- level: warning ----
  - alert: TiDB_memory_abnormal
    expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: go_memstats_heap_inuse_bytes{job="tidb"} > 1e+10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB heap memory usage is over 10 GB

  - alert: TiDB_query_duration
    expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) BY (le, instance)) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB query duration 99th percentile is above 1s

  - alert: TiDB_server_event_error
    expr: increase(tidb_server_event_total{type=~"server_start|server_hang"}[15m]) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: increase(tidb_server_event_total{type=~"server_start|server_hang"}[15m]) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB server event error

  - alert: tidb_tikvclient_backoff_seconds_count
    expr: increase( tidb_tikvclient_backoff_seconds_count[10m] ) > 10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: increase( tidb_tikvclient_backoff_seconds_count[10m] ) > 10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB tikvclient_backoff_count error

  - alert: TiDB_monitor_time_jump_back_error
    expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: increase(tidb_monitor_time_jump_back_total[10m]) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB monitor time_jump_back error

  # NOTE(review): sum() without a by-clause aggregates across all series, so
  # {{ $labels.instance }} in the description presumably renders empty - confirm.
  - alert: TiDB_ddl_waiting_jobs
    expr: sum(tidb_ddl_waiting_jobs) > 5
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: sum(tidb_ddl_waiting_jobs) > 5
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiDB ddl waiting_jobs too much

================================================
FILE: roles/prometheus/files/tiflash.rules.yml
================================================
# Prometheus alerting rules for TiFlash. Same label conventions as the other
# *.rules.yml files: ENV_LABELS_ENV is a placeholder rewritten at deploy time.
groups:
- name: alert.rules
  rules:
  - alert: TiFlash_tmt_merge_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_tmt_merge_duration_seconds_bucket[1m])) BY (le, instance)) > 600
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: histogram_quantile(0.99, sum(rate(tiflash_tmt_merge_duration_seconds_bucket[1m])) BY (le, instance)) > 600
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash tmt merge duration 99th percentile is above 600s

  - alert: TiFlash_tmt_write_parts_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_tmt_write_parts_duration_seconds_bucket[1m])) BY (le, instance)) > 8
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: histogram_quantile(0.99, sum(rate(tiflash_tmt_write_parts_duration_seconds_bucket[1m])) BY (le, instance)) > 8
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash tmt write parts duration 99th percentile is above 8s

  - alert: TiFlash_schema_error
    expr: increase(tiflash_schema_apply_count{type="failed"}[15m]) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: increase(tiflash_schema_apply_count{type="failed"}[15m]) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash schema error

  - alert: TiFlash_schema_apply_duration
    expr:
histogram_quantile(0.99, sum(rate(tiflash_schema_apply_duration_seconds_bucket[1m])) BY (le, instance)) > 20 for: 1m labels: env: ENV_LABELS_ENV level: emergency expr: histogram_quantile(0.99, sum(rate(tiflash_schema_apply_duration_seconds_bucket[1m])) BY (le, instance)) > 20 annotations: description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: TiFlash schema apply duration 99th percentile is above 20s

  # ---- level: critical ----
  - alert: TiFlash_raft_read_index_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_raft_read_index_duration_seconds_bucket[1m])) BY (le, instance)) > 3
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tiflash_raft_read_index_duration_seconds_bucket[1m])) BY (le, instance)) > 3
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash raft read index duration 99th percentile is above 3s

  - alert: TiFlash_raft_wait_index_duration
    expr: histogram_quantile(0.99, sum(rate(tiflash_raft_wait_index_duration_seconds_bucket[1m])) BY (le, instance)) > 2
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tiflash_raft_wait_index_duration_seconds_bucket[1m])) BY (le, instance)) > 2
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiFlash raft wait index duration 99th percentile is above 2s

================================================
FILE: roles/prometheus/files/tikv.accelerate.rules.yml
================================================
# Prometheus recording rules for TiKV. Each entry evaluates `expr` and stores
# the result under the series name given in `record`.
#
# Every selector that names an instance uses instance=~".*", i.e. it matches
# any instance value.
groups:
- name: tikv_accelerate
  rules:
  - record: tikv_grpc_msg_duration_seconds:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_grpc_msg_duration_seconds_bucket{instance=~".*", type!="kv_gc"}[1m])) by (le, type))
  - record: tikv_raftstore_event_duration_bucket:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_event_duration_bucket{instance=~".*"}[1m])) by (le, type))
  - record: tikv_thread_cpu_seconds:1m
    expr: sum(rate(tikv_thread_cpu_seconds_total{instance=~".*"}[1m])) by (instance)
  - record: tikv_raftstore_append_log_duration_seconds:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~".*"}[1m])) by (le, instance))
  - record: tikv_raftstore_raft_process_duration_secs:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{instance=~".*", type='ready'}[1m])) by (le, instance))
  - record: tikv_raftstore_request_wait_time_duration_secs:byins:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~".*"}[1m])) by (le, instance))
  - record: tikv_raftstore_append_log_duration_seconds:p95:1m
    expr: histogram_quantile(0.95, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~".*"}[1m])) by (le))
  - record: tikv_raftstore_apply_wait_time_duration_secs:byins:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~".*"}[1m])) by (le, instance))
  - record: tikv_raftstore_apply_log_duration_seconds:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{instance=~".*"}[1m])) by (le, instance))
  - record: tikv_raftstore_request_wait_time_duration_secs:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~".*"}[1m])) by (le))
  - record: tikv_raftstore_request_wait_time_duration_secs:p95:1m
    expr: histogram_quantile(0.95, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~".*"}[1m])) by (le))
  - record: tikv_worker_handled_task:1m
    expr: sum(rate(tikv_worker_handled_task_total{instance=~".*"}[1m])) by (name)
  - record: tikv_engine_num_files_at_level:kv:avg
    expr: avg(tikv_engine_num_files_at_level{instance=~".*", db="kv"}) by (cf, level)
  - record: tikv_engine_num_files_at_level:raft:avg
    expr: avg(tikv_engine_num_files_at_level{instance=~".*", db="raft"}) by (cf, level)
  - record: tikv_pd_request_duration_seconds:avg:1m
    expr: sum(rate(tikv_pd_request_duration_seconds_sum{instance=~".*"}[1m])) by (type) / sum(rate(tikv_pd_request_duration_seconds_count{instance=~".*"}[1m])) by (type)
  - record: tikv_coprocessor_request_wait_seconds:p95:1m
    expr: histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~".*"}[1m])) by (le, instance,req))
  # NOTE(review): the denominator selector below has no instance matcher while
  # the numerator does; both match every instance, so this looks cosmetic.
  - record: tikv_grpc_msg_duration_seconds:avg:1m
    expr: sum(rate(tikv_grpc_msg_duration_seconds_sum{instance=~".*"}[1m])) by (type) / sum(rate(tikv_grpc_msg_duration_seconds_count[1m])) by (type)
  - record: tikv_raftstore_apply_wait_time_duration_secs:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~".*"}[1m])) by (le))
  - record: tikv_raftstore_apply_wait_time_duration_secs:p95:1m
    expr: histogram_quantile(0.95, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~".*"}[1m])) by (le))
  - record: tikv_grpc_msg_duration_seconds:1m
    expr: sum(rate(tikv_grpc_msg_duration_seconds_count{instance=~".*", type!="kv_gc"}[1m])) by (instance,type)
  - record: tikv_raftstore_snapshot_duration_seconds:p99:1m
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_snapshot_duration_seconds_bucket{instance=~".*", type="apply"}[1m])) by (le))
  - record: tikv_worker_pending_task:1m
    expr: sum(rate(tikv_worker_pending_task_total{instance=~".*"}[1m])) by (name)
  - record: tikv_coprocessor_request_duration_seconds:1m
    expr: sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~".*"}[1m])) by (le)
  - record: tikv_futurepool_pending_task:1m
    expr: sum(rate(tikv_futurepool_pending_task_total{instance=~".*"}[1m])) by (name)
  - record: tikv_storage_engine_async_request:1m
    expr: sum(rate(tikv_storage_engine_async_request_total{instance=~".*", status!~"all|success"}[1m])) by (status)
  # NOTE(review): the series is named "nogrpc" but the matcher name=~"grpc.*"
  # SELECTS grpc threads rather than excluding them. Confirm against the
  # consumers of this series before changing either the name or the matcher.
  - record: tikv_thread_cpu_seconds_nogrpc:1m
    expr: sum(rate(tikv_thread_cpu_seconds_total{instance=~".*", name=~"grpc.*"}[1m])) by (instance)

================================================
FILE: roles/prometheus/files/tikv.rules.yml
================================================
# Prometheus alerting rules for TiKV. ENV_LABELS_ENV is a placeholder that the
# prometheus role rewrites to the cluster name after copying the file;
# `labels.expr` repeats the rule expression as a plain label value.
groups:
- name: alert.rules
  rules:
  # ---- level: emergency ----
  # Resident memory grew by more than 5 GiB compared with 5 minutes earlier,
  # and stayed that way for 5 minutes.
  - alert: TiKV_memory_used_too_fast
    expr: process_resident_memory_bytes{job=~"tikv",instance=~".*"} - (process_resident_memory_bytes{job=~"tikv",instance=~".*"} offset 5m) > 5*1024*1024*1024
    for: 5m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: process_resident_memory_bytes{job=~"tikv",instance=~".*"} - (process_resident_memory_bytes{job=~"tikv",instance=~".*"} offset 5m) > 5*1024*1024*1024
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV memory used too fast

  # Fewer than one "gc" task counted across the cluster in the last day.
  - alert: TiKV_GC_can_not_work
    expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: emergency
      expr: sum(increase(tikv_gcworker_gc_tasks_vec{task="gc"}[1d])) < 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV GC can not work

  # ---- level: critical ----
  # NOTE(review): the result is grouped BY (store_id) only, so
  # {{ $labels.instance }} in the description presumably renders empty - confirm.
  - alert: TiKV_server_report_failure_msg_total
    expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: sum(rate(tikv_server_report_failure_msg_total{type="unreachable"}[10m])) BY (store_id) > 10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV server_report_failure_msg_total error

  - alert: TiKV_channel_full_total
    expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: sum(rate(tikv_channel_full_total[10m])) BY (type, instance) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV,
instance: {{ $labels.instance }}, values:{{ $value }}' value: '{{ $value }}' summary: TiKV channel full

  # ---- level: critical (continued) ----
  - alert: TiKV_write_stall
    expr: delta( tikv_engine_write_stall[10m]) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: delta( tikv_engine_write_stall[10m]) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV write stall

  # NOTE(review): this description reads "instance {{ ... }}" without the colon
  # that the other rules use; it is emitted at runtime, so left untouched here.
  - alert: TiKV_raft_log_lag
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_log_lag_bucket[1m])) by (le, instance)) > 5000
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV raftstore log lag more than 5000

  - alert: TiKV_async_request_snapshot_duration_seconds
    expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="snapshot"}[1m])) by (le, instance, type)) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV async request snapshot duration seconds more than 1s

  - alert: TiKV_async_request_write_duration_seconds
    expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type="write"}[1m])) by (le, instance, type)) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV async request write duration seconds more than 1s

  # Uses the 99.99th percentile, unlike the 0.99 / 0.999 used by other rules.
  - alert: TiKV_coprocessor_request_wait_seconds
    expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le, instance, req)) > 10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV coprocessor request wait seconds more than 10s

  - alert: TiKV_raftstore_thread_cpu_seconds_total
    expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance) > 1.6
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"raftstore_.*"}[1m])) by (instance) > 1.6
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV raftstore thread CPU seconds is high

  - alert: TiKV_raft_append_log_duration_secs
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance)) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV_raft_append_log_duration_secs

  - alert: TiKV_raft_apply_log_duration_secs
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le, instance)) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV_raft_apply_log_duration_secs

  - alert: TiKV_scheduler_latch_wait_duration_seconds
    expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[1m])) by (le, instance, type)) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV scheduler latch wait duration seconds more than 1s

  - alert: TiKV_thread_apply_worker_cpu_seconds
    expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 1.8
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: sum(rate(tikv_thread_cpu_seconds_total{name="apply_worker"}[1m])) by (instance) > 1.8
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV thread apply worker cpu seconds is high

  # This rule reads a tidb_* metric even though it lives in the TiKV rule file.
  - alert: TiDB_tikvclient_gc_action_fail
    expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: critical
      expr: sum(increase(tidb_tikvclient_gc_action_result{type="fail"}[1m])) > 10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiDB_tikvclient_gc_action_fail

  # ---- level: warning ----
  - alert: TiKV_leader_drops
    expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: delta(tikv_pd_heartbeat_tick_total{type="leader"}[30s]) < -10
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV leader drops

  - alert: TiKV_raft_process_ready_duration_secs
    expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='ready'}[1m])) by (le, instance, type)) > 2
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV_raft_process_ready_duration_secs

  - alert: TiKV_raft_process_tick_duration_secs
    expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: histogram_quantile(0.999, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{type='tick'}[1m])) by (le, instance, type)) > 2
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV_raft_process_tick_duration_secs

  # NOTE(review): the metric is spelled `contex` (no trailing t); presumably
  # this matches the name TiKV exports - verify before "correcting" it.
  - alert: TiKV_scheduler_context_total
    expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: abs(delta( tikv_scheduler_contex_total[5m])) > 1000
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV scheduler context total

  # NOTE(review): `/ 1000` divides the per-bucket rates before they reach
  # histogram_quantile; presumably that does not rescale the resulting
  # quantile - confirm which unit the `> 1` threshold is meant to be in.
  - alert: TiKV_scheduler_command_duration_seconds
    expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, instance, type) / 1000) > 1
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values:{{ $value }}'
      value: '{{ $value }}'
      summary: TiKV scheduler command duration seconds more than 1s

  # Differs from the critical TiKV_coprocessor_request_wait_seconds rule above
  # only in the case of its first letters; this one watches the _count series.
  - alert: tikv_coprocessor_request_wait_seconds
    expr: delta( tikv_coprocessor_request_wait_seconds_count[10m] ) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: delta( tikv_coprocessor_request_wait_seconds_count[10m] ) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV coprocessor request wait seconds

  # Errors whose reason is anything other than "lock"; lock errors have their
  # own rule with a higher threshold directly below.
  - alert: TiKV_coprocessor_request_error
    expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: increase(tikv_coprocessor_request_error{reason!="lock"}[10m]) > 100
    annotations:
      description: 'cluster: ENV_LABELS_ENV, reason: {{ $labels.reason }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV coprocessor request error

  - alert: TiKV_coprocessor_request_lock_error
    expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: increase(tikv_coprocessor_request_error{reason="lock"}[10m]) > 10000
    annotations:
      description: 'cluster: ENV_LABELS_ENV, reason: {{ $labels.reason }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV coprocessor request lock error

  - alert: TiKV_coprocessor_pending_request
    expr: delta( tikv_coprocessor_pending_request[10m]) > 5000
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: delta( tikv_coprocessor_pending_request[10m]) > 5000
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV pending {{ $labels.type }} request is high

  # NOTE(review): despite the name, the expression is built from cop_* thread
  # CPU time, and the final `> 0` is true for any positive value - confirm the
  # intended metric and threshold.
  - alert: TiKV_batch_request_snapshot_nums
    expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: sum(rate(tikv_thread_cpu_seconds_total{name=~"cop_.*"}[1m])) by (instance) / ( count(tikv_thread_cpu_seconds_total{name=~"cop_.*"}) * 0.9 ) / count(count(tikv_thread_cpu_seconds_total) by (instance)) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV batch request snapshot nums is high

  - alert: TiKV_pending_task
    expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: sum(tikv_worker_pending_task_total) BY (instance,name) > 1000
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV pending task too much

  # Counts instances whose available/capacity ratio is below 0.2 AND that have
  # a positive "applying" snapshot traffic total.
  - alert: TiKV_low_space_and_add_region
    expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: count( (sum(tikv_store_size_bytes{type="available"}) by (instance) / sum(tikv_store_size_bytes{type="capacity"}) by (instance) < 0.2) and (sum(tikv_raftstore_snapshot_traffic_total{type="applying"}) by (instance) > 0 ) ) > 0
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }}, instance: {{ $labels.instance }}, values: {{ $value }}'
      value: '{{ $value }}'
      summary: TiKV low_space and add_region

  # 1073741824 bytes = 1 GiB, matching the "1GB" wording of the summary.
  - alert: TiKV_approximate_region_size
    expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824
    for: 1m
    labels:
      env: ENV_LABELS_ENV
      level: warning
      expr: histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket[1m])) by (le)) > 1073741824
    annotations:
      description: 'cluster: ENV_LABELS_ENV, type: {{ $labels.type }},
instance: {{ $labels.instance }}, values: {{ $value }}' value: '{{ $value }}' summary: TiKV approximate region size is more than 1GB

================================================
FILE: roles/prometheus/meta/main.yml
================================================
---

# The prometheus role depends on the common_dir role.
dependencies:
  - role: common_dir

================================================
FILE: roles/prometheus/tasks/binary_deployment.yml
================================================
---

# Binary deployment: ship the prometheus executable, render its run script,
# then hand over to the task file selected by `process_supervision`.

- name: deploy prometheus binary
  copy: src="{{ resources_dir }}/bin/prometheus" dest="{{ deploy_dir }}/bin/" mode=0755

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/prometheus/tasks/docker_deployment.yml
================================================
---

# Docker deployment: ship the image archive, render the run script, load the
# image, then hand over to the task file selected by `process_supervision`.

- name: deploy prometheus image
  copy: src="{{ downloads_dir }}/prometheus.tar" dest="{{ deploy_dir }}/images" mode=0755

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_docker.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run

# NOTE(review): the archive is copied to "{{ deploy_dir }}/images" above but
# loaded from "{{ images_dir }}"; presumably both resolve to the same
# directory - confirm in the variable definitions.
- name: load docker image from archive
  docker_image:
    state: present
    force: yes
    name: prom/prometheus
    tag: "{{ prometheus_tag }}"
    load_path: "{{ images_dir }}/prometheus.tar"

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/prometheus/tasks/main.yml
================================================
---

# Entry point of the prometheus role:
#   1. create log / data / status directories
#   2. render prometheus.yml and move any backup copy into backup_dir
#   3. copy the alert rule files and move any backup copies into backup_dir
#   4. replace the ENV_LABELS_ENV placeholder in the rule files
#   5. include the task file selected by `deployment_method`
#   6. prepend the prometheus port to firewalld_ports

- name: create deploy directories
  file: path={{ item }} state=directory mode=0755
  with_items:
    - "{{ prometheus_log_dir }}"
    - "{{ prometheus_data_dir }}"
    - "{{ deploy_dir }}/status/{{ role_name }}"

- name: create configuration file
  template: src=prometheus.yml.j2 dest={{ deploy_dir }}/conf/prometheus.yml mode=0644 backup=yes
  register: prometheus_conf_st

# backup=yes leaves the previous file next to the new one; move it away so the
# conf directory only holds live configuration.
- name: backup configuration file
  command: mv "{{ prometheus_conf_st.backup_file }}" "{{ backup_dir }}"
  when: prometheus_conf_st.changed and prometheus_conf_st.backup_file is defined

- name: copy alert rules file
  copy: src={{ item }} dest="{{ deploy_dir }}/conf/{{ item }}" mode=0644 backup=yes
  with_items:
    - node.rules.yml
    - bypass.rules.yml
    - pd.rules.yml
    - tidb.rules.yml
    - tikv.rules.yml
    - tikv.accelerate.rules.yml
    - binlog.rules.yml
    - blacker.rules.yml
    - kafka.rules.yml
    - lightning.rules.yml
    - tiflash.rules.yml
  register: alert_rules_st

- name: backup alert rules file
  command: mv "{{ item.backup_file }}" "{{ backup_dir }}"
  when:
    - item.changed
    - item.backup_file is defined
  with_items: "{{ alert_rules_st.results }}"

# Keep this list identical to the one in "copy alert rules file".
# lightning.rules.yml used to be missing here although it is copied above, so
# any ENV_LABELS_ENV placeholder in it was left unreplaced.
- name: set alert rules label changes
  replace: >
    dest={{ deploy_dir }}/conf/{{ item }}
    regexp="ENV_LABELS_ENV"
    replace="{{ cluster_name }}"
  with_items:
    - node.rules.yml
    - bypass.rules.yml
    - pd.rules.yml
    - tidb.rules.yml
    - tikv.rules.yml
    - tikv.accelerate.rules.yml
    - binlog.rules.yml
    - blacker.rules.yml
    - kafka.rules.yml
    - lightning.rules.yml
    - tiflash.rules.yml

- include_tasks: "{{ deployment_method }}_deployment.yml"

- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [prometheus_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/prometheus/tasks/supervise_deployment.yml
================================================
---

# Delegate process supervision to the `supervise` role; the service name is
# suffixed with the prometheus port.
- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: prometheus
    service_name: prometheus-{{ prometheus_port }}

================================================
FILE: roles/prometheus/tasks/systemd_deployment.yml
================================================
---

# Delegate process supervision to the `systemd` role; the service name is
# suffixed with the prometheus port.
- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: prometheus
    service_name: prometheus-{{ prometheus_port }}

================================================
FILE: roles/prometheus/templates/prometheus.yml.j2
================================================ --- global: scrape_interval: 15s # By default, scrape targets every 15 seconds. evaluation_interval: 15s # By default, evaluate rules every 15 seconds. # scrape_timeout is set to the global default (10s). external_labels: cluster: '{{ cluster_name }}' {% for item in prometheus_extra_labels %} {{ item.label }}: "{{ item.value }}" {% endfor %} # Load and evaluate rules in this file every 'evaluation_interval' seconds. rule_files: - 'node.rules.yml' - 'blacker.rules.yml' - 'bypass.rules.yml' - 'pd.rules.yml' - 'tidb.rules.yml' - 'tikv.rules.yml' - 'tikv.accelerate.rules.yml' - 'tiflash.rules.yml' {% if enable_binlog|default(false) %} - 'binlog.rules.yml' {% endif %} {% if kafka_addrs | default("") %} - 'kafka.rules.yml' {% endif %} {% if groups.lightning_server | default([]) | length == 1 %} - 'lightning.rules.yml' {% endif %} {% if alertmanager_target|default("") -%} alerting: alertmanagers: - static_configs: - targets: - '{{ alertmanager_target }}' {% elif groups.alertmanager_servers -%} {% if groups.monitoring_servers | length == groups.alertmanager_servers | length -%} {% set index = [] -%} {% for host in groups.monitoring_servers -%} {% if inventory_hostname == hostvars[host].inventory_hostname -%} {% set _ = index.append(loop.index0) -%} {% endif -%} {% endfor -%} {% set alertmanager_host = hostvars[groups.alertmanager_servers[index.0]].ansible_host | default(hostvars[groups.alertmanager_servers[index.0]].inventory_hostname) -%} {% set alertmanager_port = hostvars[groups.alertmanager_servers[index.0]].alertmanager_port -%} alerting: alertmanagers: - static_configs: - targets: - '{{ alertmanager_host }}:{{ alertmanager_port }}' {% endif -%} {% else -%} # alerting: # alertmanagers: # - static_configs: # - targets: # - 'alertmanager_host:9093' {% endif -%} {% set pushgateway_addrs = [] -%} {% for host in groups.monitoring_servers -%} {% set pushgateway_ip = hostvars[host].ansible_host |
default(hostvars[host].inventory_hostname) -%} {% set pushgateway_port = hostvars[host].pushgateway_port -%} {% set _ = pushgateway_addrs.append("%s:%s" % (pushgateway_ip, pushgateway_port)) -%} {% endfor -%} {% set grafana_addrs = [] -%} {% for host in groups.grafana_servers -%} {% set grafana_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set grafana_port = hostvars[host].grafana_port -%} {% set _ = grafana_addrs.append("%s:%s" % (grafana_ip, grafana_port)) -%} {% endfor -%} {% set blackbox_host = hostvars[groups.monitored_servers[0]].ansible_host | default(hostvars[groups.monitored_servers[0]].inventory_hostname) if groups.get('monitored_servers', []) else '' -%} {% set blackbox_port = hostvars[groups.monitored_servers[0]].blackbox_exporter_port if blackbox_host else '' -%} {% set kafka_exporter_host = hostvars[groups.kafka_exporter_servers[0]].ansible_host | default(hostvars[groups.kafka_exporter_servers[0]].inventory_hostname) if groups.get('kafka_exporter_servers', []) else '' -%} {% set kafka_exporter_port = hostvars[groups.kafka_exporter_servers[0]].kafka_exporter_port if kafka_exporter_host else '' -%} {% set node_exporter_addrs = [] -%} {% set blackbox_exporter_addrs = [] -%} {% set target_hosts = [] -%} {% for host in groups.monitored_servers -%} {% set host_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set node_exporter_port = hostvars[host].node_exporter_port -%} {% set blackbox_exporter_port = hostvars[host].blackbox_exporter_port -%} {% set _ = node_exporter_addrs.append("%s:%s" % (host_ip, node_exporter_port)) -%} {% set _ = blackbox_exporter_addrs.append("%s:%s" % (host_ip, blackbox_exporter_port)) -%} {% set _ = target_hosts.append(host_ip) -%} {% endfor -%} {% set tidb_addrs = [] -%} {% set tidb_status_addrs = [] -%} {% for host in groups.tidb_servers -%} {% set tidb_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set tidb_port = 
hostvars[host].tidb_port -%} {% set tidb_status_port = hostvars[host].tidb_status_port -%} {% set _ = tidb_addrs.append("%s:%s" % (tidb_ip, tidb_port)) -%} {% set _ = tidb_status_addrs.append("%s:%s" % (tidb_ip, tidb_status_port)) -%} {% endfor -%} {% set tikv_addrs = [] -%} {% set tikv_status_addrs = [] -%} {% for host in groups.tikv_servers -%} {% set tikv_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set tikv_port = hostvars[host].tikv_port -%} {% set tikv_status_port = hostvars[host].tikv_status_port -%} {% set _ = tikv_addrs.append("%s:%s" % (tikv_ip, tikv_port)) -%} {% set _ = tikv_status_addrs.append("%s:%s" % (tikv_ip, tikv_status_port)) -%} {% endfor -%} {% set tiflash_metrics_addrs = [] -%} {% set tiflash_http_addrs = [] -%} {% for host in groups.tiflash_servers -%} {% set tiflash_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set tiflash_metrics_port = hostvars[host].metrics_port -%} {% set tiflash_proxy_status_port = hostvars[host].flash_proxy_status_port -%} {% set tiflash_http_port = hostvars[host].http_port -%} {% set _ = tiflash_metrics_addrs.append("%s:%s" % (tiflash_ip, tiflash_metrics_port)) -%} {% set _ = tiflash_metrics_addrs.append("%s:%s" % (tiflash_ip, tiflash_proxy_status_port)) -%} {% set _ = tiflash_http_addrs.append("%s:%s" % (tiflash_ip, tiflash_http_port)) -%} {% endfor -%} {% set pd_addrs = [] -%} {% for host in groups.pd_servers -%} {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set pd_port = hostvars[host].pd_client_port -%} {% set _ = pd_addrs.append("%s:%s" % (pd_ip, pd_port)) -%} {% endfor -%} scrape_configs: {% if pushgateway_addrs %} - job_name: 'overwritten-cluster' scrape_interval: 15s honor_labels: true # don't overwrite job & instance labels static_configs: - targets: {% for pushgateway_addr in pushgateway_addrs %} - '{{ pushgateway_addr }}' {% endfor %} - job_name: "blackbox_exporter_http" 
scrape_interval: 30s metrics_path: /probe params: module: [http_2xx] static_configs: - targets: {% for pushgateway_addr in pushgateway_addrs %} - 'http://{{ pushgateway_addr }}/metrics' {% endfor %} relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: {{ blackbox_host }}:{{ blackbox_port }} {% endif %} {% if groups.lightning_server | default([]) | length == 1 %} - job_name: "lightning" static_configs: - targets: ['{{ hostvars[groups.lightning_server[0]].ansible_host | default(hostvars[groups.lightning_server[0]].inventory_hostname) }}:{{ hostvars[groups.lightning_server[0]].tidb_lightning_pprof_port }}'] {% endif %} - job_name: "overwritten-nodes" honor_labels: true # don't overwrite job & instance labels static_configs: - targets: {% for node_exporter_addr in node_exporter_addrs %} - '{{ node_exporter_addr }}' {% endfor %} - job_name: "tidb" honor_labels: true # don't overwrite job & instance labels static_configs: - targets: {% for tidb_status_addr in tidb_status_addrs %} - '{{ tidb_status_addr }}' {% endfor %} - job_name: "tikv" honor_labels: true # don't overwrite job & instance labels static_configs: - targets: {% for tikv_status_addr in tikv_status_addrs %} - '{{ tikv_status_addr }}' {% endfor %} {% if tiflash_metrics_addrs %} - job_name: 'tiflash' honor_labels: true # don't overwrite job & instance labels static_configs: - targets: {% for tiflash_metrics_addr in tiflash_metrics_addrs %} - '{{ tiflash_metrics_addr }}' {% endfor %} {% endif %} - job_name: "pd" honor_labels: true # don't overwrite job & instance labels static_configs: - targets: {% for pd_addr in pd_addrs %} - '{{ pd_addr }}' {% endfor %} {% if enable_binlog|default(false) %} {% set pump_addrs = [] -%} {% for host in groups.pump_servers -%} {% set pump_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set pump_port = 
hostvars[host].pump_port -%} {% set _ = pump_addrs.append("%s:%s" % (pump_ip, pump_port)) -%} {% endfor -%} {% set drainer_addrs = [] -%} {% for host in (groups.drainer_servers)|default([]) -%} {% set drainer_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set drainer_port = hostvars[host].drainer_port -%} {% set _ = drainer_addrs.append("%s:%s" % (drainer_ip, drainer_port)) -%} {% endfor %} {% if kafka_exporter_host %} - job_name: 'kafka_exporter' honor_labels: true # don't overwrite job & instance labels static_configs: - targets: - '{{ kafka_exporter_host }}:{{ kafka_exporter_port }}' {% endif %} - job_name: 'pump' honor_labels: true # don't overwrite job & instance labels static_configs: - targets: {% for pump_addr in pump_addrs %} - '{{ pump_addr }}' {% endfor %} - job_name: 'drainer' honor_labels: true # don't overwrite job & instance labels static_configs: - targets: {% for drainer_addr in drainer_addrs %} - '{{ drainer_addr }}' {% endfor %} - job_name: "port_probe" scrape_interval: 30s metrics_path: /probe params: module: [tcp_connect] static_configs: {% if kafka_addrs | default("") %} - targets: {% for kafka_addr in (kafka_addrs | default("")).split(',') | unique %} - '{{ kafka_addr }}' {% endfor %} labels: group: 'kafka' {% endif %} {% if zookeeper_addrs | default("") %} - targets: {% for zoo_addr in (zookeeper_addrs | default("")).split(',') | unique %} - '{{ zoo_addr }}' {% endfor %} labels: group: 'zookeeper' {% endif %} - targets: {% for pump_addr in pump_addrs %} - '{{ pump_addr }}' {% endfor %} labels: group: 'pump' - targets: {% for drainer_addr in drainer_addrs %} - '{{ drainer_addr }}' {% endfor %} labels: group: 'drainer' {% if kafka_exporter_host %} - targets: - '{{ kafka_exporter_host }}:{{ kafka_exporter_port }}' labels: group: 'kafka_exporter' {% endif %} relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: 
__address__ replacement: {{ blackbox_host }}:{{ blackbox_port }} {% endif %} - job_name: "tidb_port_probe" scrape_interval: 30s metrics_path: /probe params: module: [tcp_connect] static_configs: - targets: {% for tidb_addr in tidb_addrs %} - '{{ tidb_addr }}' {% endfor %} labels: group: 'tidb' - targets: {% for tikv_addr in tikv_addrs %} - '{{ tikv_addr }}' {% endfor %} labels: group: 'tikv' - targets: {% for tiflash_http_addr in tiflash_http_addrs %} - '{{ tiflash_http_addr }}' {% endfor %} labels: group: 'tiflash' - targets: {% for pd_addr in pd_addrs %} - '{{ pd_addr }}' {% endfor %} labels: group: 'pd' - targets: {% for pushgateway_addr in pushgateway_addrs %} - '{{ pushgateway_addr }}' {% endfor %} labels: group: 'pushgateway' - targets: {% for grafana_addr in grafana_addrs %} - '{{ grafana_addr }}' {% endfor %} labels: group: 'grafana' - targets: {% for node_exporter_addr in node_exporter_addrs %} - '{{ node_exporter_addr }}' {% endfor %} labels: group: 'node_exporter' - targets: {% for blackbox_exporter_addr in blackbox_exporter_addrs %} - '{{ blackbox_exporter_addr }}' {% endfor %} labels: group: 'blackbox_exporter' relabel_configs: - source_labels: [__address__] target_label: __param_target - source_labels: [__param_target] target_label: instance - target_label: __address__ replacement: {{ blackbox_host }}:{{ blackbox_port }} {% for blackbox_exporter_addr in blackbox_exporter_addrs %} {% set blackbox_exporter_ip = blackbox_exporter_addr.split(':')[0] %} - job_name: "blackbox_exporter_{{ blackbox_exporter_ip }}_icmp" scrape_interval: 6s metrics_path: /probe params: module: [icmp] static_configs: - targets: {% for target_host in target_hosts %} - '{{ target_host }}' {% endfor %} relabel_configs: - source_labels: [__address__] regex: (.*)(:80)? 
target_label: __param_target replacement: ${1} - source_labels: [__param_target] regex: (.*) target_label: ping replacement: ${1} - source_labels: [] regex: .* target_label: __address__ replacement: {{ blackbox_exporter_addr }} {% endfor %} ================================================ FILE: roles/prometheus/templates/run_prometheus_binary.sh.j2 ================================================ #!/bin/bash set -e ulimit -n 1000000 DEPLOY_DIR={{ deploy_dir }} cd "${DEPLOY_DIR}" || exit 1 {% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%} # WARNING: This file was auto-generated. Do not edit! # All your edit might be overwritten! exec > >(tee -i -a "{{ prometheus_log_dir }}/{{ prometheus_log_filename }}") exec 2>&1 exec bin/prometheus \ --config.file="{{ deploy_dir }}/conf/prometheus.yml" \ --web.listen-address=":{{ prometheus_port }}" \ --web.external-url="http://{{ my_ip }}:{{ prometheus_port }}/" \ --web.enable-admin-api \ --log.level="{{ prometheus_log_level }}" \ --storage.tsdb.path="{{ prometheus_data_dir }}" \ --storage.tsdb.retention="{{ prometheus_storage_retention }}" ================================================ FILE: roles/prometheus/templates/run_prometheus_docker.sh.j2 ================================================ #!/bin/bash set -e ulimit -n 1000000 # WARNING: This file was auto-generated. Do not edit! # All your edit might be overwritten! 
DEPLOY_DIR={{ deploy_dir }} cd "${DEPLOY_DIR}" || exit 1 exec docker run -p {{ prometheus_port }}:9090 \ -v /etc/localtime:/etc/localtime:ro \ -v "{{ prometheus_data_dir }}:/prometheus" \ -v "{{ deploy_dir }}/conf:/etc/prometheus" \ -u `id -u {{ deploy_user }}` \ --name="prometheus-{{ prometheus_port }}" \ prom/prometheus:{{ prometheus_tag }} \ --config.file=/etc/prometheus/prometheus.yml \ --web.enable-admin-api \ --storage.tsdb.path=/prometheus \ --storage.tsdb.retention="{{ prometheus_storage_retention }}" ================================================ FILE: roles/pump/defaults/main.yml ================================================ --- pump_port: 8250 pump_log_dir: "{{ deploy_dir }}/log" pump_log_filename: "pump.log" pump_stderr_filename: "pump_stderr.log" pump_data_dir: "{{ deploy_dir }}/data.pump" # docker settings pump_docker_log_dir: "{{ pump_log_dir }}/pump" pd_scheme: http # systemd: Specifies when to restart the service. restart: on-failure ================================================ FILE: roles/pump/files/make-ssl.sh ================================================ #!/bin/bash # Author: Smana smainklh@gmail.com # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. set -o errexit set -o pipefail usage() { cat << EOF Create self signed certificates Usage : $(basename $0) [-d ] -h | --help : Show this message -d | --ssldir : Directory where the certificates will be located Environmental variables HOSTS and CN should be set to generate keys for each host. 
EOF } # Options parsing while (($#)); do case "$1" in -h | --help) usage; exit 0;; -d | --ssldir) SSLDIR="${2}"; shift 2;; *) usage echo "ERROR : Unknown option" exit 3 ;; esac done if [ -z ${SSLDIR} ]; then echo "ERROR: the directory where the certificates will be located is missing. option -d" exit 1 fi tmpdir=$(mktemp -d /tmp/tidb_cacert.XXXXXX) trap 'rm -rf "${tmpdir}"' EXIT cd "${tmpdir}" mkdir -p "${SSLDIR}" if [ -e "$SSLDIR/ca-config.json" ]; then # Reuse existing CA cp $SSLDIR/{ca-config.json,ca-csr.json} . else echo "ERROR: ca-config.json and ca-csr.json is missing in $SSLDIR." exit 1 fi # Root CA if [ -e "$SSLDIR/ca-key.pem" ]; then # Reuse existing CA cp $SSLDIR/{ca.pem,ca-key.pem} . else cfssl gencert -initca ca-csr.json | cfssljson -bare ca - > /dev/null 2>&1 fi # client cert if [ ! -e "$SSLDIR/client-key.pem" ]; then echo '{"CN":"client","hosts":[""],"key":{"algo":"rsa","size":2048}}' | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=client -hostname="" - | cfssljson -bare client > /dev/null 2>&1 fi gen_key_and_cert() { local host=$1 local cn=$2 local name=$3 echo "{\"CN\":\"${cn}\",\"hosts\":[\"\"],\"key\":{\"algo\":\"rsa\",\"size\":2048}}" | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=server -hostname="${host},127.0.0.1" - | cfssljson -bare ${name} > /dev/null 2>&1 } # Nodes if [ -n "$HOSTS" ]; then for host in $HOSTS; do gen_key_and_cert "${host}" "${CN}" "${CN}-${host}" done fi # Install certs mv *.pem ${SSLDIR}/ ================================================ FILE: roles/pump/meta/main.yml ================================================ --- dependencies: - role: common_dir ================================================ FILE: roles/pump/tasks/binary_deployment.yml ================================================ --- - name: deploy pump binary copy: src="{{ resources_dir }}/bin/pump" dest="{{ deploy_dir }}/bin/" mode=0755 - name: create run script template: src: "{{ item 
}}_pump_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_pump.sh"
    mode: "0755"
    backup: true
  with_items:
    - run
  vars:
    role_status_dir: status/pump

# Continue with supervise_deployment.yml or systemd_deployment.yml.
- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/pump/tasks/check_certs.yml
================================================
---
# Decide whether certificates have to be generated on the control machine
# (gen_certs / gen_node_certs) and whether this node needs them copied
# (sync_certs).

- name: "Check_certs | check if the certs have already been generated on control machine"
  find:
    paths: "{{ cert_dir }}"
    patterns: "*.pem"
    get_checksum: true
  register: cert_control_node
  delegate_to: localhost
  run_once: true

- debug:
    var: cert_control_node

- name: "Check_certs | Set default value for 'sync_certs', 'gen_certs' to false"
  set_fact:
    sync_certs: false
    gen_certs: false

# Address used in certificate file names: ansible_host, else the inventory name.
- set_fact:
    pump_host: "{{ hostvars[inventory_hostname].ansible_host | default(inventory_hostname) }}"

# Result order matters below: results[0] is ca.pem, results[1] is the key.
- name: "Check certs | check if a cert already exists on node"
  stat:
    path: "{{ pump_cert_dir }}/{{ item }}"
  with_items:
    - ca.pem
    - pump-server-{{ pump_host }}-key.pem
    - pump-server-{{ pump_host }}.pem
  register: cert_pump_node

- debug:
    var: cert_pump_node

# The loop list (ca.pem plus one key file per pump host) is built as a
# string by the template; any item missing on the control machine sets
# gen_certs.
- name: "Check_certs | Set 'gen_certs' to true"
  set_fact:
    gen_certs: true
  with_items: >-
    ['{{cert_dir}}/ca.pem',
    {% set all_pump_hosts = groups['pump_servers']|unique|sort %}
    {% for host in all_pump_hosts %}
    {% set pump_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
    '{{cert_dir}}/pump-server-{{ pump_ip }}-key.pem'
    {% if not loop.last %}{{','}}{% endif %}
    {% endfor %}]
  when: not item in cert_control_node.files|map(attribute='path') | list
  delegate_to: localhost
  run_once: true

- debug:
    var: gen_certs

# Maps every pump host to True when its key file is missing on the control
# machine, False otherwise.
- name: "Check_certs | Set 'gen_node_certs' to true"
  set_fact:
    gen_node_certs: |-
      {
      {% set all_pump_hosts = groups['pump_servers']|unique|sort -%}
      {% set existing_certs = cert_control_node.files|map(attribute='path')|list|sort %}
      {% for host in all_pump_hosts -%}
      {% set pump_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
      {% set host_cert = "%s/pump-server-%s-key.pem"|format(cert_dir, pump_ip) %}
      {% if host_cert in existing_certs -%}
      "{{ host }}": False,
      {% else -%}
      "{{ host }}": True,
      {% endif -%}
      {% endfor %}
      }
  run_once: true

- debug:
    var: gen_node_certs

- name: "Check_certs | Set pump_cert_key"
  set_fact:
    pump_cert_key_path: "{{ cert_dir }}/pump-server-{{ hostvars[inventory_hostname].pump_host }}-key.pem"

- debug:
    var: pump_cert_key_path

# Sync when the node's cert is (re)generated, when ca.pem or the key is
# missing on the node, or when the node's key checksum differs from the
# control machine's copy.
- name: "Check_certs | Set 'sync_certs' to true"
  set_fact:
    sync_certs: true
  when: >-
    gen_node_certs[inventory_hostname] or
    (not cert_pump_node.results[0].stat.exists|default(False)) or
    (not cert_pump_node.results[1].stat.exists|default(False)) or
    (cert_pump_node.results[1].stat.checksum|default('') != cert_control_node.files|selectattr("path","equalto",pump_cert_key_path)|map(attribute="checksum")|first|default(''))

- debug:
    var: sync_certs

================================================
FILE: roles/pump/tasks/docker_deployment.yml
================================================
---

- name: create log directory
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ pump_docker_log_dir }}"

- name: deploy tidb-binlog image
  copy:
    src: "{{ downloads_dir }}/tidb-binlog.tar"
    dest: "{{ deploy_dir }}/images"
    mode: "0755"

- name: create run script
  template:
    src: "{{ item }}_pump_docker.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_pump.sh"
    mode: "0755"
    backup: true
  with_items:
    - run

- name: load docker image from archive
  docker_image:
    name: pingcap/tidb-binlog
    tag: "{{ tidb_version }}"
    load_path: "{{ images_dir }}/tidb-binlog.tar"
    state: present
    force: true

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/pump/tasks/gen_certs.yml
================================================
---

# Runs once on the control machine, and only when check_certs.yml found
# certificates missing.
- name: Gen_certs | copy certs generation script
  copy:
    src: "make-ssl.sh"
    dest: "{{ script_dir }}/make-ssl.sh"
    mode: "0700"
  when: gen_certs|default(false)
  delegate_to: localhost
  run_once: true

- name:
Gen_certs | run cert generation script
  command: "{{ script_dir }}/make-ssl.sh -d {{ cert_dir }}"
  environment:
    # Only hosts flagged in gen_node_certs (default: all) get a new certificate.
    - HOSTS: "{% for h in groups['pump_servers'] %} {% if gen_node_certs[h]|default(true) %} {{ hostvars[h].ansible_host | default(hostvars[h].inventory_hostname) }} {% endif %} {% endfor %}"
    - PATH: "{{ ansible_env.PATH }}:{{ binary_dir }}"
    - CN: "pump-server"
  run_once: true
  delegate_to: localhost
  when: gen_certs|default(false)

================================================
FILE: roles/pump/tasks/install_certs.yml
================================================
---

# Task name typo fixed ("exits" -> "exists"); modes are quoted so they do not
# depend on the YAML parser's octal handling.
- name: "Deploy_certs | Make sure the certificate directory exists"
  file:
    path: "{{ pump_cert_dir }}"
    state: directory
    mode: "0700"

# Copies the CA certificate and this node's key/certificate pair; skipped
# unless check_certs.yml set sync_certs.
- name: "Deploy_certs | Deploy certificates"
  copy:
    src: "{{ cert_dir }}/{{ item }}"
    dest: "{{ pump_cert_dir }}/{{ item }}"
    mode: "0600"
    backup: true
  with_items:
    - ca.pem
    - pump-server-{{ pump_host }}-key.pem
    - pump-server-{{ pump_host }}.pem
  when: sync_certs|default(false)

================================================
FILE: roles/pump/tasks/main.yml
================================================
---
# tasks file for pump

- name: create deploy directories
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ pump_data_dir }}"
    - "{{ pump_log_dir }}"
    - "{{ status_dir }}"

# TLS handling: detect, generate on the control machine, then install.
- include_tasks: check_certs.yml
  when: enable_tls|default(false)

- include_tasks: gen_certs.yml
  when: enable_tls|default(false)

- include_tasks: install_certs.yml
  when: enable_tls|default(false)

- name: load customized config
  include_vars:
    file: "{{ playbook_dir }}/conf/pump.yml"
    name: pump_conf_custom

- name: load default config
  include_vars:
    file: default.yml
    name: pump_conf_default

# Each value renders to the certificate path when enable_tls is set and to
# an empty string otherwise.
- name: generate dynamic config
  set_fact:
    pump_conf_generated:
      security:
        ssl-ca: >-
          {%- if enable_tls|default(false) -%}{{ pump_cert_dir }}/ca.pem{%- else -%}{%- endif -%}
        ssl-cert: >-
          {%- if enable_tls|default(false) -%}{{ pump_cert_dir }}/pump-server-{{ pump_host }}.pem{%- else -%}{%- endif -%}
        ssl-key: >-
          {%- if enable_tls|default(false) -%}{{ pump_cert_dir }}/pump-server-{{ pump_host }}-key.pem{%- else -%}{%- endif -%}

# Combines the customized, generated and default settings through the
# project's with_default_dicts / update_default_dicts filters.
- name: generate final config
  set_fact:
    pump_conf: "{{ pump_conf_custom | with_default_dicts(pump_conf_generated, pump_conf_default) | update_default_dicts }}"

- debug:
    var: pump_conf

- name: create configuration file
  template:
    src: pump.toml.j2
    dest: "{{ deploy_dir }}/conf/pump.toml"
    mode: "0644"
    backup: true
  register: pump_conf_st

- name: backup conf file
  command: mv "{{ pump_conf_st.backup_file }}" "{{ backup_dir }}"
  when: pump_conf_st.changed and pump_conf_st.backup_file is defined

- include_tasks: "{{ deployment_method }}_deployment.yml"

- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [pump_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/pump/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: pump
    service_name: pump-{{ pump_port }}

================================================
FILE: roles/pump/tasks/systemd_deployment.yml
================================================
---

- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: pump
    service_name: pump-{{ pump_port }}
    disable_send_sigkill: true

================================================
FILE: roles/pump/templates/pump.toml.j2
================================================
# pump Configuration

{% for item, value in pump_conf.global | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[security]
{% for item, value in pump_conf.security | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[storage]
{% for item, value in pump_conf.storage | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

================================================
FILE: roles/pump/templates/run_pump_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000
DEPLOY_DIR={{ deploy_dir }}
cd "${DEPLOY_DIR}" || exit 1

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

{# Advertised address of this host: ansible_host, else inventory_hostname. -#}
{% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%}
{# PD URLs use pd_scheme, overridden to https when enable_tls is set. #}
{% if enable_tls|default(false) %}
{% set pd_scheme = 'https' -%}
{% endif %}
{% set all_pd = [] -%}
{% set pd_hosts = groups.pd_servers %}
{% for host in pd_hosts -%}
{% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
{% set pd_port = hostvars[host].pd_client_port -%}
{% set _ = all_pd.append("%s://%s:%s" % (pd_scheme, pd_ip, pd_port)) -%}
{% endfor -%}

exec bin/pump \
    --addr="0.0.0.0:{{ pump_port }}" \
    --advertise-addr="{{ my_ip }}:{{ pump_port }}" \
    --pd-urls="{{ all_pd | join(',') }}" \
    --data-dir="{{ pump_data_dir }}" \
    --log-file="{{ pump_log_dir }}/{{ pump_log_filename }}" \
    --config=conf/pump.toml 2>> "{{ pump_log_dir }}/{{ pump_stderr_filename }}"

================================================
FILE: roles/pump/templates/run_pump_docker.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

DEPLOY_DIR={{ deploy_dir }}
cd "${DEPLOY_DIR}" || exit 1

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

{# NOTE(review): PD URLs are always built with http:// here, whereas
   run_pump_binary.sh.j2 switches to https when enable_tls is set, and no
   --config is passed to pump. Confirm whether TLS is intentionally
   unsupported for docker deployments. -#}
{% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%}
{% set all_pd = [] -%}
{% set pd_hosts = groups.pd_servers %}
{% for host in pd_hosts -%}
{% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
{% set pd_port = hostvars[host].pd_client_port -%}
{% set _ = all_pd.append("http://%s:%s" % (pd_ip, pd_port)) -%}
{% endfor -%}

exec docker run \
    --net=host \
    -v /etc/localtime:/etc/localtime:ro \
    -v "{{ pump_docker_log_dir }}:/var/log" \
    -v "{{ pump_data_dir }}:/data" \
    -v "{{ status_dir }}:/status" \
    -u `id -u {{ deploy_user }}` \
    --ulimit nofile=1000000:1000000 \
    --hostname "pump" \
    --name "pump" \
    pingcap/tidb-binlog:{{ tidb_version }} \
    /pump \
    --addr="0.0.0.0:{{ pump_port }}" \
    --advertise-addr="{{ my_ip }}:{{ pump_port }}" \
    --pd-urls="{{ all_pd | join(',') }}" \
    --data-dir=/data \
    --log-file="/var/log/{{ pump_log_filename }}" 2>> "{{ pump_log_dir }}/{{ pump_stderr_filename }}"

================================================
FILE: roles/pump/vars/default.yml
================================================
---
# default configuration file for pump in yaml format

global:
  # An integer that controls the expiry of binlog data: how long (in days)
  # the binlog data is stored. Must be greater than 0.
  gc: 7

  # Number of seconds between heartbeat ticks (default: 2 seconds).
  heartbeat-interval: 2

security:
  # Path of file that contains list of trusted SSL CAs for connection with cluster components.
  ssl-ca: ""

  # Path of file that contains X509 certificate in PEM format for connection with cluster components.
  ssl-cert: ""

  # Path of file that contains X509 key in PEM format for connection with cluster components.
  ssl-key: ""

storage:
  # Set to true (by default) to guarantee reliability by ensuring binlog data is flushed to the disk.
# sync-log: true

  # Stop writing when the available disk space is less than the configured size.
  # 42 MB -> 42000000, 42 mib -> 44040192
  # default: 10 gib
  # stop-write-at-available-space: "10 gib"

================================================
FILE: roles/pushgateway/defaults/main.yml
================================================
---
# default configuration for pushgateway

pushgateway_log_level: info
pushgateway_log_dir: "{{ deploy_dir }}/log"
pushgateway_log_filename: "pushgateway.log"

pushgateway_tag: v0.4.0

================================================
FILE: roles/pushgateway/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/pushgateway/tasks/binary_deployment.yml
================================================
---
# Binary deployment: log directory, executable, run script, then the
# selected process supervision.

- name: create deploy directories
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ pushgateway_log_dir }}"

- name: deploy pushgateway binary
  copy:
    src: "{{ resources_dir }}/bin/pushgateway"
    dest: "{{ deploy_dir }}/bin/"
    mode: "0755"

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: true
  vars:
    role_status_dir: status/{{ role_name }}
  with_items:
    - run

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/pushgateway/tasks/docker_deployment.yml
================================================
---
# Docker deployment: image archive, run script, image load, then the
# selected process supervision.

- name: deploy pushgateway image
  copy:
    src: "{{ downloads_dir }}/pushgateway.tar"
    dest: "{{ deploy_dir }}/images"
    mode: "0755"

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_docker.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: true
  with_items:
    - run

- name: load docker image from archive
  docker_image:
    name: prom/pushgateway
    tag: "{{ pushgateway_tag }}"
    load_path: "{{ images_dir }}/pushgateway.tar"
    state: present
    force: true

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/pushgateway/tasks/main.yml
================================================
---

- include_tasks: "{{ deployment_method }}_deployment.yml"

- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [pushgateway_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/pushgateway/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: pushgateway
    service_name: pushgateway-{{ pushgateway_port }}

================================================
FILE: roles/pushgateway/tasks/systemd_deployment.yml
================================================
---

- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: pushgateway
    service_name: pushgateway-{{ pushgateway_port }}

================================================
FILE: roles/pushgateway/templates/run_pushgateway_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

DEPLOY_DIR={{ deploy_dir }}
cd "${DEPLOY_DIR}" || exit 1

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

exec > >(tee -i -a "{{ pushgateway_log_dir }}/{{ pushgateway_log_filename }}")
exec 2>&1

exec bin/pushgateway \
    --log.level="{{ pushgateway_log_level }}" \
    --web.listen-address=":{{ pushgateway_port }}"

================================================
FILE: roles/pushgateway/templates/run_pushgateway_docker.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!
DEPLOY_DIR={{ deploy_dir }}
cd "${DEPLOY_DIR}" || exit 1

exec docker run -p {{ pushgateway_port }}:9091 \
    -v /etc/localtime:/etc/localtime:ro \
    -u `id -u {{ deploy_user }}` \
    --name="pushgateway-{{ pushgateway_port }}" \
    prom/pushgateway:{{ pushgateway_tag }}

================================================
FILE: roles/supervise/tasks/main.yml
================================================
---
# tasks file for supervise

- name: create supervise status directory
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ deploy_dir }}/status/{{ this_role_name }}"

- name: deploy supervise binary
  copy:
    src: "{{ resources_dir }}/bin/{{ item }}"
    dest: "{{ deploy_dir }}/bin/"
    mode: "0755"
  with_items:
    - supervise
    - svc
    - svstat

# Renders start_<role>.sh and stop_<role>.sh; role_status_dir is relative
# to the deploy directory the scripts cd into.
- name: create startup script - common start/stop
  template:
    src: "{{ item }}_role.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ this_role_name }}.sh"
    mode: "0755"
  with_items:
    - start
    - stop
  vars:
    role_status_dir: status/{{ this_role_name }}

================================================
FILE: roles/supervise/templates/start_role.sh.j2
================================================
#!/bin/bash
set -e

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

# Starts the service under supervise. Exits 0 when the service is already
# running from this deploy directory, 1 when run as root or when the deploy
# directory is missing.

DEPLOY_USER="{{ deploy_user }}"
STATUS_DIR="{{ role_status_dir }}"
DEPLOY_DIR="{{ deploy_dir }}"

# Braces instead of a subshell, so that `exit` terminates this script itself
# rather than relying on errexit to propagate the subshell status.
cd "${DEPLOY_DIR}" || { echo "error: deploy dir not exists!"; exit 1; }

# running as root?
if [ "$(id -u)" -eq 0 ]; then
  echo "warning: run as root is dangerous! try switch to user: ${DEPLOY_USER}!"
  echo "error: did nothing!"
  exit 1
fi

# try up
bin/svc -u "${STATUS_DIR}" &>/dev/null || true

# check by svstat
_check=$(bin/svstat "${STATUS_DIR}" 2>/dev/null)
if echo "${_check}" | grep 'up pid' &>/dev/null; then
  _pid=$(echo "${_check}" | cut -d' ' -f 4 | cut -d',' -f 1)
  if [ -e "/proc/${_pid}/cwd" ]; then
    # The working directory may be the deploy dir or a status directory
    # below it; strip a trailing "/status..." part before comparing.
    _cwd=$(readlink "/proc/${_pid}/cwd" | sed -r 's/^(.*)\/status.*$/\1/')
    if [ "${_cwd}" == "${DEPLOY_DIR}" ]; then
      echo "ok: already started!"
      exit 0
    fi
  fi
fi

# check by pid: kill a leftover process recorded in the pid file, but only
# if it runs from this deploy directory
if [ -e "${STATUS_DIR}/pid" ]; then
  _pid=$(cat "${STATUS_DIR}/pid")
  if [ -e "/proc/${_pid}/cwd" ] && [ "$(readlink "/proc/${_pid}/cwd")" == "${DEPLOY_DIR}" ]; then
    kill -9 "${_pid}"
  fi
fi

{% if deployment_method == 'docker' %}
{{ docker_bin_dir }}/docker rm -f {{ service_name }} &>/dev/null || true
{% endif %}

nohup bin/supervise "${STATUS_DIR}" "${DEPLOY_DIR}/scripts/run_{{ this_role_name | default(role_name) }}.sh" &>/dev/null &
echo $! > "${STATUS_DIR}/pid"
echo "ok: started!"

================================================
FILE: roles/supervise/templates/stop_role.sh.j2
================================================
#!/bin/bash
set -e

# Stops the service: asks supervise to bring it down, terminates the
# supervise process recorded in the pid file, then terminates the service
# process if svstat still reports it as up.

DEPLOY_USER="{{ deploy_user }}"
STATUS_DIR="{{ role_status_dir }}"
DEPLOY_DIR="{{ deploy_dir }}"

cd "${DEPLOY_DIR}" || exit 1

# try down
bin/svc -d "${STATUS_DIR}" &>/dev/null || true

# check by supervise pid
# A missing pid file or a stale pid is skipped quietly; all expansions are
# quoted so an executable path containing spaces cannot break the test.
if [ -r "${STATUS_DIR}/pid" ]; then
  _supervise_pid=$(cat "${STATUS_DIR}/pid")
  _supervise_exe=$(readlink "/proc/${_supervise_pid}/exe" 2>/dev/null || true)
  if [ -n "${_supervise_exe}" ] && [ "$(basename "${_supervise_exe}")" == "supervise" ]; then
    kill "${_supervise_pid}"
  fi
fi

# check by svstat
_check=$(bin/svstat "${STATUS_DIR}" 2>/dev/null)
if echo "${_check}" | grep 'up pid' &>/dev/null; then
  _pid=$(echo "${_check}" | cut -d' ' -f 4 | cut -d',' -f 1)
  if [ "$(readlink "/proc/${_pid}/cwd")" == "${DEPLOY_DIR}" ]; then
    kill "${_pid}"
  fi
fi

echo -n "sync ... "; sync; echo "done!"
echo "ok: stopped!"
================================================
FILE: roles/systemd/tasks/main.yml
================================================
---
# systemd configuration generation
#
# Renders the unit file for `service_name` from the template matching
# `deployment_method`, installs the start/stop wrapper scripts for
# `this_role_name`, then reloads the systemd manager configuration.

- name: create systemd service configuration
  become: true
  template:
    src: "systemd_{{ deployment_method }}.service.j2"
    dest: "/etc/systemd/system/{{ service_name }}.service"
    mode: "0644"

- name: create startup script - common start/stop
  become: true
  template:
    src: "{{ item }}_role.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ this_role_name }}.sh"
    mode: "0755"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_user }}"
  with_items:
    - start
    - stop

# No shell features are needed, so run systemctl directly.
- name: reload systemd
  become: true
  command: systemctl daemon-reload

================================================
FILE: roles/systemd/templates/start_role.sh.j2
================================================
#!/bin/bash
set -e

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

# Starts the systemd unit generated for this role.
sudo systemctl start {{ service_name }}.service

================================================
FILE: roles/systemd/templates/stop_role.sh.j2
================================================
#!/bin/bash
set -e

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

# Stops the systemd unit generated for this role.
sudo systemctl stop {{ service_name }}.service

================================================
FILE: roles/systemd/templates/systemd_binary.service.j2
================================================
# Unit for a binary deployment: runs scripts/run_<role>.sh as the deploy user.
# Resource limits are emitted only when the matching variable is set.
[Unit]
Description={{ service_name }} service
After=syslog.target network.target remote-fs.target nss-lookup.target

[Service]
{% if MemoryLimit|default("") %}
MemoryLimit={{ MemoryLimit }}
{% endif %}
{% if CPUQuota|default("") %}
CPUQuota={{ CPUQuota }}
{% endif %}
{% if IOReadBandwidthMax|default("") %}
IOReadBandwidthMax={{ IOReadBandwidthMax }}
{% endif %}
{% if IOWriteBandwidthMax|default("") %}
IOWriteBandwidthMax={{ IOWriteBandwidthMax }}
{% endif %}
LimitNOFILE=1000000
#LimitCORE=infinity
LimitSTACK=10485760

User={{ deploy_user }}
ExecStart={{ deploy_dir }}/scripts/run_{{ this_role_name }}.sh
{% if restart|default(false) %}
Restart={{ restart }}
{% else %}
Restart=always
{% endif %}
RestartSec=15s
{% if disable_send_sigkill|default(false) %}
SendSIGKILL=no
{% endif %}

[Install]
WantedBy=multi-user.target

================================================
FILE: roles/systemd/templates/systemd_docker.service.j2
================================================
# Unit wrapping a docker deployment: removes any stale container before
# start and stops the container on ExecStop.
[Unit]
Description={{ service_name }} service docker wrapper
Wants=docker.socket
After=docker.service

[Service]
LimitNOFILE=1000000
#LimitCORE=infinity
LimitSTACK=10485760
User={{ deploy_user }}
ExecStart={{ deploy_dir }}/scripts/run_{{ this_role_name }}.sh
ExecStartPre=-{{ docker_bin_dir }}/docker rm -f {{ service_name }}
ExecStop={{ docker_bin_dir }}/docker stop {{ service_name }}
# Fall back to "always" when `restart` is undefined or empty, matching the
# binary unit template, instead of failing to render.
Restart={{ restart|default('always', true) }}
RestartSec=15s

[Install]
WantedBy=multi-user.target

================================================
FILE: roles/tidb/defaults/main.yml
================================================
---

tidb_port: 4000
tidb_status_port: 10080

tidb_log_dir: "{{ deploy_dir }}/log"
tidb_log_filename: "tidb.log"
tidb_slow_query_filename: "tidb_slow_query.log"
tidb_stderr_filename: "tidb_stderr.log"
tidb_conf_dir: "{{
deploy_dir }}/conf"

# docker settings
tidb_docker_log_dir: "{{ tidb_log_dir }}/tidb"

================================================
FILE: roles/tidb/files/make-ssl.sh
================================================
#!/bin/bash

# Author: Smana smainklh@gmail.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generates a CA (unless ca-key.pem already exists in the ssl dir), a client
# certificate (unless client-key.pem exists) and one server certificate per
# entry in $HOSTS, then moves all *.pem files into the ssl dir.
# Requires cfssl and cfssljson on PATH, and ca-config.json / ca-csr.json
# already present in the ssl dir.

set -o errexit
set -o pipefail

usage()
{
    cat << EOF
Create self signed certificates

Usage : $(basename $0) [-d <ssldir>]
      -h | --help         : Show this message
      -d | --ssldir       : Directory where the certificates will be located

               Environmental variables HOSTS and CN should be set to generate keys
               for each host.
EOF
}

# Options parsing
while (($#)); do
    case "$1" in
        -h | --help)   usage;   exit 0;;
        -d | --ssldir)
            # Without this guard a trailing "-d" makes `shift 2` fail and
            # errexit ends the script with no message at all.
            if [ $# -lt 2 ]; then
                usage
                echo "ERROR : option $1 requires an argument"
                exit 3
            fi
            SSLDIR="${2}"; shift 2;;
        *)
            usage
            echo "ERROR : Unknown option"
            exit 3
        ;;
    esac
done

if [ -z "${SSLDIR:-}" ]; then
    echo "ERROR: the directory where the certificates will be located is missing. option -d"
    exit 1
fi

# Resolve a relative path now: the script changes into a temp dir below, and
# a relative SSLDIR would otherwise be looked up inside that temp dir.
case "${SSLDIR}" in
    /*) ;;
    *) SSLDIR="${PWD}/${SSLDIR}" ;;
esac

tmpdir=$(mktemp -d /tmp/tidb_cacert.XXXXXX)
trap 'rm -rf "${tmpdir}"' EXIT
cd "${tmpdir}"

mkdir -p "${SSLDIR}"

# Both CA input files are copied, so both must exist.
if [ -e "${SSLDIR}/ca-config.json" ] && [ -e "${SSLDIR}/ca-csr.json" ]; then
    # Reuse existing CA
    cp "${SSLDIR}/ca-config.json" "${SSLDIR}/ca-csr.json" .
else
    echo "ERROR: ca-config.json and ca-csr.json is missing in $SSLDIR."
    exit 1
fi

# Root CA
if [ -e "${SSLDIR}/ca-key.pem" ]; then
    # Reuse existing CA
    cp "${SSLDIR}/ca.pem" "${SSLDIR}/ca-key.pem" .
else
    cfssl gencert -initca ca-csr.json | cfssljson -bare ca - > /dev/null 2>&1
fi

# client cert
if [ ! -e "${SSLDIR}/client-key.pem" ]; then
    echo '{"CN":"client","hosts":[""],"key":{"algo":"rsa","size":2048}}' | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=client -hostname="" - | cfssljson -bare client > /dev/null 2>&1
fi

# gen_key_and_cert HOST CN NAME
# Issues a server certificate for HOST (plus 127.0.0.1) with common name CN
# and writes NAME.pem / NAME-key.pem into the current directory.
gen_key_and_cert() {
    local host=$1
    local cn=$2
    local name=$3
    echo "{\"CN\":\"${cn}\",\"hosts\":[\"\"],\"key\":{\"algo\":\"rsa\",\"size\":2048}}" | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=server -hostname="${host},127.0.0.1" - | cfssljson -bare "${name}" > /dev/null 2>&1
}

# Nodes
# HOSTS is a whitespace-separated list, so it is intentionally left unquoted.
if [ -n "$HOSTS" ]; then
    for host in $HOSTS; do
        gen_key_and_cert "${host}" "${CN}" "${CN}-${host}"
    done
fi

# Install certs
mv -- *.pem "${SSLDIR}/"

================================================
FILE: roles/tidb/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/tidb/tasks/binary_deployment.yml
================================================
---
# Deploys the tidb-server binary and its run script, moving any backup
# copies created by Ansible into `backup_dir`, then hands over to the
# process-supervision specific task file.

- name: deploy binary
  copy:
    src: "{{ resources_dir }}/bin/tidb-server"
    dest: "{{ deploy_dir }}/bin/"
    mode: "0755"
    backup: yes
  register: tidb_binary

- name: backup binary file
  command: mv "{{ tidb_binary.backup_file }}" "{{ backup_dir }}"
  when: tidb_binary.changed and tidb_binary.backup_file is defined

# `which` fails when numactl is not installed; the failure is tolerated and
# recorded in numactl_info.failed for the following task and the run script.
- name: check numactl
  shell: which numactl
  ignore_errors: yes
  register: numactl_info

- name: check numa node
  shell: numactl --hardware | awk 'NR==1{print $2}'
  register: numa_node_count
  when: not numactl_info.failed

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}
  register: tidb_script

# tidb_script is registered from a loop, so per-item data (changed,
# backup_file) lives under tidb_script.results, not at the top level.
- name: backup script file
  command: mv "{{ item.backup_file }}" "{{ backup_dir }}"
  when:
    - item.changed
    - item.backup_file is defined
  with_items: "{{ tidb_script.results }}"

- include_tasks: "{{ process_supervision
}}_deployment.yml" ================================================ FILE: roles/tidb/tasks/check_certs.yml ================================================ --- - name: "Check_certs | check if the certs have already been generated on control machine" find: paths: "{{ cert_dir }}" patterns: "*.pem" get_checksum: true delegate_to: localhost register: cert_control_node run_once: true - debug: var: cert_control_node - name: "Check_certs | Set default value for 'sync_certs', 'gen_certs' to false" set_fact: sync_certs: false gen_certs: false - set_fact: tidb_host: "{{ hostvars[inventory_hostname].ansible_host | default(inventory_hostname) }}" - name: "Check certs | check if a cert already exists on node" stat: path: "{{ tidb_cert_dir }}/{{ item }}" register: cert_tidb_node with_items: - ca.pem - tidb-server-{{ tidb_host }}-key.pem - tidb-server-{{ tidb_host }}.pem - debug: var: cert_tidb_node - name: "Check_certs | Set 'gen_certs' to true" set_fact: gen_certs: true when: not item in cert_control_node.files|map(attribute='path') | list delegate_to: localhost run_once: true with_items: >- ['{{cert_dir}}/ca.pem', {% set all_tidb_hosts = groups['tidb_servers']|unique|sort %} {% for host in all_tidb_hosts %} {% set tidb_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} '{{cert_dir}}/tidb-server-{{ tidb_ip }}-key.pem' {% if not loop.last %}{{','}}{% endif %} {% endfor %}] - debug: var: gen_certs - name: "Check_certs | Set 'gen_node_certs' to true" set_fact: gen_node_certs: |- { {% set all_tidb_hosts = groups['tidb_servers']|unique|sort -%} {% set existing_certs = cert_control_node.files|map(attribute='path')|list|sort %} {% for host in all_tidb_hosts -%} {% set tidb_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set host_cert = "%s/tidb-server-%s-key.pem"|format(cert_dir, tidb_ip) %} {% if host_cert in existing_certs -%} "{{ host }}": False, {% else -%} "{{ host }}": True, {% endif -%} {% endfor %} } 
run_once: true - debug: var: gen_node_certs - name: "Check_certs | Set tidb_cert_key" set_fact: tidb_cert_key_path: "{{ cert_dir }}/tidb-server-{{ hostvars[inventory_hostname].tidb_host }}-key.pem" - debug: var: tidb_cert_key_path - name: "Check_certs | Set 'sync_certs' to true" set_fact: sync_certs: true when: gen_node_certs[inventory_hostname] or (not cert_tidb_node.results[0].stat.exists|default(False)) or (not cert_tidb_node.results[1].stat.exists|default(False)) or (cert_tidb_node.results[1].stat.checksum|default('') != cert_control_node.files|selectattr("path","equalto",tidb_cert_key_path)|map(attribute="checksum")|first|default('')) - debug: var: sync_certs ================================================ FILE: roles/tidb/tasks/docker_deployment.yml ================================================ --- - name: create log directory file: path="{{ item }}" state=directory mode=0755 with_items: - "{{ tidb_docker_log_dir }}" - name: deploy tidb image copy: src="{{ downloads_dir }}/tidb.tar" dest="{{ deploy_dir }}/images" mode=0755 - name: create run script template: src: "{{ item }}_{{ role_name }}_docker.sh.j2" dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh" mode: "0755" backup: yes with_items: - run - name: load docker image from archive docker_image: state: present force: yes name: pingcap/tidb tag: "{{ tidb_version }}" load_path: "{{ images_dir }}/tidb.tar" - include_tasks: "{{ process_supervision }}_deployment.yml" ================================================ FILE: roles/tidb/tasks/gen_certs.yml ================================================ --- - name: Gen_certs | copy certs generation script copy: src: "make-ssl.sh" dest: "{{ script_dir }}/make-ssl.sh" mode: 0700 run_once: yes delegate_to: localhost when: gen_certs|default(false) - name: Gen_certs | run cert generation script command: "{{ script_dir }}/make-ssl.sh -d {{ cert_dir }}" environment: - HOSTS: "{% for h in groups['tidb_servers'] %} {% if gen_node_certs[h]|default(true) %} {{ 
hostvars[h].ansible_host | default(hostvars[h].inventory_hostname) }} {% endif %} {% endfor %}" - PATH: "{{ ansible_env.PATH }}:{{ binary_dir }}" - CN: "tidb-server" run_once: yes delegate_to: localhost when: gen_certs|default(false) ================================================ FILE: roles/tidb/tasks/install_certs.yml ================================================ --- - name: "Deploy_certs | Make sure the certificate directory exits" file: path: "{{ tidb_cert_dir }}" state: directory mode: 0700 - name: "Deploy_certs | Deploy certificates" copy: src: "{{ cert_dir }}/{{ item }}" dest: "{{ tidb_cert_dir }}/{{ item }}" mode: 0600 backup: yes with_items: - ca.pem - tidb-server-{{ tidb_host }}-key.pem - tidb-server-{{ tidb_host }}.pem when: sync_certs|default(false) ================================================ FILE: roles/tidb/tasks/main.yml ================================================ --- # tasks file for tidb - name: create deploy directories file: path="{{ item }}" state=directory mode=0755 with_items: - "{{ tidb_log_dir }}" - "{{ tidb_conf_dir }}" - include_tasks: check_certs.yml when: enable_tls|default(false) - include_tasks: gen_certs.yml when: enable_tls|default(false) - include_tasks: install_certs.yml when: enable_tls|default(false) # config part - name: "load customized config: tidb-ansible/conf/tidb.yml" include_vars: file={{ playbook_dir }}/conf/tidb.yml name=tidb_conf_custom - name: load default config include_vars: file=default.yml name=tidb_conf_default - name: generate dynamic config set_fact: tidb_conf_generated: security: cluster-ssl-ca: >- {%- if enable_tls|default(false) -%}{{ tidb_cert_dir }}/ca.pem{%- else -%}{%- endif -%} cluster-ssl-cert: >- {%- if enable_tls|default(false) -%}{{ tidb_cert_dir }}/tidb-server-{{ tidb_host }}.pem{%- else -%}{%- endif -%} cluster-ssl-key: >- {%- if enable_tls|default(false) -%}{{ tidb_cert_dir }}/tidb-server-{{ tidb_host }}-key.pem{%- else -%}{%- endif -%} - name: combine final config set_fact: 
tidb_conf: "{{ tidb_conf_generated | with_default_dicts(tidb_conf_custom, tidb_conf_default) | update_default_dicts }}" - debug: var=tidb_conf - name: create config file template: src=tidb.toml.j2 dest={{ deploy_dir }}/conf/tidb.toml mode=0644 backup=yes register: tidb_conf_st - name: backup conf file command: mv "{{ tidb_conf_st.backup_file }}" "{{ backup_dir }}" when: tidb_conf_st.changed and tidb_conf_st.backup_file is defined - include_tasks: "{{ deployment_method }}_deployment.yml" - name: prepare firewalld white list set_fact: firewalld_ports: "{{ [tidb_port ~ '/tcp', tidb_status_port ~ '/tcp'] + firewalld_ports }}" ================================================ FILE: roles/tidb/tasks/supervise_deployment.yml ================================================ --- - name: deploy supervise include_role: name: supervise vars: this_role_name: tidb service_name: tidb-{{ tidb_port }} ================================================ FILE: roles/tidb/tasks/systemd_deployment.yml ================================================ --- - name: deploy systemd include_role: name: systemd vars: this_role_name: tidb service_name: tidb-{{ tidb_port }} ================================================ FILE: roles/tidb/templates/run_tidb_binary.sh.j2 ================================================ #!/bin/bash set -e ulimit -n 1000000 # WARNING: This file was auto-generated. Do not edit! # All your edit might be overwritten! 
DEPLOY_DIR={{ deploy_dir }} cd "${DEPLOY_DIR}" || exit 1 {% set my_name = hostvars[inventory_hostname].inventory_hostname -%} {% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%} {% set all_pd = [] -%} {% set pd_hosts = groups.pd_servers %} {% for host in pd_hosts -%} {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set pd_port = hostvars[host].pd_client_port -%} {% set _ = all_pd.append("%s:%s" % (pd_ip, pd_port)) -%} {% endfor -%} {% set all_node_on_instance = [] -%} {% for node in groups.tidb_servers -%} {% set node_ip = hostvars[node].ansible_host | default(hostvars[node].inventory_hostname) -%} {% if node_ip == my_ip -%} {% set _ = all_node_on_instance.append(hostvars[node].inventory_hostname) -%} {% endif -%} {% endfor -%} {% set numaNodeId = [] -%} {% if numactl_info.failed -%} {% set set_numa_bind = false -%} {% elif all_node_on_instance | length == numa_node_count.stdout | int -%} {% set set_numa_bind = true -%} {% for inventory_name in all_node_on_instance -%} {% if inventory_name == my_name -%} {% set _ = numaNodeId.append(loop.index0) -%} {% endif -%} {% endfor -%} {% endif %} export TZ={{ timezone }} {% if set_numa_bind|default(false) -%} exec numactl --cpunodebind={{ numaNodeId.0 }} --membind={{ numaNodeId.0 }} bin/tidb-server \ {% else -%} exec bin/tidb-server \ {% endif %} -P {{ tidb_port }} \ --status="{{ tidb_status_port }}" \ --advertise-address="{{ my_ip }}" \ --path="{{ all_pd | join(',') }}" \ --config=conf/tidb.toml \ {% if enable_binlog|default(false) %} --enable-binlog \ {% endif %} {% if tidb_affinity_cpu|default(false) %} --affinity-cpus="{{tidb_affinity_cpu | join(',')}}" \ {% endif %} --log-slow-query="{{ tidb_log_dir }}/{{ tidb_slow_query_filename }}" \ --log-file="{{ tidb_log_dir }}/{{ tidb_log_filename }}" 2>> "{{ tidb_log_dir }}/{{ tidb_stderr_filename }}" ================================================ FILE: 
roles/tidb/templates/run_tidb_docker.sh.j2 ================================================ #!/bin/bash set -e ulimit -n 1000000 # WARNING: This file was auto-generated. Do not edit! # All your edit might be overwritten! DEPLOY_DIR={{ deploy_dir }} cd "${DEPLOY_DIR}" || exit 1 {% set all_pd = [] -%} {% set pd_hosts = groups.pd_servers %} {% for host in pd_hosts -%} {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set pd_port = hostvars[host].pd_client_port -%} {% set _ = all_pd.append("%s:%s" % (pd_ip, pd_port)) -%} {% endfor -%} {% set my_id = groups.tidb_servers.index(inventory_hostname) + 1 -%} exec docker run \ -p {{ tidb_port }}:4000 \ -p {{ tidb_status_port }}:10080 \ -v /etc/localtime:/etc/localtime:ro \ -v "{{ tidb_conf_dir }}/tidb.toml:/etc/tidb.toml:ro" \ -v "{{ tidb_docker_log_dir }}:/var/log" \ {% if enable_binlog|default(false) %} -v "{{ status_dir }}:/status" \ {% endif %} -u `id -u {{ deploy_user }}` \ --ulimit nofile=1000000:1000000 \ --hostname "tidb-{{ my_id }}" \ --name "tidb-{{ tidb_port }}" \ pingcap/tidb:{{ tidb_version }} \ --log-file="/var/log/{{ tidb_log_filename }}" \ --path="{{ all_pd | join(',') }}" \ {% if enable_binlog|default(false) %} --enable-binlog \ {% endif %} --config=/etc/tidb.toml ================================================ FILE: roles/tidb/templates/tidb.toml.j2 ================================================ # TiDB Configuration. 
{% for item, value in tidb_conf.global | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [log] {% for item, value in tidb_conf.log | dictsort_by_value_type -%} {% if value is not mapping -%} {{ item }} = {{ value | to_json}} {% else %} [log.{{ item }}] {% for sub_item, sub_value in value | dictsort -%} {{ sub_item }} = {{ sub_value | to_json }} {% endfor %} {% endif %} {% endfor %} [security] {% for item, value in tidb_conf.security | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [status] {% for item, value in tidb_conf.status | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [performance] {% for item, value in tidb_conf.performance | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [proxy-protocol] {% for item, value in tidb_conf.proxy_protocol | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [prepared-plan-cache] {% for item, value in tidb_conf.prepared_plan_cache | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [opentracing] {% for item, value in tidb_conf.opentracing | dictsort_by_value_type -%} {% if value is not mapping -%} {{ item }} = {{ value | to_json }} {% else %} [opentracing.{{ item }}] {% for sub_item, sub_value in value | dictsort -%} {{ sub_item }} = {{ sub_value | to_json }} {% endfor %} {% endif %} {% endfor %} [tikv-client] {% for item, value in tidb_conf.tikv_client | dictsort_by_value_type -%} {% if value is not mapping -%} {{ item }} = {{ value | to_json }} {% else %} {% if item == 'copr_cache' %} [tikv-client.copr-cache] {% else %} [tikv-client.{{ item }}] {% endif %} {% for sub_item, sub_value in value | dictsort -%} {{ sub_item }} = {{ sub_value | to_json }} {% endfor %} {% endif %} {% endfor %} [txn-local-latches] {% for item, value in tidb_conf.txn_local_latches | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [binlog] {% for item, value in tidb_conf.binlog | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [pessimistic-txn] {% for 
item, value in tidb_conf.pessimistic_txn | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} [experimental] {% for item, value in tidb_conf.experimental | dictsort -%} {{ item }} = {{ value | to_json }} {% endfor %} ================================================ FILE: roles/tidb/vars/default.yml ================================================ --- # default configuration file for TiDB in yaml format global: # TiDB Configuration. # TiDB server host. host: "0.0.0.0" # TiDB server port. # port: 4000 # Registered store name, [tikv, mocktikv] store: "tikv" # TiDB storage path. # path: "/tmp/tidb" # The socket file to use for connection. socket: "" # Schema lease duration, very dangerous to change only if you know what you do. lease: "45s" # The limit of concurrent executed sessions. token-limit: 1000 # Only print a log when out of memory quota. # Valid options: ["log", "cancel"] oom-action: "cancel" # Set the memory quota for a query in bytes. Default: 32GB # mem-quota-query: 34359738368 # Make "kill query" behavior compatible with MySQL. It's not recommended to # turn on this option when TiDB server is behind a proxy. compatible-kill-query: false # check mb4 value in utf8 is used to control whether to check the mb4 characters when the charset is utf8. # check-mb4-value-in-utf8: true # max-index-length is used to deal with compatibility issues from v3.0.7 and previous version upgrades. It can only be in [3072, 3072*4]. max-index-length: 3072 # alter-primary-key is used to control alter primary key feature. Default is false, indicating that the alter primary key feature is disabled. # If it is true, we can add the primary key by "alter table". However, if a table already exists before the switch is turned true and # the data type of its primary key column is an integer, the primary key cannot be dropped. alter-primary-key: false # server-version is used to change the version string of TiDB in the following scenarios: # 1.
the server version returned by builtin-function `VERSION()`. # 2. the server version filled in handshake packets of MySQL Connection Protocol, see https://dev.mysql.com/doc/internals/en/connection-phase-packets.html#packet-Protocol::Handshake for more details. # if server-version = "", the default value(original TiDB version string) is used. server-version: "" # Whether new collations are enabled, as indicated by its name, this configuration entry take effect ONLY when a TiDB cluster bootstraps for the first time. new_collations_enabled_on_first_bootstrap: false # When enabled, usage data (for example, instance versions) will be reported to PingCAP periodically for user experience analytics. # If this config is set to `false` on all TiDB servers, telemetry will be always disabled regardless of the value of the global variable `tidb_enable_telemetry`. # See PingCAP privacy policy for details: https://pingcap.com/en/privacy-policy/ enable-telemetry: true log: # Log level: debug, info, warn, error, fatal. level: "info" # Queries with execution time greater than this value will be logged. (Milliseconds) slow-threshold: 300 # Queries with internal result greater than this value will be logged. expensive-threshold: 10000 status: # TiDB status host. status-host: "0.0.0.0" # TiDB status port. status-port: 10080 # Prometheus pushgateway address, leaves it empty will disable prometheus push. metrics-addr: "" # Prometheus client push interval in second, set \"0\" to disable prometheus push. metrics-interval: 15 performance: # Max CPUs to use, 0 use number of CPUs in the machine. max-procs: 0 # Max memory size to use, 0 use the total usable memory in the machine. # max-memory: 0 # StmtCountLimit limits the max count of statement inside a transaction. stmt-count-limit: 5000 # Stats lease duration, which influences the time of analyze and stats load. 
stats-lease: "3s" proxy_protocol: prepared_plan_cache: enabled: false capacity: 100 memory-guard-ratio: 0.1 opentracing: # Enable opentracing. enable: false # Whether to enable the rpc metrics. rpc-metrics: false sampler: # Type specifies the type of the sampler: const, probabilistic, rateLimiting, or remote type: "const" # Param is a value passed to the sampler. # Valid values for Param field are: # - for "const" sampler, 0 or 1 for always false/true respectively # - for "probabilistic" sampler, a probability between 0 and 1 # - for "rateLimiting" sampler, the number of spans per second # - for "remote" sampler, param is the same as for "probabilistic" # and indicates the initial sampling rate before the actual one # is received from the mothership param: 1.0 # SamplingServerURL is the address of jaeger-agent's HTTP sampling server sampling-server-url: "" # MaxOperations is the maximum number of operations that the sampler # will keep track of. If an operation is not tracked, a default probabilistic # sampler will be used rather than the per operation specific sampler. max-operations: 0 # SamplingRefreshInterval controls how often the remotely controlled sampler will poll # jaeger-agent for the appropriate sampling strategy. sampling-refresh-interval: 0 reporter: # QueueSize controls how many spans the reporter can keep in memory before it starts dropping # new spans. The queue is continuously drained by a background go-routine, as fast as spans # can be sent out of process. queue-size: 0 # BufferFlushInterval controls how often the buffer is force-flushed, even if it's not full. # It is generally not useful, as it only matters for very low traffic services. buffer-flush-interval: 0 # LogSpans, when true, enables LoggingReporter that runs in parallel with the main reporter # and logs all submitted spans. Main Configuration.Logger must be initialized in the code # for this option to have any effect. 
log-spans: false # LocalAgentHostPort instructs reporter to send spans to jaeger-agent at this address local-agent-host-port: "" tikv_client: # Max gRPC connections that will be established with each tikv-server. grpc-connection-count: 4 # After a duration of this time in seconds if the client doesn't see any activity it pings # the server to see if the transport is still alive. grpc-keepalive-time: 10 # After having pinged for keepalive check, the client waits for a duration of Timeout in seconds # and if no activity is seen even after that the connection is closed. grpc-keepalive-timeout: 3 # max time for commit command, must be twice bigger than raft election timeout. commit-timeout: "41s" # Max batch size in gRPC. # max-batch-size: 128 # Overload threshold of TiKV. # overload-threshold: 200 # Max batch wait time in nanosecond to avoid waiting too long. 0 means disable this feature. # max-batch-wait-time: 0 # Batch wait size, to avoid waiting too long. # batch-wait-size: 8 txn_local_latches: binlog: # Socket file to write binlog. # binlog-socket: "" # WriteTimeout specifies how long it will wait for writing binlog to pump. write-timeout: "15s" # If IgnoreError is true, when writing binlog meets error, TiDB would stop writing binlog, # but still provide service. ignore-error: false pessimistic_txn: # enable pessimistic transaction. enable: true # max retry count for a statement in a pessimistic transaction. max-retry-count: 256 experimental: # enable column attribute `auto_random` to be defined on the primary key column. allow-auto-random: false # enable creating expression index.
allow-expression-index: false ================================================ FILE: roles/tidb_lightning/defaults/main.yml ================================================ --- dummy: # Tidb cluster information to import data tidb_host: "{{ hostvars[groups.tidb_servers[0]].ansible_host | default(hostvars[groups.tidb_servers[0]].inventory_hostname) }}" tidb_port: "{{ hostvars[groups.tidb_servers[0]].tidb_port }}" tidb_user: "root" tidb_password: "" tidb_status_port: "{{ hostvars[groups.tidb_servers[0]].tidb_status_port }}" lightning_log_dir: "{{ deploy_dir }}/log" lightning_conf_dir: "{{ deploy_dir }}/conf" lightning_log_file: "tidb_lightning.log" ================================================ FILE: roles/tidb_lightning/meta/main.yml ================================================ --- dependencies: - role: common_dir ================================================ FILE: roles/tidb_lightning/tasks/binary_deployment.yml ================================================ --- - name: deploy tidb-lightning binary copy: src="{{ resources_dir }}/bin/tidb-lightning" dest="{{ deploy_dir }}/bin/" mode=0755 backup=yes register: tidb_lightning - name: deploy tidb-lightning-ctl binary copy: src="{{ resources_dir }}/bin/tidb-lightning-ctl" dest="{{ deploy_dir }}/bin/" mode=0755 backup=yes register: tidb_lightning_ctl - name: backup tidb-lightning binary file command: mv "{{ tidb_lightning.backup_file }}" "{{ backup_dir }}" when: tidb_lightning.changed and tidb_lightning.backup_file is defined - name: backup tidb-lightning-ctl binary file command: mv "{{ tidb_lightning_ctl.backup_file }}" "{{ backup_dir }}" when: tidb_lightning_ctl.changed and tidb_lightning_ctl.backup_file is defined - name: create run script template: src: "{{ item }}_binary.sh.j2" dest: "{{ deploy_dir }}/scripts/{{ item }}.sh" mode: "0755" backup: yes with_items: - start_lightning - stop_lightning - tidb_lightning_ctl register: lightning_script - name: backup script file command: mv "{{ item.backup_file }}" 
"{{ backup_dir }}" when: - item.changed - item.backup_file is defined with_items: "{{ lightning_script.results }}" ================================================ FILE: roles/tidb_lightning/tasks/main.yml ================================================ --- # tasks file for tidb-lightning - name: create deploy directories file: path="{{ item }}" state=directory mode=0755 with_items: - "{{ lightning_log_dir }}" - "{{ lightning_conf_dir }}" - name: "load lightning customized config: tidb-ansible/conf/tidb-lightning.yml" include_vars: file={{ playbook_dir }}/conf/tidb-lightning.yml name=tidb_lightning_conf_custom - name: load tidb-lightning default config include_vars: file=tidb-lightning.yml name=tidb_lightning_conf_default - name: generate tidb-lightning dynamic config set_fact: tidb_lightning_conf_generated: lightning: pprof-port: "{{ tidb_lightning_pprof_port }}" file: "{{ lightning_log_dir }}/{{ lightning_log_file }}" mydumper: data-source-dir: "{{ data_source_dir }}" tidb: host: "{{ tidb_host }}" port: "{{ tidb_port }}" user: "{{ tidb_user }}" password: "{{ tidb_password }}" status-port: "{{ tidb_status_port }}" - name: generate tidb-lightning final config set_fact: tidb_lightning_conf: "{{ tidb_lightning_conf_custom | with_default_dicts(tidb_lightning_conf_generated, tidb_lightning_conf_default) | update_default_dicts }}" - debug: var=tidb_lightning_conf - name: create tidb-lightning configuration file template: src=tidb-lightning.toml.j2 dest={{ deploy_dir }}/conf/tidb-lightning.toml mode=0644 backup=yes register: tidb_lightning_conf_st - name: backup tidb-lightning conf file command: mv "{{ tidb_lightning_conf_st.backup_file }}" "{{ backup_dir }}" when: tidb_lightning_conf_st.changed and tidb_lightning_conf_st.backup_file is defined - include_tasks: "binary_deployment.yml" ================================================ FILE: roles/tidb_lightning/templates/start_lightning_binary.sh.j2 ================================================ #!/bin/bash set -e 
ulimit -n 1000000

cd "{{ deploy_dir }}" || exit 1

mkdir -p status

export RUST_BACKTRACE=1

export TZ={{ timezone }}

echo -n 'sync ... '
stat=$(time sync)
echo ok
echo $stat

# Start lightning in the background and record its PID for the stop script.
nohup ./bin/tidb-lightning -config ./conf/tidb-lightning.toml &> log/tidb_lightning_stderr.log &

echo $! > "status/tidb-lightning.pid"

================================================
FILE: roles/tidb_lightning/templates/stop_lightning_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

cd "{{ deploy_dir }}" || exit 1

export RUST_BACKTRACE=1

export TZ={{ timezone }}

echo -n 'sync ... '
stat=$(time sync)
echo ok
echo $stat

# Stop the process recorded by start_lightning.sh, but only when that PID is
# still alive and still belongs to tidb-lightning. The PID is looked up
# directly instead of counting `ps aux | grep` matches: a substring match on
# the PID can hit unrelated lines and make the count differ from 1, which
# silently skipped the kill.
pid_file="status/tidb-lightning.pid"
if [ -s "${pid_file}" ]; then
    pid=$(cat "${pid_file}")
    if ps -p "${pid}" -o args= 2>/dev/null | grep -q tidb-lightning; then
        kill "${pid}"
    fi
fi

================================================
FILE: roles/tidb_lightning/templates/tidb-lightning.toml.j2
================================================
# lightning Configuration

{# Only the first PD server is used for pd-addr. #}
{% set all_pd = [] -%}
{% set pd_host = groups.pd_servers[0] -%}
{% set pd_ip = hostvars[pd_host].ansible_host | default(hostvars[pd_host].inventory_hostname) -%}
{% set pd_port = hostvars[pd_host].pd_client_port -%}
{% set _ = all_pd.append("%s:%s" % (pd_ip, pd_port)) -%}

[lightning]
{% for item, value in tidb_lightning_conf.lightning | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[checkpoint]
{% for item, value in tidb_lightning_conf.checkpoint | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[tikv-importer]
{# The importer address is taken from the first host of importer_server. #}
{% set tikv_importer_node = groups.importer_server[0] -%}
{% set tikv_importer_ip = hostvars[tikv_importer_node].ansible_host | default(hostvars[tikv_importer_node].inventory_hostname) -%}
{% set tikv_importer_port = hostvars[tikv_importer_node].tikv_importer_port %}
addr = "{{ tikv_importer_ip }}:{{ tikv_importer_port }}"
{% for item, value in tidb_lightning_conf.tikv_importer | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[mydumper]
{# 'csv' is a nested table and is rendered separately as [mydumper.csv]. #}
{% for item, value in tidb_lightning_conf.mydumper | dictsort -%}
{% if item != 'csv' -%}
{{ item }} = {{ value | to_json }}
{% endif -%}
{% endfor %}

{% if tidb_lightning_conf.mydumper.csv -%}
[mydumper.csv]
{% for item, value in tidb_lightning_conf.mydumper.csv | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}
{% endif -%}

[tidb]
{# Ports arrive as templated strings and are emitted as TOML integers. #}
{% for item, value in tidb_lightning_conf.tidb | dictsort -%}
{% if item == "port" or item == "status-port" %}
{{ item }} = {{ value | int }}
{% else %}
{{ item }} = {{ value | to_json }}
{% endif %}
{% endfor %}

pd-addr = "{{ all_pd |join(',') }}"

[post-restore]
{% for item, value in tidb_lightning_conf.post_restore | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[cron]
{% for item, value in tidb_lightning_conf.cron | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

================================================
FILE: roles/tidb_lightning/templates/tidb_lightning_ctl_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

cd "{{ deploy_dir }}" || exit 1

export TZ={{ timezone }}

# Forward all arguments to tidb-lightning-ctl using the deployed config.
./bin/tidb-lightning-ctl -config ./conf/tidb-lightning.toml "$@"

================================================
FILE: roles/tidb_lightning/vars/tidb-lightning.yml
================================================
---
### tidb-lightning configuration
lightning:
  # check if the cluster satisfies the minimum requirement before starting
  # check-requirements = true

  # table-concurrency controls the maximum handled tables concurrently while reading Mydumper SQL files.
  # index-concurrency controls the maximum handled index concurrently while reading Mydumper SQL files.
  # They can affect the tikv-importer memory and disk usage.
  # table-concurrency + index-concurrency must be <= max-open-engines value in tikv-importer.toml
  index-concurrency: 2
  table-concurrency: 6
  # region-concurrency changes the concurrency number of data. It is set to the number of logical CPU cores by default and needs no configuration.
# in mixed configuration, you can set it to 75% of the size of logical CPU cores.
  # region-concurrency default to runtime.NumCPU()
  # region-concurrency:
  # io-concurrency controls the maximum IO concurrency
  io-concurrency: 5
  # logging
  level: "info"
  file: "log/tidb_lightning.log"
  max-size: 128 # MB
  max-days: 28
  max-backups: 14

checkpoint:
  # Whether to enable checkpoints.
  # While importing, Lightning will record which tables have been imported, so even if Lightning or other component
  # crashed, we could start from a known good state instead of redoing everything.
  enable: true
  # The schema name (database name) to store the checkpoints
  schema: "tidb_lightning_checkpoint"
  # Where to store the checkpoints.
  # Set to "file" to store as a local file.
  # Set to "mysql" to store into a remote MySQL-compatible database
  # driver: "file"
  # The data source name (DSN) indicating the location of the checkpoint storage.
  # For "file" driver, the DSN is a path. If not specified, Lightning would default to "/tmp/CHKPTSCHEMA.pb".
  # For "mysql" driver, the DSN is a URL in the form "USER:PASS@tcp(HOST:PORT)/".
  # If not specified, the TiDB server from the [tidb] section will be used to store the checkpoints.
  # dsn: "/tmp/tidb_lightning_checkpoint.pb"
  # Whether to keep the checkpoints after all data are imported. If false, the checkpoints will be deleted. The schema
  # needs to be dropped manually, however.
  # keep-after-success: false

tikv_importer:
  # delivery back end ("tidb" or "importer")
  backend: "importer"
  # action on duplicated entry ("error", "ignore" or "replace")
  # on-duplicate: "replace"

mydumper:
  # block size of file reading
  read-block-size: 65536 # Byte (default = 64 KB)
  # minimum size (in terms of source data file) of each batch of import.
  # Lightning will split a large table into multiple engine files according to this size.
  # batch-size: 107374182400 # Byte (default = 100 GiB)

  # Engine file needs to be imported sequentially. Due to table-concurrency, multiple engines will be
  # imported nearly the same time, and this will create a queue and this wastes resources. Therefore,
  # Lightning will slightly increase the size of the first few batches to properly distribute
  # resources. The scale up is controlled by this parameter, which expresses the ratio of duration
  # between the "import" and "write" steps with full concurrency. This can be calculated as the ratio
  # (import duration / write duration) of a single table of size around 1 GB. The exact timing can be
  # found in the log. If "import" is faster, the batch size anomaly is smaller, and a ratio of
  # zero means uniform batch size. This value should be in the range (0 <= batch-import-ratio < 1).
  # batch-import-ratio: 0.75

  # the source data directory of Mydumper. tidb-lightning will automatically create the corresponding database and tables based on the schema file in the directory.
  # data-source-dir: "/data/mydumper"
  # If no-schema is set to true, tidb-lightning will obtain the table schema information from tidb-server,
  # instead of creating the database or tables based on the schema file of data-source-dir.
  # This applies to manually creating tables or the situation where the table schema exists in TiDB.
  no-schema: false
  # the character set of the schema files; only supports one of:
  #  - utf8mb4: the schema files must be encoded as UTF-8, otherwise will emit errors
  #  - gb18030: the schema files must be encoded as GB-18030, otherwise will emit errors
  #  - auto:    (default) automatically detect if the schema is UTF-8 or GB-18030, error if the encoding is neither
  #  - binary:  do not try to decode the schema files
  # note that the *data* files are always parsed as binary regardless of schema encoding.
  # character-set: "auto"

  # CSV files are imported according to MySQL's LOAD DATA INFILE rules.
  # See https://pingcap.com/docs/tools/lightning/csv/ for details of these settings
  csv:
    separator: ','
    delimiter: '"'
    header: true
    not-null: false
    'null': \N
    backslash-escape: true
    trim-last-separator: false

# configuration for TiDB (pick one of them if it has many TiDB servers) and the PD server.
tidb:
  # the target cluster information
  # the listening address of tidb-server. Setting one of them is enough.
  # host: "127.0.0.1"
  # port: 4000
  # user: "root"
  # password: ""
  # table schema information is fetched from TiDB via this status-port.
  # status-port: 10080
  # Lightning uses some code of TiDB (used as a library) and the flag controls its log level.
  log-level: "error"

  # Set tidb session variables to speed up checksum/analyze table.
  # See https://pingcap.com/docs/sql/statistics/#control-analyze-concurrency for the meaning of each setting
  build-stats-concurrency: 20
  distsql-scan-concurrency: 100
  index-serial-scan-concurrency: 20
  checksum-table-concurrency: 16

# cron performs some periodic actions in background
cron:
  # duration between which Lightning will automatically refresh the import mode status.
  # should be shorter than the corresponding TiKV setting
  switch-mode: '5m'
  # the interval at which the import progress is printed to the log.
  log-progress: '5m'

post_restore:
  # if it is set to true, tidb-lightning will perform the ADMIN CHECKSUM TABLE <table>
  # operation on the tables one by one.
  checksum: true
  # compaction is performed automatically starting v2.1.6. These settings should be left as `false`.
  # level-1-compact: false
  # compact: false
  # if it is set to true, tidb-lightning will perform the ANALYZE TABLE <table>
# operation on the tables one by one.
  # If the Analyze operation fails, you can analyze data manually on the MySQL client.
  analyze: true

================================================
FILE: roles/tiflash/defaults/main.yml
================================================
---
# Default variables for the tiflash role.

# data_dir may hold several comma-separated paths; tmp_path and raft_data_dir
# are placed under the first one.
tiflash_dir: "{{ deploy_dir }}/tiflash"
data_dir: "{{ deploy_dir }}/tiflash/data/db"
tmp_path: "{{ data_dir | split_string(',') | get_element_by_index(0) }}/tmp"
cluster_manager_path: "{{ deploy_dir }}/bin/tiflash/flash_cluster_manager"
cluster_manager_log: "{{ deploy_dir }}/log/tiflash_cluster_manager.log"
tiflash_tikv_log: "{{ deploy_dir }}/log/tiflash_tikv.log"
tiflash_errlog: "{{ deploy_dir }}/log/tiflash_error.log"
tiflash_server_log: "{{ deploy_dir }}/log/tiflash.log"
flash_proxy_config: "{{ deploy_dir }}/conf/tiflash-learner.toml"
raft_data_dir: "{{ data_dir | split_string(',') | get_element_by_index(0) }}/flash"

tiflash_conf_dir: "{{ deploy_dir }}/conf"
tiflash_log_dir: "{{ deploy_dir }}/log"
tiflash_scripts_dir: "{{ deploy_dir }}/scripts"

# Listening ports; all of them are added to the firewalld white list.
tcp_port: 9000
http_port: 8123
flash_service_port: 3930
flash_proxy_port: 20170
flash_proxy_status_port: 20292
metrics_port: 8234

================================================
FILE: roles/tiflash/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/tiflash/tasks/binary_deployment.yml
================================================
---
# Deploys the tiflash binaries and run script, then hands over to systemd.

- name: deploy tiflash binary
  copy:
    src: "{{ resources_dir }}/bin/tiflash"
    dest: "{{ deploy_dir }}/bin/"
    mode: "0755"
    backup: yes
  register: tiflash_binary

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}
  register: tiflash_script

# tiflash_script is registered from a loop, so backup_file is only available
# on the individual entries of tiflash_script.results. Checking
# tiflash_script.backup_file directly is never defined and the backup was
# never moved.
- name: backup script file
  command: mv "{{ item.backup_file }}" "{{ backup_dir }}"
  when:
    - item.changed
    - item.backup_file is defined
  with_items: "{{ tiflash_script.results }}"

- include_tasks: "systemd_deployment.yml"

================================================
FILE: roles/tiflash/tasks/main.yml
================================================
---
# tasks file for tiflash: directories, tiflash.toml, tiflash-learner.toml,
# binaries and firewalld ports.

- name: create tiflash directories
  become: true
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
    owner: "{{ deploy_user }}"
    group: "{{ deploy_user }}"
  with_items:
    - "{{ tiflash_dir }}"
    - "{{ tmp_path }}"
    # data_dir expands to a list; with_items flattens it one level
    - "{{ data_dir | split_string(',') }}"
    - "{{ raft_data_dir }}"
    - "{{ tiflash_conf_dir }}"
    - "{{ tiflash_log_dir }}"
    - "{{ tiflash_scripts_dir }}"

- name: "load tiflash config: tidb-ansible/conf/tiflash.yml"
  include_vars:
    file: "{{ playbook_dir }}/conf/tiflash.yml"
    name: tiflash_conf_custom

- name: load tiflash default config
  include_vars:
    file: tiflash.yml
    name: tiflash_conf_default

- name: generate tiflash dynamic config
  set_fact:
    tiflash_conf_generated:
      flash:
        flash_cluster:
          cluster_manager_path: "{{ cluster_manager_path }}"
          log: "{{ cluster_manager_log }}"

- name: combine tiflash config
  set_fact:
    tiflash_conf: "{{ tiflash_conf_custom | with_default_dicts(tiflash_conf_generated, tiflash_conf_default) | update_default_dicts }}"

- debug:
    var: tiflash_conf

- name: create tiflash config file
  template:
    src: tiflash.toml.j2
    dest: "{{ deploy_dir }}/conf/tiflash.toml"
    mode: "0644"
    backup: yes
  register: tiflash_conf_st

- name: backup tiflash conf file
  command: mv "{{ tiflash_conf_st.backup_file }}" "{{ backup_dir }}"
  when:
    - tiflash_conf_st.changed
    - tiflash_conf_st.backup_file is defined

- name: "load tiflash learner config: tidb-ansible/conf/tiflash-learner.yml"
  include_vars:
    file: "{{ playbook_dir }}/conf/tiflash-learner.yml"
    name: tiflash_learner_conf_custom

- name: load tiflash learner default config
  include_vars:
    file: tiflash-learner.yml
    name: tiflash_learner_conf_default

- name: generate tiflash learner config
  set_fact:
    tiflash_learner_conf: "{{ tiflash_learner_conf_custom | with_default_dicts(tiflash_learner_conf_default) | update_default_dicts }}"

- debug:
    var: tiflash_learner_conf

- name: create tiflash learner config file
  template:
    src: tiflash_learner.toml.j2
    dest: "{{ deploy_dir }}/conf/tiflash-learner.toml"
    mode: "0644"
    backup: yes
  register: tiflash_learner_conf_st

# Renamed from the duplicated "backup tiflash conf file": this task moves the
# learner config backup.
- name: backup tiflash learner conf file
  command: mv "{{ tiflash_learner_conf_st.backup_file }}" "{{ backup_dir }}"
  when:
    - tiflash_learner_conf_st.changed
    - tiflash_learner_conf_st.backup_file is defined

- include_tasks: "binary_deployment.yml"

- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [tcp_port ~ '/tcp', http_port ~ '/tcp', flash_service_port ~ '/tcp', flash_proxy_port ~ '/tcp', flash_proxy_status_port ~ '/tcp', metrics_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/tiflash/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: tiflash
    service_name: tiflash-{{ tcp_port }}

================================================
FILE: roles/tiflash/tasks/systemd_deployment.yml
================================================
---

- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: tiflash
    service_name: tiflash-{{ tcp_port }}

================================================
FILE: roles/tiflash/templates/run_tiflash_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!

cd "{{ deploy_dir }}" || exit 1

export RUST_BACKTRACE=1

export TZ=${TZ:-/etc/localtime}
export LD_LIBRARY_PATH={{ deploy_dir }}/bin/tiflash:$LD_LIBRARY_PATH

echo -n 'sync ... '
stat=$(time sync)
echo ok
echo $stat

# Record this shell's PID; exec below replaces the shell with tiflash, so the
# recorded PID is the server's PID.
echo $$ > "status/{{ role_name }}.pid"

exec bin/tiflash/tiflash server --config-file conf/tiflash.toml

================================================
FILE: roles/tiflash/templates/tiflash.toml.j2
================================================
{% for item, value in tiflash_conf.global | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}
tmp_path = "{{ tmp_path }}"
path = "{{ data_dir }}"
tcp_port = {{ tcp_port }}
http_port = {{ http_port }}

[flash]
{# Collect the status address of every TiDB server. #}
{% set all_tidb = [] -%}
{% for host in groups.tidb_servers -%}
  {% set tidb_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set tidb_port = hostvars[host].tidb_status_port -%}
  {% set _ = all_tidb.append("%s:%s" % (tidb_ip, tidb_port)) -%}
{% endfor %}
tidb_status_addr = "{{ all_tidb | join(',') }}"
service_addr = "{{ ansible_host | default(inventory_hostname) }}:{{ flash_service_port }}"
{% for item, value in tiflash_conf.flash | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}
[flash.{{ item }}]
{% if item == 'proxy' %}
config = "{{ flash_proxy_config }}"
{% endif %}
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[status]
metrics_port = {{ metrics_port }}
{# Render the scalar settings of the 'status' section. This loop previously
   iterated tiflash_conf.profiles, whose entries belong under [profiles]. #}
{% for item, value in tiflash_conf.status | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% endif %}
{% endfor %}

[logger]
errorlog = "{{ tiflash_errlog }}"
log = "{{ tiflash_server_log }}"
{% for item, value in tiflash_conf.logger | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[application]
{% for item, value in tiflash_conf.application | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[raft]
{# Collect the client address of every PD server. #}
{% set all_pd = [] -%}
{% for host in groups.pd_servers -%}
  {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set pd_port =
hostvars[host].pd_client_port -%}
  {% set _ = all_pd.append("%s:%s" % (pd_ip, pd_port)) -%}
{% endfor %}
pd_addr = "{{ all_pd | join(',') }}"
{% for item, value in tiflash_conf.raft | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

{# [quotas] and [users] are rendered up to three levels deep: scalar values
   are written as key = value, nested mappings become [a.b] / [a.b.c] tables. #}
[quotas]
{% for item, value in tiflash_conf.quotas | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}
[quotas.{{ item }}]
{% for sub_item, sub_value in value | dictsort_by_value_type -%}
{% if sub_value is not mapping -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% else %}
[quotas.{{ item }}.{{ sub_item }}]
{% for sub_sub_item, sub_sub_value in sub_value | dictsort -%}
{{ sub_sub_item }} = {{ sub_sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}
{% endif %}
{% endfor %}

[users]
{% for item, value in tiflash_conf.users | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}
[users.{{ item }}]
{% for sub_item, sub_value in value | dictsort_by_value_type -%}
{% if sub_value is not mapping -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% else %}
[users.{{ item }}.{{ sub_item }}]
{% for sub_sub_item, sub_sub_value in sub_value | dictsort -%}
{{ sub_sub_item }} = {{ sub_sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}
{% endif %}
{% endfor %}

{# [profiles] is rendered two levels deep: one [profiles.<name>] table per
   mapping entry. #}
[profiles]
{% for item, value in tiflash_conf.profiles | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}
[profiles.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

================================================
FILE: roles/tiflash/templates/tiflash_learner.toml.j2
================================================
{# Renders tiflash_learner_conf into the TOML file referenced by
   flash_proxy_config. Sections that may contain sub-tables (readpool,
   rocksdb, raftdb) are rendered two levels deep; the others are flat. #}
log-file = "{{ tiflash_tikv_log }}"

[readpool]
{% for item, value in tiflash_learner_conf.readpool | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}
[readpool.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[server]
{# Addresses are generated from the inventory host and the role's ports. #}
engine-addr = "{{ ansible_host | default(inventory_hostname) }}:{{ flash_service_port }}"
addr = "{{ ansible_host | default(inventory_hostname) }}:{{ flash_proxy_port }}"
advertise-addr = "{{ ansible_host | default(inventory_hostname) }}:{{ flash_proxy_port }}"
status-addr = "{{ ansible_host | default(inventory_hostname) }}:{{ flash_proxy_status_port }}"
{# 'labels' goes through the tikv_server_labels_format filter instead of
   to_json. #}
{% for item, value in tiflash_learner_conf.server | dictsort -%}
{% if item == "labels" %}
{{ item }} = {{ value | tikv_server_labels_format }}
{% else %}
{{ item }} = {{ value | to_json }}
{% endif %}
{% endfor %}

[storage]
data-dir = "{{ raft_data_dir }}"
{% for item, value in tiflash_learner_conf.storage | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[pd]
{% for item, value in tiflash_learner_conf.pd | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[metric]
{% for item, value in tiflash_learner_conf.metric | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[raftstore]
{% for item, value in tiflash_learner_conf.raftstore | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[coprocessor]
{% for item, value in tiflash_learner_conf.coprocessor | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[rocksdb]
{% for item, value in tiflash_learner_conf.rocksdb | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}
[rocksdb.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[raftdb]
{% for item, value in tiflash_learner_conf.raftdb | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}
[raftdb.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[security]
{% for item, value in tiflash_learner_conf.security | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[import]
{% for item, value in tiflash_learner_conf.import | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

================================================
FILE: roles/tiflash/vars/tiflash-learner.yml
================================================
---
# TiKV config template
#  Human-readable big numbers:
#   File size(based on byte): KB, MB, GB, TB, PB
#    e.g.: 1_048_576: "1MB"
#   Time(based on ms): ms, s, m, h
#    e.g.: 78_000: "1.3m"

# Role defaults for the tiflash learner (proxy) configuration. Each top-level
# key corresponds to a section rendered by tiflash_learner.toml.j2. Sections
# are intentionally left without a value when there is no default.

readpool:
  storage:

  coprocessor:

server:

storage:

pd:
  # This section will be overwritten by command line parameters

metric:
  #address: "172.16.30.31:9531"
  #interval: "15s"
  #job: "tikv"

raftstore:

coprocessor:

rocksdb:
  wal-dir: ""

  defaultcf:

  lockcf:

  writecf:

raftdb:
  defaultcf:

security:
  ca-path: ""
  cert-path: ""
  key-path: ""

import:

================================================
FILE: roles/tiflash/vars/tiflash.yml
================================================
---
# Role defaults for tiflash.toml. Each top-level key corresponds to a section
# rendered by tiflash.toml.j2 ('global' holds the top-level TOML keys).

global:
  display_name: "TiFlash"
  default_profile: "default"
  mark_cache_size: 5368709120
  listen_host: "0.0.0.0"

flash:
  flash_cluster:
    refresh_interval: 20
    update_rule_interval: 5
    master_ttl: 60
  proxy:

status:

logger:
  count: 20
  size: "1000M"
  level: "debug"

application:
  runAsDaemon: true

raft:

quotas:
  default:
    interval:
      result_rows: 0
      read_rows: 0
      execution_time: 0
      queries: 0
      errors: 0
      duration: 3600

users:
  readonly:
    quota: "default"
    profile: "readonly"
    password: ""
    networks:
      ip: "::/0"
  default:
    quota: "default"
    profile: "default"
    password: ""
    networks:
      ip: "::/0"

profiles:
  readonly:
    readonly: 1
  default:
    load_balancing: "random"
    use_uncompressed_cache: 0
    max_memory_usage: 10000000000

================================================
FILE: roles/tikv/defaults/main.yml
================================================
---
# Default variables for the tikv role.

tikv_port: 20160
tikv_status_port: 20180

tikv_data_dir: "{{ deploy_dir }}/data"
tikv_log_dir: "{{ deploy_dir }}/log"
tikv_log_filename: "tikv.log"
tikv_stderr_filename: "tikv_stderr.log"
tikv_conf_dir: "{{ deploy_dir }}/conf"

# Rendered into the generated TiKV config as server.labels.
labels: {}

# Rendered into the generated TiKV config as rocksdb.wal-dir and
# raftstore.raftdb-path.
wal_dir: ""
raftdb_path: ""

# docker settings
tikv_docker_log_dir: "{{ tikv_log_dir }}/tikv"

================================================
FILE: roles/tikv/files/make-ssl.sh
================================================
#!/bin/bash

# Author: Smana smainklh@gmail.com
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

set -o errexit
set -o pipefail

# Print the command line help.
usage()
{
    cat << EOF
Create self signed certificates

Usage : $(basename "$0") [-d <ssldir>]
      -h | --help         : Show this message
      -d | --ssldir       : Directory where the certificates will be located

      Environmental variables HOSTS and CN should be set to generate keys
      for each host.
EOF
}

# Options parsing
while (($#)); do
    case "$1" in
        -h | --help)   usage;   exit 0;;
        -d | --ssldir) SSLDIR="${2}"; shift 2;;
        *)
            usage
            echo "ERROR : Unknown option"
            exit 3
        ;;
    esac
done

# SSLDIR is quoted so the emptiness test is an explicit two-argument test and
# paths containing whitespace keep working.
if [ -z "${SSLDIR}" ]; then
    echo "ERROR: the directory where the certificates will be located is missing. option -d"
    exit 1
fi

# All files are generated in a throw-away directory that is removed on exit.
tmpdir=$(mktemp -d /tmp/tidb_cacert.XXXXXX)
trap 'rm -rf "${tmpdir}"' EXIT
cd "${tmpdir}"

mkdir -p "${SSLDIR}"

if [ -e "$SSLDIR/ca-config.json" ]; then
    # Reuse existing CA
    cp "$SSLDIR"/{ca-config.json,ca-csr.json} .
else
    echo "ERROR: ca-config.json and ca-csr.json is missing in $SSLDIR."
    exit 1
fi

# Root CA
if [ -e "$SSLDIR/ca-key.pem" ]; then
    # Reuse existing CA
    cp "$SSLDIR"/{ca.pem,ca-key.pem} .
else
    cfssl gencert -initca ca-csr.json | cfssljson -bare ca - > /dev/null 2>&1
fi

# client cert
if [ ! -e "$SSLDIR/client-key.pem" ]; then
    echo '{"CN":"client","hosts":[""],"key":{"algo":"rsa","size":2048}}' | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=client -hostname="" - | cfssljson -bare client > /dev/null 2>&1
fi

# Generate a server key and certificate signed by the CA.
#   $1: host name or IP put into the certificate (127.0.0.1 is always added)
#   $2: common name
#   $3: base name of the generated .pem files
gen_key_and_cert() {
    local host=$1
    local cn=$2
    local name=$3
    echo "{\"CN\":\"${cn}\",\"hosts\":[\"\"],\"key\":{\"algo\":\"rsa\",\"size\":2048}}" | cfssl gencert -ca=ca.pem -ca-key=ca-key.pem -config=ca-config.json -profile=server -hostname="${host},127.0.0.1" - | cfssljson -bare "${name}" > /dev/null 2>&1
}

# Nodes
# HOSTS is a whitespace-separated list, so it is deliberately left unquoted.
if [ -n "$HOSTS" ]; then
    for host in $HOSTS; do
        gen_key_and_cert "${host}" "${CN}" "${CN}-${host}"
    done
fi

# Install certs
mv ./*.pem "${SSLDIR}/"

================================================
FILE: roles/tikv/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/tikv/tasks/binary_deployment.yml
================================================
---
# Deploys the tikv-server binary and run script, then hands over to the
# configured process supervisor.

- name: deploy binary
  copy:
    src: "{{ resources_dir }}/bin/tikv-server"
    dest: "{{ deploy_dir }}/bin/"
    mode: "0755"
    backup: yes
  register: tikv_binary

- name: backup binary file
  command: mv "{{ tikv_binary.backup_file }}" "{{ backup_dir }}"
  when:
    - tikv_binary.changed
    - tikv_binary.backup_file is defined

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run
  vars:
    role_status_dir: status/{{ role_name }}
  register: tikv_script

# tikv_script is registered from a loop, so backup_file is only available on
# the individual entries of tikv_script.results. Checking
# tikv_script.backup_file directly is never defined and the backup was never
# moved.
- name: backup script file
  command: mv "{{ item.backup_file }}" "{{ backup_dir }}"
  when:
    - item.changed
    - item.backup_file is defined
  with_items: "{{ tikv_script.results }}"

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/tikv/tasks/check_certs.yml
================================================
---
# Decides, per TiKV host, whether certificates must be generated on the
# control machine (gen_certs / gen_node_certs) and whether they must be
# copied to the node (sync_certs).

- name: "Check_certs | check if the certs have already been generated on control machine"
  find:
    paths: "{{ cert_dir }}"
    patterns: "*.pem"
    get_checksum: true
  delegate_to: localhost
  register: cert_control_node
  run_once: true

- debug:
    var: cert_control_node

- name: "Check_certs | Set default value for 'sync_certs', 'gen_certs' to false"
  set_fact:
    sync_certs: false
    gen_certs: false

- set_fact:
    tikv_host: "{{ hostvars[inventory_hostname].ansible_host | default(inventory_hostname) }}"

# Result order matters: results[0] is ca.pem and results[1] is the node key;
# the sync_certs condition below indexes into them.
- name: "Check certs | check if a cert already exists on node"
  stat:
    path: "{{ tikv_cert_dir }}/{{ item }}"
  register: cert_tikv_node
  with_items:
    - ca.pem
    - tikv-server-{{ tikv_host }}-key.pem
    - tikv-server-{{ tikv_host }}.pem

- debug:
    var: cert_tikv_node

# The item list is the CA plus one key per TiKV host; a single missing file
# on the control machine triggers generation.
- name: "Check_certs | Set 'gen_certs' to true"
  set_fact:
    gen_certs: true
  when: not item in cert_control_node.files|map(attribute='path') | list
  delegate_to: localhost
  run_once: true
  with_items: >-
    ['{{cert_dir}}/ca.pem',
    {% set all_tikv_hosts = groups['tikv_servers']|unique|sort %}
    {% for host in all_tikv_hosts %}
    {% set tikv_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
    '{{cert_dir}}/tikv-server-{{ tikv_ip }}-key.pem'
    {% if not loop.last %}{{','}}{% endif %}
    {% endfor %}]

- debug:
    var: gen_certs

# Builds a {host: bool} mapping: True when the host's key is missing on the
# control machine.
- name: "Check_certs | Set 'gen_node_certs' to true"
  set_fact:
    gen_node_certs: |-
      {
      {% set all_tikv_hosts = groups['tikv_servers']|unique|sort -%}
      {% set existing_certs = cert_control_node.files|map(attribute='path')|list|sort %}
      {% for host in all_tikv_hosts -%}
        {% set tikv_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
        {% set host_cert = "%s/tikv-server-%s-key.pem"|format(cert_dir, tikv_ip) %}
        {% if host_cert in existing_certs -%}
        "{{ host }}": False,
        {% else -%}
        "{{ host }}": True,
        {% endif -%}
      {% endfor %}
      }
  run_once: true

- debug:
    var: gen_node_certs

- name: "Check_certs | Set tikv_cert_key"
  set_fact:
    tikv_cert_key_path: "{{ cert_dir }}/tikv-server-{{ hostvars[inventory_hostname].tikv_host }}-key.pem"

- debug:
    var: tikv_cert_key_path

# Sync when the cert is about to be (re)generated, when the CA or key is
# missing on the node, or when the node key checksum differs from the one on
# the control machine.
- name: "Check_certs | Set 'sync_certs' to true"
  set_fact:
    sync_certs: true
  when: gen_node_certs[inventory_hostname] or (not cert_tikv_node.results[0].stat.exists|default(False)) or (not cert_tikv_node.results[1].stat.exists|default(False)) or (cert_tikv_node.results[1].stat.checksum|default('') != cert_control_node.files|selectattr("path","equalto",tikv_cert_key_path)|map(attribute="checksum")|first|default(''))

- debug:
    var: sync_certs

================================================
FILE: roles/tikv/tasks/check_filesystem.yml
================================================
---
# Preflight checks on the filesystem that holds tikv_data_dir: ext4 must be
# mounted with nodelalloc, xfs must pass a fallocate/truncate probe.

- name: Get ansible_mounts fact
  setup:
    gather_subset: hardware
    filter: ansible_mounts

- name: Determine which mountpoint that tikv data dir exists on
  shell: "df {{ tikv_data_dir }} | tail -n1 | awk '{print $NF}'"
  register: deploy_partition
  changed_when: False

- set_fact:
    xfs_filesystem: "true"
  when:
    - item.mount == deploy_partition.stdout
    - item.fstype == 'xfs'
  with_items: "{{ ansible_mounts }}"

# The command relies on bash brace expansion ({1..5000}); the shell module
# runs /bin/sh by default, which is not bash on every distribution, so the
# interpreter is pinned explicitly.
- name: Preflight check - Check bug if filesystem is xfs
  shell: >-
    cd {{ tikv_data_dir }} &&
    fallocate -n -o 0 -l 9192 tidb_test &&
    printf 'a%.0s' {1..5000} > tidb_test &&
    truncate -s 5000 tidb_test &&
    fallocate -p -n -o 5000 -l 4192 tidb_test &&
    LANG=en_US.UTF-8 stat tidb_test |awk 'NR==2{print $2}'
  args:
    executable: /bin/bash
  register: xfs_result
  when:
    - xfs_filesystem is defined
    - xfs_filesystem

- name: Clean check file for xfs filesystem
  file:
    path: "{{ tikv_data_dir }}/tidb_test"
    state: absent
  when:
    - xfs_filesystem is defined
    - xfs_filesystem

- set_fact:
    ext4_filesystem_alert: "true"
  when:
    - item.mount == deploy_partition.stdout
    - item.fstype == 'ext4'
    - item.options.find("nodelalloc") == -1
  with_items: "{{ ansible_mounts }}"

- name: Preflight check - Does tikv data dir meet ext4 file system requirement
  fail:
    msg: "You don't mount the file system of {{ deploy_partition.stdout }} with ext4 nodelalloc option. See https://github.com/pingcap/docs/blob/master/online-deployment-using-ansible.md#step-8-mount-the-data-disk-ext4-filesystem-with-options-on-the-target-machines."
  when:
    - ext4_filesystem_alert is defined
    - ext4_filesystem_alert

# The xfs probe passes when the second line of the stat output reports 5000.
- name: Preflight check - Set fssystem_check_result fact
  set_fact:
    fssystem_check_result: true
  when: "(item.mount == deploy_partition.stdout and item.fstype == 'ext4') or (xfs_filesystem is defined and xfs_filesystem and xfs_result.stdout|int == 5000)"
  with_items: "{{ ansible_mounts }}"

- name: Preflight check - Does tikv data dir meet ext4 or xfs file system requirement
  fail:
    msg: 'The file system mounted at {{ item.mount }} does not meet ext4 or xfs file system requirement'
  when:
    - item.mount == deploy_partition.stdout
    - fssystem_check_result is not defined
  with_items: "{{ ansible_mounts }}"

================================================
FILE: roles/tikv/tasks/docker_deployment.yml
================================================
---
# Deploys TiKV as a docker image loaded from a local archive.

- name: create log directory
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ tikv_docker_log_dir }}"

- name: deploy tikv image
  copy:
    src: "{{ downloads_dir }}/tikv.tar"
    dest: "{{ deploy_dir }}/images"
    mode: "0755"

- name: create run script
  template:
    src: "{{ item }}_{{ role_name }}_docker.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_{{ role_name }}.sh"
    mode: "0755"
    backup: yes
  with_items:
    - run

# NOTE(review): the archive is copied to "{{ deploy_dir }}/images" above but
# loaded from images_dir; presumably both point to the same directory -
# confirm where images_dir is defined.
- name: load docker image from archive
  docker_image:
    state: present
    force: yes
    name: pingcap/tikv
    tag: "{{ tidb_version }}"
    load_path: "{{ images_dir }}/tikv.tar"

- include_tasks: "{{ process_supervision }}_deployment.yml"

================================================
FILE: roles/tikv/tasks/gen_certs.yml
================================================
---
# Generates the missing TiKV certificates once, on the control machine.

- name: Gen_certs | copy certs generation script
  copy:
    src: "make-ssl.sh"
    dest: "{{ script_dir }}/make-ssl.sh"
    mode: "0700"
  run_once: yes
  delegate_to: localhost
  when: gen_certs|default(false)

# HOSTS lists only the hosts flagged in gen_node_certs (hosts without an
# entry default to true).
- name: Gen_certs | run cert generation script
  command: "{{ script_dir }}/make-ssl.sh -d {{ cert_dir }}"
  environment:
    - HOSTS: "{% for h in groups['tikv_servers'] %} {% if gen_node_certs[h]|default(true) %} {{ hostvars[h].ansible_host | default(hostvars[h].inventory_hostname) }} {% endif %} {% endfor %}"
    - PATH: "{{ ansible_env.PATH }}:{{ binary_dir }}"
    - CN: "tikv-server"
  run_once: yes
  delegate_to: localhost
  when: gen_certs|default(false)

================================================
FILE: roles/tikv/tasks/install_certs.yml
================================================
---
# Copies the CA and this node's key/certificate to the TiKV host.

- name: "Deploy_certs | Make sure the certificate directory exits"
  file:
    path: "{{ tikv_cert_dir }}"
    state: directory
    mode: "0700"

- name: "Deploy_certs | Deploy certificates"
  copy:
    src: "{{ cert_dir }}/{{ item }}"
    dest: "{{ tikv_cert_dir }}/{{ item }}"
    mode: "0600"
    backup: yes
  with_items:
    - ca.pem
    - tikv-server-{{ tikv_host }}-key.pem
    - tikv-server-{{ tikv_host }}.pem
  when: sync_certs|default(false)

================================================
FILE: roles/tikv/tasks/main.yml
================================================
---
# tasks file for tikv

- name: create deploy directories
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ tikv_log_dir }}"
    - "{{ tikv_data_dir }}"
    - "{{ tikv_conf_dir }}"

- include_tasks: check_filesystem.yml

- include_tasks: check_certs.yml
  when: enable_tls|default(false)

- include_tasks: gen_certs.yml
  when: enable_tls|default(false)

- include_tasks: install_certs.yml
  when: enable_tls|default(false)

# The final configuration is layered from three sources: user overrides from
# conf/tikv.yml, values generated from inventory variables, and the role
# defaults in vars/default.yml.
- name: "load customized config: tidb-ansible/conf/tikv.yml"
  include_vars:
    file: "{{ playbook_dir }}/conf/tikv.yml"
    name: tikv_conf_custom

- name: load default config
  include_vars:
    file: default.yml
    name: tikv_conf_default

# The security paths render to an empty string unless enable_tls is set.
- name: generate dynamic config
  set_fact:
    tikv_conf_generated:
      server:
        labels: "{{ labels }}"
      rocksdb:
        wal-dir: "{{ wal_dir }}"
      raftstore:
        raftdb-path: "{{ raftdb_path }}"
      security:
        ca-path: >-
          {%- if enable_tls|default(false) -%}{{ tikv_cert_dir }}/ca.pem{%- else -%}{%- endif -%}
        cert-path: >-
          {%- if enable_tls|default(false) -%}{{ tikv_cert_dir }}/tikv-server-{{ tikv_host }}.pem{%- else -%}{%- endif -%}
        key-path: >-
          {%- if enable_tls|default(false) -%}{{ tikv_cert_dir }}/tikv-server-{{ tikv_host }}-key.pem{%- else -%}{%- endif -%}

- name: generate final config
  set_fact:
    tikv_conf: "{{ tikv_conf_custom | with_default_dicts(tikv_conf_generated, tikv_conf_default) | update_default_dicts }}"

- debug:
    var: tikv_conf

- name: create config file
  template:
    src: tikv.toml.j2
    dest: "{{ deploy_dir }}/conf/tikv.toml"
    mode: "0644"
    backup: yes
  register: tikv_conf_st

- name: backup conf file
  command: mv "{{ tikv_conf_st.backup_file }}" "{{ backup_dir }}"
  when:
    - tikv_conf_st.changed
    - tikv_conf_st.backup_file is defined

- include_tasks: "{{ deployment_method }}_deployment.yml"

- name: prepare firewalld white list
  set_fact:
    firewalld_ports: "{{ [tikv_port ~ '/tcp'] + firewalld_ports }}"

================================================
FILE: roles/tikv/tasks/supervise_deployment.yml
================================================
---

- name: deploy supervise
  include_role:
    name: supervise
  vars:
    this_role_name: tikv
    service_name: tikv-{{ tikv_port }}

================================================
FILE: roles/tikv/tasks/systemd_deployment.yml
================================================
---

- name: deploy systemd
  include_role:
    name: systemd
  vars:
    this_role_name: tikv
    service_name: tikv-{{ tikv_port }}

================================================
FILE: roles/tikv/templates/run_tikv_binary.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
#          All your edit might be overwritten!
cd "{{ deploy_dir }}" || exit 1

{# Advertised address of this host: ansible_host when set, otherwise the inventory hostname. #}
{% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%}
{# Collect "ip:client_port" for every host in the pd_servers group. #}
{% set all_pd = [] -%}
{% set pd_hosts = groups.pd_servers %}
{% for host in pd_hosts -%}
  {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set pd_port = hostvars[host].pd_client_port -%}
  {% set _ = all_pd.append("%s:%s" % (pd_ip, pd_port)) -%}
{% endfor -%}

export RUST_BACKTRACE=1

export TZ=${TZ:-/etc/localtime}

echo -n 'sync ... '
stat=$(time sync)
echo ok
echo $stat

echo $$ > "status/{{ role_name }}.pid"

exec bin/tikv-server \
    --addr "0.0.0.0:{{ tikv_port }}" \
    --advertise-addr "{{ my_ip }}:{{ tikv_port }}" \
    --status-addr "{{ my_ip }}:{{ tikv_status_port }}" \
    --pd "{{ all_pd | join(',') }}" \
    --data-dir "{{ tikv_data_dir }}" \
    --config conf/tikv.toml \
    --log-file "{{ tikv_log_dir }}/{{ tikv_log_filename }}" 2>> "{{ tikv_log_dir }}/{{ tikv_stderr_filename }}"

================================================
FILE: roles/tikv/templates/run_tikv_docker.sh.j2
================================================
#!/bin/bash
set -e
ulimit -n 1000000

# WARNING: This file was auto-generated. Do not edit!
# All your edit might be overwritten!
cd "{{ deploy_dir }}" || exit 1

{# Advertised address of this host: ansible_host when set, otherwise the inventory hostname. #}
{% set my_ip = hostvars[inventory_hostname].ansible_host | default(hostvars[inventory_hostname].inventory_hostname) -%}
{# Collect "ip:client_port" for every host in the pd_servers group. #}
{% set all_pd = [] -%}
{% set pd_hosts = groups.pd_servers %}
{% for host in pd_hosts -%}
  {% set pd_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set pd_port = hostvars[host].pd_client_port -%}
  {% set _ = all_pd.append("%s:%s" % (pd_ip, pd_port)) -%}
{% endfor -%}

export RUST_BACKTRACE=1

export TZ=${TZ:-/etc/localtime}

echo -n 'sync ... '
stat=$(time sync)
echo ok
echo $stat

echo $$ > "status/{{ role_name }}.pid"

{# NOTE(review): only tikv_port is published with -p; --status-addr uses tikv_status_port, which is not published here - confirm this is intended. #}
exec docker run \
    -p {{ tikv_port }}:20160 \
    -v /etc/localtime:/etc/localtime:ro \
    -v "{{ tikv_conf_dir }}/tikv.toml:/etc/tikv.toml:ro" \
    -v "{{ tikv_data_dir }}:/data" \
    -v "{{ tikv_docker_log_dir }}:/var/log" \
    -u "$(id -u {{ deploy_user }})" \
    --ulimit nofile=1000000:1000000 \
    --hostname "tikv-{{ tikv_port }}" \
    --name "tikv-{{ tikv_port }}" \
    pingcap/tikv:{{ tidb_version }} \
    --addr="0.0.0.0:20160" \
    --advertise-addr="{{ my_ip }}:{{ tikv_port }}" \
    --status-addr "{{ my_ip }}:{{ tikv_status_port }}" \
    --data-dir=/data \
    --log-file="/var/log/{{ tikv_log_filename }}" \
    --pd="{{ all_pd | join(',') }}" \
    --config=/etc/tikv.toml

================================================
FILE: roles/tikv/templates/tikv.toml.j2
================================================
{# Renders tikv_conf into TOML. Each top-level section is written as a table; where a section can hold nested mappings, scalar keys are written before the sub-tables (ordering comes from the project's dictsort_by_value_type filter). #}
# TiKV config template
# Human-readable big numbers:
# File size(based on byte): KB, MB, GB, TB, PB
# e.g.: 1_048_576 = "1MB"
# Time(based on ms): ms, s, m, h
# e.g.: 78_000 = "1.3m"

{% for item, value in tikv_conf.global | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[readpool]
{% for item, value in tikv_conf.readpool | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}

[readpool.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[server]
{% for item, value in tikv_conf.server | dictsort -%}
{% if item == "labels" %}
{{ item }} = {{ value | tikv_server_labels_format }}
{% else %}
{{ item }} = {{ value | to_json }}
{% endif %}
{% endfor %}

[storage]
{% for item, value in tikv_conf.storage | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}

[storage.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[pd]
# This section will be overwritten by command line parameters
{% for item, value in tikv_conf.pd | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

{# NOTE(review): [metric] is emitted empty; keys under `metric` in the YAML config are not rendered here - confirm this is intended. #}
[metric]

[raftstore]
{% for item, value in tikv_conf.raftstore | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[coprocessor]
{% for item, value in tikv_conf.coprocessor | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[rocksdb]
{% for item, value in tikv_conf.rocksdb | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}

[rocksdb.{{ item }}]
{% for sub_item, sub_value in value | dictsort_by_value_type -%}
{% if sub_value is not mapping -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% else %}

[rocksdb.{{ item }}.{{ sub_item }}]
{% for sub_sub_item, sub_sub_value in sub_value | dictsort -%}
{{ sub_sub_item }} = {{ sub_sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}
{% endif %}
{% endfor %}

[raftdb]
{% for item, value in tikv_conf.raftdb | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}

[raftdb.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[security]
{% for item, value in tikv_conf.security | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json }}
{% else %}

[security.{{ item }}]
{% for sub_item, sub_value in value | dictsort_by_value_type -%}
{% if sub_value is not mapping -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% else %}

[security.{{ item }}.{{ sub_item }}]
{% for sub_sub_item, sub_sub_value in sub_value | dictsort -%}
{{ sub_sub_item }} = {{ sub_sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}
{% endif %}
{% endfor %}

[import]
{% for item, value in tikv_conf.import | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[pessimistic-txn]
{% for item, value in tikv_conf.pessimistic_txn | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[gc]
{% for item, value in tikv_conf.gc | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

================================================
FILE: roles/tikv/vars/default.yml
================================================
---
## The default configuration file for TiKV in YAML format

## TiKV config template
## Human-readable big numbers:
## File size(based on byte): KB, MB, GB, TB, PB
## e.g.: 1_048_576 = "1MB"
## Time(based on ms): ms, s, m, h
## e.g.: 78_000 = "1.3m"

global:
  ## Log levels: trace, debug, info, warning, error, critical.
  ## Note that `debug` and `trace` are only available in development builds.
  # log-level: "info"

  ## Timespan between rotating the log files.
  ## Once this timespan passes, log files will be rotated, i.e. existing log file will have a
  ## timestamp appended to its name and a new file will be created.
  # log-rotation-timespan: "24h"

readpool:
  ## Configurations for the single thread pool serving read requests.
  unified:
    ## The minimal working thread count of the thread pool.
    # min-thread-count: 1

    ## The maximum working thread count of the thread pool.
    ## The default value is max(4, LOGICAL_CPU_NUM * 0.8).
    # max-thread-count: 8

    ## Size of the stack for each thread in the thread pool.
    # stack-size: "10MB"

    ## Max running tasks of each worker, reject if exceeded.
    # max-tasks-per-worker: 2000

  storage:
    ## Size of the thread pool for high-priority operations.
    # high-concurrency: 4

    ## Size of the thread pool for normal-priority operations.
    # normal-concurrency: 4

    ## Size of the thread pool for low-priority operations.
    # low-concurrency: 4

    ## Max running high-priority operations of each worker, reject if exceeded.
    # max-tasks-per-worker-high: 2000

    ## Max running normal-priority operations of each worker, reject if exceeded.
    # max-tasks-per-worker-normal: 2000

    ## Max running low-priority operations of each worker, reject if exceeded.
    # max-tasks-per-worker-low: 2000

    ## Size of the stack for each thread in the thread pool.
# stack-size: "10MB" coprocessor: ## Most read requests from TiDB are sent to the coprocessor of TiKV. high/normal/low-concurrency is ## used to set the number of threads of the coprocessor. ## If there are many read requests, you can increase these config values (but keep it within the ## number of system CPU cores). For example, for a 32-core machine deployed with TiKV, you can even ## set these config to 30 in heavy read scenarios. ## If CPU_NUM > 8, the default thread pool size for coprocessors is set to CPU_NUM * 0.8. # high-concurrency: 8 # normal-concurrency: 8 # low-concurrency: 8 # max-tasks-per-worker-high: 2000 # max-tasks-per-worker-normal: 2000 # max-tasks-per-worker-low: 2000 # stack-size: "10MB" server: ## Advertise listening address for client communication. ## If not set, `addr` will be used. # advertise-addr: "" ## Size of the thread pool for the gRPC server. # grpc-concurrency: 4 ## The number of max concurrent streams/requests on a client connection. # grpc-concurrent-stream: 1024 ## The number of connections with each TiKV server to send Raft messages. # grpc-raft-conn-num: 1 ## Amount to read ahead on individual gRPC streams. # grpc-stream-initial-window-size: "2MB" ## Time to wait before sending out a ping to check if server is still alive. ## This is only for communications between TiKV instances. # grpc-keepalive-time: "10s" ## Time to wait before closing the connection without receiving KeepAlive ping Ack. # grpc-keepalive-timeout: "3s" ## How many snapshots can be sent concurrently. # concurrent-send-snap-limit: 32 ## How many snapshots can be received concurrently. # concurrent-recv-snap-limit: 32 ## Max allowed recursion level when decoding Coprocessor DAG expression. # end-point-recursion-limit: 1000 ## Max time to handle Coprocessor requests before timeout. # end-point-request-max-handle-duration: "60s" ## Max bytes that snapshot can be written to disk in one second. ## It should be set based on your disk performance. 
# snap-max-write-bytes-per-sec: "100MB" ## Attributes about this server, e.g. `{ zone = "us-west-1", disk = "ssd" }`. labels: {} storage: ## The number of slots in Scheduler latches, which controls write concurrency. ## In most cases you can use the default value. When importing data, you can set it to a larger ## value, but no more than 2097152. # scheduler-concurrency: 524288 ## Scheduler's worker pool size, i.e. the number of write threads. ## It should be less than total CPU cores. When there are frequent write operations, set it to a ## higher value. More specifically, you can run `top -H -p tikv-pid` to check whether the threads ## named `sched-worker-pool` are busy. # scheduler-worker-pool-size: 4 ## When the pending write bytes exceed this threshold, the "scheduler too busy" error is displayed. # scheduler-pending-write-threshold: "100MB" ## TiKV will create a temporary file in {{data-dir}} to reserve some space, which is named 'space_placeholder_file'. ## When the disk has no free space you could remove this temporary file so that TiKV can execute compaction ## job to reclaim disk space, which requires some extra temporary space. # reserve-space: "2GB" block-cache: ## Whether to create a shared block cache for all RocksDB column families. ## ## Block cache is used by RocksDB to cache uncompressed blocks. Big block cache can speed up ## read. It is recommended to turn on shared block cache. Since only the total cache size needs ## to be set, it is easier to configure. In most cases it should be able to auto-balance cache ## usage between column families with standard LRU algorithm. ## ## The rest of config in the storage.block-cache section is effective only when shared block ## cache is on. # shared: true ## Size of the shared block cache. Normally it should be tuned to 30%-50% of system's total ## memory.
When the config is not set, it is decided by the sum of the following fields or ## their default value: ## * rocksdb.defaultcf.block-cache-size or 25% of system's total memory ## * rocksdb.writecf.block-cache-size or 15% of system's total memory ## * rocksdb.lockcf.block-cache-size or 2% of system's total memory ## * raftdb.defaultcf.block-cache-size or 2% of system's total memory ## ## To deploy multiple TiKV nodes on a single physical machine, configure this parameter ## explicitly. Otherwise, the OOM problem might occur in TiKV. # capacity: "1GB" pd: ## PD endpoints. # endpoints: [] metric: ## Prometheus client push interval. ## Setting the value to 0s stops Prometheus client from pushing. # interval: "15s" ## Prometheus PushGateway address. ## Leaving it empty stops Prometheus client from pushing. # address: "" ## Prometheus client push job name. ## Note: A node id will automatically append, e.g., "tikv_1". # job: "tikv" raftstore: ## Store capacity, i.e. max data size allowed. ## If it is not set, disk capacity is used. # capacity: 0 ## Internal notify capacity. ## 40960 is suitable for about 7000 Regions. It is recommended to use the default value. # notify-capacity: 40960 ## Maximum number of internal messages to process in a tick. # messages-per-tick: 4096 ## Region heartbeat tick interval for reporting to PD. # pd-heartbeat-tick-interval: "60s" ## Store heartbeat tick interval for reporting to PD. # pd-store-heartbeat-tick-interval: "10s" ## How long the peer will be considered down and reported to PD when it hasn't been active for this ## time. # max-peer-down-duration: "5m" ## Interval to check whether to start manual compaction for a Region. # region-compact-check-interval: "5m" ## Interval (s) to check Region whether the data are consistent. # consistency-check-interval: 0 ## Delay time before deleting a stale peer. 
# clean-stale-peer-delay: "10m" ## Use how many threads to handle log apply # apply-pool-size: 2 ## Use how many threads to handle raft messages # store-pool-size: 2 coprocessor: rocksdb: ## Maximum number of threads of RocksDB background jobs. ## The background tasks include compaction and flush. For detailed information why RocksDB needs to ## do compaction, see RocksDB-related materials. When write traffic (like the importing data size) ## is big, it is recommended to enable more threads. But set the number of the enabled threads ## smaller than that of CPU cores. For example, when importing data, for a machine with a 32-core ## CPU, set the value to 28. # max-background-jobs: 8 ## Represents the maximum number of threads that will concurrently perform a sub-compaction job by ## breaking it into multiple, smaller ones running simultaneously. # max-sub-compactions: 1 ## Number of open files that can be used by the DB. ## Value -1 means files opened are always kept open and RocksDB will prefetch index and filter ## blocks into block cache at startup. So if your database has a large working set, it will take ## several minutes to open the DB. You may need to increase this if your database has a large ## working set. You can estimate the number of files based on `target-file-size-base` and ## `target_file_size_multiplier` for level-based compaction. # max-open-files: 40960 ## RocksDB Write-Ahead Logs (WAL) recovery mode. ## 0 : TolerateCorruptedTailRecords, tolerate incomplete record in trailing data on all logs; ## 1 : AbsoluteConsistency, We don't expect to find any corruption in the WAL; ## 2 : PointInTimeRecovery, Recover to point-in-time consistency; ## 3 : SkipAnyCorruptedRecords, Recovery after a disaster; # wal-recovery-mode: 2 ## RocksDB WAL directory. ## This config specifies the absolute directory path for WAL. ## If it is not set, the log files will be in the same directory as data. 
When you set the path to ## RocksDB directory in memory like in `/dev/shm`, you may want to set `wal-dir` to a directory on ## persistent storage. See https://github.com/facebook/rocksdb/wiki/How-to-persist-in-memory-RocksDB-database . ## If there are two disks on the machine, storing RocksDB data and WAL logs on different disks can ## improve performance. # wal-dir: "/tmp/tikv/store" ## The following two fields affect how archived WAL will be deleted. ## 1. If both values are set to 0, logs will be deleted ASAP and will not get into the archive. ## 2. If `wal-ttl-seconds` is 0 and `wal-size-limit` is not 0, WAL files will be checked every 10 ## min and if total size is greater than `wal-size-limit`, they will be deleted starting with the ## earliest until `wal-size-limit` is met. All empty files will be deleted. ## 3. If `wal-ttl-seconds` is not 0 and `wal-size-limit` is 0, then WAL files will be checked every ## `wal-ttl-seconds / 2` and those that are older than `wal-ttl-seconds` will be deleted. ## 4. If both are not 0, WAL files will be checked every 10 min and both checks will be performed ## with ttl being first. ## When you set the path to RocksDB directory in memory like in `/dev/shm`, you may want to set ## `wal-ttl-seconds` to a value greater than 0 (like 86400) and back up your DB on a regular basis. ## See https://github.com/facebook/rocksdb/wiki/How-to-persist-in-memory-RocksDB-database . # wal-ttl-seconds: 0 # wal-size-limit: 0 ## Max RocksDB WAL size in total # max-total-wal-size: "4GB" ## RocksDB Statistics provides cumulative stats over time. ## Turning statistics on will introduce about 5%-10% overhead for RocksDB, but it can help you to ## know the internal status of RocksDB. # enable-statistics: true ## Dump statistics periodically in information logs. ## Same as RocksDB's default value (10 min).
# stats-dump-period: "10m" ## Refer to: https://github.com/facebook/rocksdb/wiki/RocksDB-FAQ ## If you want to use RocksDB on multi disks or spinning disks, you should set value at least 2MB. # compaction-readahead-size: 0 ## Max buffer size that is used by WritableFileWrite. # writable-file-max-buffer-size: "1MB" ## Use O_DIRECT for both reads and writes in background flush and compactions. # use-direct-io-for-flush-and-compaction: false ## Allows OS to incrementally sync files to disk while they are being written, asynchronously, ## in the background. # bytes-per-sync: "1MB" ## Allows OS to incrementally sync WAL to disk while it is being written. # wal-bytes-per-sync: "512KB" ## Options for "Default" Column Family, which stores actual user data. defaultcf: ## Compression method (if any) is used to compress a block. ## no: kNoCompression ## snappy: kSnappyCompression ## zlib: kZlibCompression ## bzip2: kBZip2Compression ## lz4: kLZ4Compression ## lz4hc: kLZ4HCCompression ## zstd: kZSTD ## `lz4` is a compression algorithm with moderate speed and compression ratio. The compression ## ratio of `zlib` is high. It is friendly to the storage space, but its compression speed is ## slow. This compression occupies many CPU resources. ## Per level compression. ## This config should be chosen carefully according to CPU and I/O resources. For example, if you ## use the compression mode of "no:no:lz4:lz4:lz4:zstd:zstd" and find much I/O pressure of the ## system (run the `iostat` command to find %util lasts 100%, or run the `top` command to find many ## iowaits) when writing (importing) a lot of data while the CPU resources are adequate, you can ## compress level-0 and level-1 and exchange CPU resources for I/O resources. If you use the ## compression mode of "no:no:lz4:lz4:lz4:zstd:zstd" and you find the I/O pressure of the system is ## not big when writing a lot of data, but CPU resources are inadequate. Then run the `top` command ## and choose the `-H` option. 
If you find a lot of bg threads (namely the compression thread of ## RocksDB) are running, you can exchange I/O resources for CPU resources and change the compression ## mode to "no:no:no:lz4:lz4:zstd:zstd". In a word, it aims at making full use of the existing ## resources of the system and improving TiKV performance in terms of the current resources. # compression-per-level: ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] ## The data block size. RocksDB compresses data based on the unit of block. ## Similar to page in other databases, block is the smallest unit cached in block-cache. Note that ## the block size specified here corresponds to uncompressed data. # block-size: "64KB" ## If you're doing point lookups you definitely want to turn bloom filters on. We use bloom filters ## to avoid unnecessary disk reads. Default bits_per_key is 10, which yields ~1% false positive ## rate. Larger `bloom-filter-bits-per-key` values will reduce false positive rate, but increase ## memory usage and space amplification. # bloom-filter-bits-per-key: 10 # level0-file-num-compaction-trigger: 4 ## Soft limit on number of level-0 files. ## When the number of SST files of level-0 reaches the limit of `level0-slowdown-writes-trigger`, ## RocksDB tries to slow down the write operation, because too many SST files of level-0 can cause ## higher read pressure of RocksDB. # level0-slowdown-writes-trigger: 20 ## Maximum number of level-0 files. ## When the number of SST files of level-0 reaches the limit of `level0-stop-writes-trigger`, ## RocksDB stalls the new write operation. # level0-stop-writes-trigger: 36 ## Amount of data to build up in memory (backed by an unsorted log on disk) before converting to a ## sorted on-disk file. It is the RocksDB MemTable size. # write-buffer-size: "128MB" ## The maximum number of the MemTables. The data written into RocksDB is first recorded in the WAL ## log, and then inserted into MemTables. 
When the MemTable reaches the size limit of ## `write-buffer-size`, it turns into read only and generates a new MemTable receiving new write ## operations. The flush threads of RocksDB will flush the read only MemTable to the disks to become ## an SST file of level0. `max-background-flushes` controls the maximum number of flush threads. ## When the flush threads are busy, resulting in the number of the MemTables waiting to be flushed ## to the disks reaching the limit of `max-write-buffer-number`, RocksDB stalls the new operation. ## "Stall" is a flow control mechanism of RocksDB. When importing data, you can set the ## `max-write-buffer-number` value higher, like 10. # max-write-buffer-number: 5 ## The minimum number of write buffers that will be merged together before writing to storage. # min-write-buffer-number-to-merge: 1 ## Control maximum total data size for base level (level 1). ## When the level-1 data size reaches the limit value of `max-bytes-for-level-base`, the SST files ## of level-1 and their overlap SST files of level-2 will be compacted. The golden rule: the first ## reference principle of setting `max-bytes-for-level-base` is guaranteeing that the ## `max-bytes-for-level-base` value is roughly equal to the data volume of level-0. Thus ## unnecessary compaction is reduced. For example, if the compression mode is ## "no:no:lz4:lz4:lz4:lz4:lz4", the `max-bytes-for-level-base` value can be `write-buffer-size * 4`, ## because there is no compression of level-0 and level-1 and the trigger condition of compaction ## for level-0 is that the number of the SST files reaches 4 (the default value). When both level-0 ## and level-1 adopt compaction, it is necessary to analyze RocksDB logs to know the size of an SST ## file compressed from a MemTable. For example, if the file size is 32MB, the proposed value of ## `max-bytes-for-level-base` is 32MB * 4 = 128MB. # max-bytes-for-level-base: "512MB" ## Target file size for compaction. 
## The SST file size of level-0 is influenced by the compaction algorithm of `write-buffer-size` ## and level0. `target-file-size-base` is used to control the size of a single SST file of level1 to ## level6. # target-file-size-base: "8MB" ## Max bytes for `compaction.max_compaction_bytes`. # max-compaction-bytes: "2GB" ## There are four different compaction priorities. ## 0 : ByCompensatedSize ## 1 : OldestLargestSeqFirst ## 2 : OldestSmallestSeqFirst ## 3 : MinOverlappingRatio # compaction-pri: 3 ## Enable read amplification statistics. ## value => memory usage (percentage of loaded blocks memory) ## 1 => 12.50 % ## 2 => 06.25 % ## 4 => 03.12 % ## 8 => 01.56 % ## 16 => 00.78 % # read-amp-bytes-per-bit: 0 ## Options for "Titan" for "Default" Column Family titan: ## The smallest value to store in blob files. Values smaller than ## this threshold will be inlined in base DB. ## default: 1KB # min-blob-size: "1KB" ## The compression algorithm used to compress data in blob files. ## Compression method. ## no: kNoCompression ## snappy: kSnappyCompression ## zlib: kZlibCompression ## bzip2: kBZip2Compression ## lz4: kLZ4Compression ## lz4hc: kLZ4HCCompression ## zstd: kZSTD ## default: lz4 # blob-file-compression: "lz4" ## Specifies the cache size for blob records ## default: 0 # blob-cache-size: "0GB" ## The minimum batch size of one gc job. The total blob file size ## of one gc job cannot be smaller than this threshold. ## default: 16MB # min-gc-batch-size: "16MB" ## The maximum batch size of one gc job. The total blob file size ## of one gc job cannot exceed this threshold. # max-gc-batch-size: "64MB" ## If the ratio of discardable size of a blob file is larger than ## this threshold, the blob file will be GCed out. ## default: 0.5 # discardable-ratio: 0.5 ## The gc job will sample the target blob files to see if their ## discardable ratio is smaller than discardable-ratio mentioned ## above before gc starts; if so, the blob file will be excluded.
# sample-ratio: 0.1 ## If the size of the blob file is smaller than this threshold, ## the blob file will be merged. ## default: 8MB # merge-small-file-threshold: "8MB" ## Options for "Write" Column Family, which stores MVCC commit information writecf: ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. # compression-per-level: ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] # block-size: "64KB" ## Recommend to set it the same as `rocksdb.defaultcf.write-buffer-size`. # write-buffer-size: "128MB" # max-write-buffer-number: 5 # min-write-buffer-number-to-merge: 1 ## Recommend to set it the same as `rocksdb.defaultcf.max-bytes-for-level-base`. # max-bytes-for-level-base: "512MB" # target-file-size-base: "8MB" # level0-file-num-compaction-trigger: 4 # level0-slowdown-writes-trigger: 20 # level0-stop-writes-trigger: 36 # cache-index-and-filter-blocks: true # pin-l0-filter-and-index-blocks: true # compaction-pri: 3 # read-amp-bytes-per-bit: 0 # dynamic-level-bytes: true lockcf: # compression-per-level: ["no", "no", "no", "no", "no", "no", "no"] # block-size: "16KB" # write-buffer-size: "128MB" # max-write-buffer-number: 5 # min-write-buffer-number-to-merge: 1 # max-bytes-for-level-base: "128MB" # target-file-size-base: "8MB" # level0-slowdown-writes-trigger: 20 # level0-stop-writes-trigger: 36 # cache-index-and-filter-blocks: true # pin-l0-filter-and-index-blocks: true # compaction-pri: 0 # read-amp-bytes-per-bit: 0 # dynamic-level-bytes: true raftdb: # max-background-jobs: 4 # max-sub-compactions: 2 # max-open-files: 40960 # max-manifest-file-size: "20MB" # create-if-missing: true # enable-statistics: true # stats-dump-period: "10m" # compaction-readahead-size: 0 # writable-file-max-buffer-size: "1MB" # use-direct-io-for-flush-and-compaction: false # enable-pipelined-write: true # allow-concurrent-memtable-write: false # bytes-per-sync: "1MB" # wal-bytes-per-sync: "512KB" # info-log-max-size: "1GB" # info-log-roll-time: "0" #
info-log-keep-log-file-num: 10 # info-log-dir: "" # optimize-filters-for-hits: true defaultcf: ## Recommend to set it the same as `rocksdb.defaultcf.compression-per-level`. # compression-per-level: ["no", "no", "lz4", "lz4", "lz4", "zstd", "zstd"] # block-size: "64KB" ## Recommend to set it the same as `rocksdb.defaultcf.write-buffer-size`. # write-buffer-size: "128MB" # max-write-buffer-number: 5 # min-write-buffer-number-to-merge: 1 ## Recommend to set it the same as `rocksdb.defaultcf.max-bytes-for-level-base`. # max-bytes-for-level-base: "512MB" # target-file-size-base: "8MB" # level0-file-num-compaction-trigger: 4 # level0-slowdown-writes-trigger: 20 # level0-stop-writes-trigger: 36 # cache-index-and-filter-blocks: true # pin-l0-filter-and-index-blocks: true # compaction-pri: 0 # read-amp-bytes-per-bit: 0 # dynamic-level-bytes: true # optimize-filters-for-hits: true security: ## The path for TLS certificates. Empty string means disabling secure connections. # ca-path: "" # cert-path: "" # key-path: "" # cert-allowed-cn: [] ## Configurations for encryption at rest. Experimental. encryption: ## Encryption method to use for data files. ## Possible values are "plaintext", "aes128-ctr", "aes192-ctr" and "aes256-ctr". Value other than ## "plaintext" means encryption is enabled, in which case master key must be specified. # data-encryption-method: "plaintext" ## Specifies how often TiKV rotates data encryption key. # data-key-rotation-period: "7d" ## Specifies master key if encryption is enabled. There are three types of master key: ## ## * "plaintext": ## ## Plaintext as master key means no master key is given and is only applicable when ## encryption is not enabled, i.e. data-encryption-method: "plaintext". This type doesn't ## have sub-config items. Example: ## ## master-key: ## type: "plaintext" ## ## * "kms": ## ## Use a KMS service to supply master key. Currently only AWS KMS is supported. This type of ## master key is recommended for production use.
Example: ## ## master-key: ## type: "kms" ## ## KMS CMK key id. Must be a valid KMS CMK that the TiKV process has access to. ## ## In production it is recommended to grant access of the CMK to TiKV using IAM. ## key-id: "1234abcd-12ab-34cd-56ef-1234567890ab" ## ## AWS region of the KMS CMK. ## region: "us-west-2" ## ## (Optional) AWS KMS service endpoint. Only required when non-default KMS endpoint is ## ## desired. ## endpoint: "https://kms.us-west-2.amazonaws.com" ## ## * "file": ## ## Supply a custom encryption key stored in a file. It is recommended NOT to use in production, ## as it breaks the purpose of encryption at rest, unless the file is stored in tmpfs. ## The file must contain a 256-bit (32 bytes, regardless of key length implied by ## data-encryption-method) key encoded as hex string and end with newline ("\n"). Example: ## ## master-key: ## type: "file" ## path: "/path/to/master/key/file" ## # master-key: # type: "plaintext" ## Specifies the old master key when rotating master key. Same config format as master-key. ## The key is only accessed once during TiKV startup; after that TiKV does not need access to the key. ## And it is okay to leave the stale previous-master-key config after master key rotation. # previous-master-key: # type: "plaintext" import: pessimistic_txn: ## Enable pessimistic transaction # enabled: true ## Time to wait in milliseconds before responding to TiDB when pessimistic ## transactions encounter locks # wait-for-lock-timeout: "1s" ## If more than one transaction is waiting for the same lock, only the one with smallest ## start timestamp will be woken up immediately when the lock is released. Others will ## be woken up after `wake-up-delay-duration` to reduce contention and make the oldest ## one more likely to acquire the lock. # wake-up-delay-duration: "20ms" gc: ## The number of keys to GC in one batch. # batch-keys: 512 ## Max bytes that GC worker can write to rocksdb in one second. ## If it is set to 0, there is no limit.
# max-write-bytes-per-sec: "0"

================================================
FILE: roles/tikv_importer/defaults/main.yml
================================================
---
# Default variables for the tikv_importer role. Every path below is derived
# from `deploy_dir`, which this file does not define.

dummy:

# the listening address of tikv-importer. tidb-lightning needs to connect to this address to write data. Set it to the actual IP address.
tikv_importer_port: 8287

# this directory is used to store the data written by `tidb-lightning`
import_dir: "{{ deploy_dir }}/data.import"

importer_log_dir: "{{ deploy_dir }}/log"
importer_log_file: "tikv_importer.log"
importer_conf_dir: "{{ deploy_dir }}/conf"

================================================
FILE: roles/tikv_importer/meta/main.yml
================================================
---

dependencies:
  - role: common_dir

================================================
FILE: roles/tikv_importer/tasks/binary_deployment.yml
================================================
---
# Installs the tikv-importer binary and its start/stop scripts. Each
# copy/template task keeps a backup of the file it replaces, and the task
# that follows it moves that backup into `backup_dir`.

- name: deploy tikv-importer binary
  copy:
    src: "{{ resources_dir }}/bin/tikv-importer"
    dest: "{{ deploy_dir }}/bin/"
    mode: "0755"
    backup: yes
  register: tikv_importer

- name: backup tikv-importer binary file
  command: mv "{{ tikv_importer.backup_file }}" "{{ backup_dir }}"
  when: tikv_importer.changed and tikv_importer.backup_file is defined

- name: create run script
  template:
    src: "{{ item }}_importer_binary.sh.j2"
    dest: "{{ deploy_dir }}/scripts/{{ item }}_importer.sh"
    mode: "0755"
    backup: yes
  with_items:
    - start
    - stop
  register: importer_script

- name: backup script file
  command: mv "{{ item.backup_file }}" "{{ backup_dir }}"
  when:
    - item.changed
    - item.backup_file is defined
  with_items: "{{ importer_script.results }}"

================================================
FILE: roles/tikv_importer/tasks/main.yml
================================================
---
# tasks file for tikv-importer
#
# Builds the final tikv-importer configuration from three layers (customized
# config, generated values, role defaults), renders it to TOML and then
# deploys the binary and its run scripts.

- name: create deploy directories
  file:
    path: "{{ item }}"
    state: directory
    mode: "0755"
  with_items:
    - "{{ import_dir }}"
    - "{{ importer_log_dir }}"
    - "{{ importer_conf_dir }}"

- name: "load tikv-importer customized config: tidb-ansible/conf/tikv-importer.yml"
  include_vars:
    file: "{{ playbook_dir }}/conf/tikv-importer.yml"
    name: tikv_importer_conf_custom

- name: load tikv-importer default config
  include_vars:
    file: tikv-importer.yml
    name: tikv_importer_conf_default

# Values that depend on the inventory: log path, listen address, import
# directory and the Pushgateway address taken from the first host of the
# monitoring_servers group (port falls back to 9091).
- name: generate tikv-importer dynamic config
  set_fact:
    tikv_importer_conf_generated:
      global:
        log-file: "{{ importer_log_dir }}/{{ importer_log_file }}"
      server:
        addr: "{{ hostvars[groups.importer_server[0]].ansible_host | default(hostvars[groups.importer_server[0]].inventory_hostname) }}:{{ tikv_importer_port }}"
      import:
        import-dir: "{{ import_dir }}"
      metric:
        address: >-
          {{ hostvars[groups.monitoring_servers[0]].ansible_host | default(hostvars[groups.monitoring_servers[0]].inventory_hostname)
          ~ ':' ~ (hostvars.get(groups.monitoring_servers.0).pushgateway_port | default('9091')) | default('') }}

# NOTE(review): with_default_dicts / update_default_dicts are project filters
# defined outside this file; presumably customized values take precedence
# over generated ones, which take precedence over defaults - confirm.
- name: generate tikv-importer final config
  set_fact:
    tikv_importer_conf: "{{ tikv_importer_conf_custom | with_default_dicts(tikv_importer_conf_generated, tikv_importer_conf_default) | update_default_dicts }}"

- debug:
    var: tikv_importer_conf

- name: create tikv-importer configuration file
  template:
    src: tikv-importer.toml.j2
    dest: "{{ deploy_dir }}/conf/tikv-importer.toml"
    mode: "0644"
    backup: yes
  register: tikv_importer_conf_st

- name: backup tikv-importer conf file
  command: mv "{{ tikv_importer_conf_st.backup_file }}" "{{ backup_dir }}"
  when: tikv_importer_conf_st.changed and tikv_importer_conf_st.backup_file is defined

- include_tasks: "binary_deployment.yml"

================================================
FILE: roles/tikv_importer/templates/start_importer_binary.sh.j2
================================================
#!/bin/bash
# Starts tikv-importer in the background from the deploy directory and
# records its PID in status/tikv-importer.pid.
set -e
ulimit -n 1000000

cd "{{ deploy_dir }}" || exit 1
mkdir -p status

export RUST_BACKTRACE=1

export TZ=${TZ:-/etc/localtime}

echo -n 'sync ... '
stat=$(time sync)
echo ok
echo $stat

nohup ./bin/tikv-importer -C ./conf/tikv-importer.toml &> log/tikv_importer_stderr.log &

# $! is the PID of the background job launched just above.
echo $! > "status/tikv-importer.pid"

================================================
FILE: roles/tikv_importer/templates/stop_importer_binary.sh.j2
================================================
#!/bin/bash
# Stops the tikv-importer process whose PID was recorded by the start script.
set -e
ulimit -n 1000000

cd "{{ deploy_dir }}" || exit 1

export RUST_BACKTRACE=1

export TZ=${TZ:-/etc/localtime}

echo -n 'sync ... '
stat=$(time sync)
echo ok
echo $stat

pid_file="status/tikv-importer.pid"

# Only act when a non-empty PID file exists; a missing file means there is
# nothing to stop and is not an error.
if [ -s "$pid_file" ]; then
  pid=$(cat "$pid_file")
  # Look the PID up exactly (ps -p) and make sure it still belongs to
  # tikv-importer before killing it. Grepping the whole process table for the
  # PID as a substring can match unrelated lines and skip the kill.
  if ps -p "$pid" -o args= 2>/dev/null | grep -q tikv-importer; then
    kill "$pid"
  fi
fi

================================================
FILE: roles/tikv_importer/templates/tikv-importer.toml.j2
================================================
# importer Configuration

{% for item, value in tikv_importer_conf.global | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[server]
{% for item, value in tikv_importer_conf.server | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

[metric]
{% for item, value in tikv_importer_conf.metric | dictsort -%}
{{ item }} = {{ value | to_json}}
{% endfor %}

[rocksdb]
{% for item, value in tikv_importer_conf.rocksdb | dictsort_by_value_type -%}
{% if value is not mapping -%}
{{ item }} = {{ value | to_json}}
{% else %}

[rocksdb.{{ item }}]
{% for sub_item, sub_value in value | dictsort -%}
{{ sub_item }} = {{ sub_value | to_json }}
{% endfor %}
{% endif %}
{% endfor %}

[import]
{% for item, value in tikv_importer_conf.import | dictsort -%}
{{ item }} = {{ value | to_json }}
{% endfor %}

================================================
FILE: roles/tikv_importer/vars/tikv-importer.yml
================================================
---
# TiKV Importer configuration file template

global:
  # log file.
  log-file: "log/tikv_importer.log"
  # log level: trace, debug, info, warn, error, off.
  log-level: "info"

server:
  # size of thread pool for the gRPC server.
  # grpc-concurrency: 16

metric:
  # the Prometheus client push job name.
  job: "tikv-importer"
  # the Prometheus client push interval.
interval: "15s" # the Prometheus Pushgateway address. # address: "" rocksdb: # the maximum number of concurrent background jobs. max-background-jobs: 32 defaultcf: # amount of data to build up in memory before flushing data to the disk. write-buffer-size: "1GB" # the maximum number of write buffers that are built up in memory. max-write-buffer-number: 8 # the compression algorithms used in different levels. # the algorithm at level-0 is used to compress KV data. # the algorithm at level-6 is used to compress SST files. # the algorithms at level-1 ~ level-5 are not used now. compression-per-level: ["lz4", "no", "no", "no", "no", "no", "lz4"] writecf: compression-per-level: ["lz4", "no", "no", "no", "no", "no", "lz4"] import: # this directory is used to store the data written by `tidb-lightning`. # import-dir: "/tmp/tikv/import" # the number of threads to handle RPC requests. num-threads: 16 # the number of concurrent import jobs. num-import-jobs: 24 # the stream channel window size. Stream will be blocked when the channel is full. stream-channel-window: 128 # maximum duration to prepare regions. # max-prepare-duration: "5m" # split regions into this size according to the importing data. # region-split-size: "512MB" # maximum number of open engines # max-open-engines must be >= index-concurrency + table-concurrency value in tidb-lightning.toml max-open-engines: 8 # speed limit of uploading SST to TiKV (unit: byte/s) # upload-speed-limit: "512MB" # minimum ratio of target store available space: store_available_space / store_capacity # Importer will pause uploading SST to the target store if its available ratio is less than # this value, and give the store some time window to balance regions.
min-available-ratio: 0.05

# Note: the machine's memory size should be more than
# (write-buffer-size * max-write-buffer-number * 2) + (num-import-jobs * region-split-size * 2)

================================================
FILE: roles/tispark/tasks/main.yml
================================================
---
# Deploys Spark 2.4.3 plus the TiSpark assembly jar under
# {{ deploy_dir }}/spark and renders the Spark configuration files.

# `creates` makes the task a no-op once Spark has been unpacked and renamed.
- name: deploy spark-2.4.3-bin-hadoop2.7
  unarchive:
    src: "{{ downloads_dir }}/spark-2.4.3-bin-hadoop2.7.tgz"
    dest: "{{ deploy_dir }}/"
    creates: "{{ deploy_dir }}/spark/sbin/start-master.sh"

# The shell module is required here because the source path relies on glob
# expansion of `spark-*`.
- name: rename spark deploy dir
  shell: mv "{{ deploy_dir }}"/spark-* "{{ deploy_dir }}/spark"
  args:
    creates: "{{ deploy_dir }}/spark/sbin/start-master.sh"

- name: deploy tispark
  copy:
    src: "{{ resources_dir }}/bin/tispark-assembly-SNAPSHOT.jar"
    dest: "{{ deploy_dir }}/spark/jars/"

- name: load customized spark_env
  include_vars:
    file: "{{ playbook_dir }}/conf/spark-env.yml"
    name: spark_env_custom

- name: create spark_env.sh file
  template:
    src: spark-env.sh.j2
    dest: "{{ deploy_dir }}/spark/conf/spark-env.sh"
    mode: "0644"
    backup: yes

- name: load customized spark_defaults
  include_vars:
    file: "{{ playbook_dir }}/conf/spark-defaults.yml"
    name: spark_defaults_custom

- name: create spark_defaults.conf file
  template:
    src: spark-defaults.conf.j2
    dest: "{{ deploy_dir }}/spark/conf/spark-defaults.conf"
    mode: "0644"
    backup: yes

# Despite the task name, the file written is sbin/start-slave.sh, and only on
# hosts that belong to the spark_slaves group.
- name: create spark-slave.sh
  template:
    src: "start-slave.sh.j2"
    dest: "{{ deploy_dir }}/spark/sbin/start-slave.sh"
    mode: "0755"
    backup: yes
  when: "'spark_slaves' in group_names"

- name: create log4j.properties file
  template:
    src: log4j.properties.j2
    dest: "{{ deploy_dir }}/spark/conf/log4j.properties"
    mode: "0644"
    backup: yes

================================================
FILE: roles/tispark/templates/log4j.properties.j2
================================================
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Set everything to be logged to the console log4j.rootCategory=INFO, console log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.target=System.err log4j.appender.console.layout=org.apache.log4j.PatternLayout log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n # Set the default spark-shell log level to WARN. When running the spark-shell, the # log level for this class is used to overwrite the root logger's log level, so that # the user can have different defaults for the shell and regular Spark apps. 
log4j.logger.org.apache.spark.repl.Main=WARN # Settings to quiet third party logs that are too verbose log4j.logger.org.spark_project.jetty=WARN log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR log4j.logger.org.apache.spark.repl.SparkIMain$exprTyper=INFO log4j.logger.org.apache.spark.repl.SparkILoop$SparkILoopInterpreter=INFO log4j.logger.org.apache.parquet=ERROR log4j.logger.parquet=ERROR # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR # tispark disable "WARN ObjectStore:568 - Failed to get database" log4j.logger.org.apache.hadoop.hive.metastore.ObjectStore=ERROR ================================================ FILE: roles/tispark/templates/spark-defaults.conf.j2 ================================================ # # Licensed to the Apache Software Foundation (ASF) under one or more # contributor license agreements. See the NOTICE file distributed with # this work for additional information regarding copyright ownership. # The ASF licenses this file to You under the Apache License, Version 2.0 # (the "License"); you may not use this file except in compliance with # the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. # # Default system properties included when running spark-submit. # This is useful for setting default environmental settings. 
# Example:
#spark.eventLog.dir: "hdfs://namenode:8021/directory"
# spark.executor.extraJavaOptions  -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"

{# Emit every user-supplied property from spark_defaults_custom, sorted by key. #}
{% for item, value in spark_defaults_custom | dictsort -%}
{{ item }} {{ value }}
{% endfor %}

{# Build "ip:7077" for every host in the spark_master group. #}
{% set tispark_master = [] -%}
{% set tispark_master_hosts = groups.spark_master %}
{% for host in tispark_master_hosts -%}
  {% set tispark_master_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set _ = tispark_master.append("%s:%s" % (tispark_master_ip, '7077')) -%}
{% endfor -%}

{# Masters are comma-separated (spark://h1:7077,h2:7077); with a single master the result is unchanged. #}
{% if tispark_master %}
spark.master spark://{{ tispark_master | join(',') }}
{% endif %}

{# Build "ip:pd_client_port" for every host in the pd_servers group. #}
{% set all_pd = [] -%}
{% for host in groups.pd_servers -%}
  {% set other_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set other_port = hostvars[host]['pd_client_port'] -%}
  {% set _ = all_pd.append("%s:%s" % (other_ip, other_port)) -%}
{% endfor -%}

spark.tispark.pd.addresses {{ all_pd | join(',') }}

================================================
FILE: roles/tispark/templates/spark-env.sh.j2
================================================
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# # This file is sourced when running various Spark programs. # Copy it as spark-env.sh and edit that to configure Spark for your site. # Options read when launching programs locally with # ./bin/run-example or ./bin/spark-submit # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node # - SPARK_PUBLIC_DNS, to set the public dns name of the driver program # - SPARK_CLASSPATH, default classpath entries to append # Options read by executors and drivers running inside the cluster # - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node # - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program # - SPARK_CLASSPATH, default classpath entries to append # - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data # - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos # Options read in YARN client mode # - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files # - SPARK_EXECUTOR_INSTANCES, Number of executors to start (Default: 2) # - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). # - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) # - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) # Options for the daemons used in the standalone deploy mode # - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname # - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master # - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") # - SPARK_WORKER_CORES, to set the number of cores to use on this machine # - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 
1000m, 2g) # - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker # - SPARK_WORKER_INSTANCES, to set the number of worker processes per node # - SPARK_WORKER_DIR, to set the working directory of worker processes # - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") # - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). # - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") # - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") # - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") # - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers # Generic options for the daemons used in the standalone deploy mode # - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) # - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) # - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) # - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) # - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) # - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. 
#export JAVA_HOME, to set jdk home

{# Emit every user-supplied variable from spark_env_custom as KEY=value, sorted by key. #}
{% for item, value in spark_env_custom | dictsort -%}
{{ item }}={{ value }}
{% endfor %}

{# Collect the address of every host in the spark_master group (ansible_host, falling back to inventory_hostname). #}
{% set tispark_master = [] -%}
{% set tispark_master_hosts = groups.spark_master %}
{% for host in tispark_master_hosts -%}
  {% set tispark_master_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%}
  {% set _ = tispark_master.append("%s" % (tispark_master_ip)) -%}
{% endfor -%}

{# NOTE(review): join('') concatenates addresses with no separator, so this presumably assumes exactly one spark_master host - confirm. #}
{% if tispark_master %}
SPARK_MASTER_HOST={{ tispark_master | join('') }}
{% endif %}

================================================
FILE: roles/tispark/templates/start-slave.sh.j2
================================================
#!/usr/bin/env bash

#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

# Starts a slave on the machine this script is executed on.
#
# Environment Variables
#
#   SPARK_WORKER_INSTANCES  The number of worker instances to run on this
#                           slave.  Default is 1.
#   SPARK_WORKER_PORT       The base port number for the first worker. If set,
#                           subsequent workers will increment this number.  If
#                           unset, Spark will find a valid port number, but
#                           with no guarantee of a predictable pattern.
#   SPARK_WORKER_WEBUI_PORT The base port for the web interface of the first
#                           worker.  Subsequent workers will increment this
#                           number.
Default is 8081. if [ -z "${SPARK_HOME}" ]; then export SPARK_HOME="$(cd "`dirname "$0"`"/..; pwd)" fi # NOTE: This exact class name is matched downstream by SparkSubmit. # Any changes need to be reflected there. CLASS="org.apache.spark.deploy.worker.Worker" if [[ "$@" = *--help ]] || [[ "$@" = *-h ]]; then echo "Usage: ./sbin/start-slave.sh [options] " pattern="Usage:" pattern+="\|Using Spark's default log4j profile:" pattern+="\|Registered signal handlers for" "${SPARK_HOME}"/bin/spark-class $CLASS --help 2>&1 | grep -v "$pattern" 1>&2 exit 1 fi . "${SPARK_HOME}/sbin/spark-config.sh" . "${SPARK_HOME}/bin/load-spark-env.sh" # First argument should be the master; we need to store it aside because we may # need to insert arguments between it and the other arguments {% set tispark_master = [] -%} {% set tispark_master_hosts = groups.spark_master %} {% for host in tispark_master_hosts -%} {% set tispark_master_ip = hostvars[host].ansible_host | default(hostvars[host].inventory_hostname) -%} {% set _ = tispark_master.append("%s:%s" % (tispark_master_ip, '7077')) -%} {% endfor -%} MASTER=spark://{{ tispark_master | join('') }} shift # Determine desired worker port if [ "$SPARK_WORKER_WEBUI_PORT" = "" ]; then SPARK_WORKER_WEBUI_PORT=8081 fi # Start up the appropriate number of workers on this machine. 
# Launch one Spark worker through spark-daemon.sh. $1 is the 1-based worker
# number; every remaining argument is forwarded unchanged.
start_instance() {
  WORKER_NUM=$1
  shift

  # Worker N uses base port + (N - 1). Without a base port both variables
  # stay empty, so no --port option is passed.
  PORT_FLAG=
  PORT_NUM=
  if [ -n "$SPARK_WORKER_PORT" ]; then
    PORT_FLAG="--port"
    PORT_NUM=$(( $SPARK_WORKER_PORT + $WORKER_NUM - 1 ))
  fi

  # The web UI port is offset per worker in the same way.
  WEBUI_PORT=$(( $SPARK_WORKER_WEBUI_PORT + $WORKER_NUM - 1 ))

  "${SPARK_HOME}/sbin/spark-daemon.sh" start $CLASS $WORKER_NUM \
    --webui-port "$WEBUI_PORT" $PORT_FLAG $PORT_NUM $MASTER "$@"
}

# Start a single worker unless SPARK_WORKER_INSTANCES asks for more, in which
# case workers are numbered 1..SPARK_WORKER_INSTANCES.
if [ -z "$SPARK_WORKER_INSTANCES" ]; then
  start_instance 1 "$@"
else
  for ((worker = 1; worker <= $SPARK_WORKER_INSTANCES; worker++)); do
    start_instance "$worker" "$@"
  done
fi

================================================
FILE: rolling_update.yml
================================================
---

# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# The rolling update playbook of TiDB

- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

- name: check system environment
  hosts: monitored_servers
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_system_dynamic

- name: gather all facts, and check dest
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

# Refuse a rolling update that jumps from a release older than 2.0.1 straight
# to 2.1.0 or newer (or to "latest").
- name: Pre-check for rolling update
  hosts: tidb_servers
  any_errors_fatal: true
  tags:
    - always
  tasks:
    # Read-only query of the deployed binary's version banner.
    - shell: "{{ deploy_dir }}/bin/tidb-server -V"
      register: current_version
      changed_when: false

    - name: Check whether can perform rolling update
      vars:
        # Text after the first ':' on the first banner line, with spaces
        # removed (e.g. "v2.0.0").
        current_tidb_version: "{{ current_version.stdout_lines[0].replace(' ','').split(':')[1] }}"
      fail:
        msg: "Rolling update from {{ current_tidb_version }} to {{ tidb_version }} is forbidden"
      when:
        # Compare as versions, not strings: lexicographically "10.0.0" sorts
        # before "2.0.1", which would wrongly block (or allow) updates once a
        # version component reaches two digits.
        - current_tidb_version.strip('v') is version('2.0.1', '<')
        # Non-numeric targets other than "latest" (e.g. a branch name) never
        # trigger the check, as with the previous string comparison.
        - >-
          tidb_version == "latest" or
          (tidb_version is match('v?[0-9]') and
           tidb_version.lstrip('v') is version('2.1.0', '>='))

- name: Pre-check PD configuration
  hosts: pd_servers[0]
  tags:
    - pd
  roles:
    - check_config_pd

- name: Pre-check TiKV configuration
  hosts: tikv_servers[0]
  tags:
    - tikv
  roles:
    - check_config_tikv

- name: Pre-check TiDB configuration
  hosts: tidb_servers[0]
  tags:
    - tidb
  roles:
    - check_config_tidb

# Ask the first PD node for the cluster health report and abort if the
# response contains 'false'. Separate tasks cover the plain HTTP and TLS
# setups.
- hosts: pd_servers[0]
  any_errors_fatal: true
  serial: 1
  tags:
    - pd
  tasks:
    - name: Check pd cluster status
      uri:
        url: "http://{{ ansible_host }}:{{ pd_client_port }}/pd/health"
        method: GET
        return_content: yes
        status_code: 200
      register: pd_status
      when: not enable_tls|default(false)

    - name: Check pd cluster status when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ pd_client_port }}/pd/health"
        validate_certs: no
        client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
        client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
        method: GET
        return_content: yes
        status_code: 200
      register: pd_status_tls
      when: enable_tls|default(false)

    - name: Failed when one node of pd is unhealthy
      fail:
        msg: "Some pd nodes are unhealthy"
      when:
        - not enable_tls|default(false)
        - "'false' in
pd_status.content" - name: Failed when one node of pd is unhealthy when enable_tls fail: msg: "Some pd nodes are unhealthy" when: - enable_tls|default(false) - "'false' in pd_status_tls.content" - hosts: pd_servers any_errors_fatal: true serial: 1 tags: - pd tasks: - set_fact: pd_addr: "{{ ansible_host }}:{{ pd_client_port }}" - include_tasks: "common_tasks/get_pd_leader.yml" when: not enable_tls|default(false) - include_tasks: "common_tasks/get_pd_leader_tls.yml" when: enable_tls|default(false) - set_fact: pd_leader_name: "{{ pd_leader_info.json.name }}" - include_tasks: "common_tasks/get_pd_name.yml" when: not enable_tls|default(false) - include_tasks: "common_tasks/get_pd_name_tls.yml" when: enable_tls|default(false) - name: Set pd follower list add_host: name: "{{ inventory_hostname }}" ansible_host: "{{ ansible_host }}" ansible_ssh_host: "{{ ansible_ssh_host }}" groups: pd_servers_followers deploy_dir: "{{ deploy_dir }}" pd_client_port: "{{ pd_client_port }}" pd_peer_port: "{{ pd_peer_port }}" pd_data_dir: "{{ pd_data_dir }}" pd_log_dir: "{{ pd_log_dir }}" pd_cert_dir: "{{ pd_cert_dir }}" when: pd_leader_name != pd_name - name: Set pd leader list add_host: name: "{{ inventory_hostname }}" ansible_host: "{{ ansible_host }}" ansible_ssh_host: "{{ ansible_ssh_host }}" groups: pd_servers_leader deploy_dir: "{{ deploy_dir }}" pd_client_port: "{{ pd_client_port }}" pd_peer_port: "{{ pd_peer_port }}" pd_data_dir: "{{ pd_data_dir }}" pd_log_dir: "{{ pd_log_dir }}" pd_cert_dir: "{{ pd_cert_dir }}" when: pd_leader_name == pd_name - name: rolling update PD cluster hosts: pd_servers_followers, pd_servers_leader any_errors_fatal: true serial: 1 tags: - pd pre_tasks: - set_fact: pd_addr: "{{ ansible_host }}:{{ pd_client_port }}" - include_tasks: "common_tasks/get_pd_name.yml" when: not enable_tls|default(false) - include_tasks: "common_tasks/get_pd_name_tls.yml" when: enable_tls|default(false) - name: display PD name debug: var: pd_name - name: display PD address debug: 
var: pd_addr
# ^ last argument of a task that begins before this section. Everything down
#   to the first "- name: rolling update TiKV cluster" is the visible remainder
#   of the PD play (rest of pre_tasks, roles, post_tasks); its play header is
#   not part of this section.

    # Look up the current PD leader (plain or TLS variant) and hand leadership
    # over before this node is stopped.
    - include_tasks: "common_tasks/get_pd_leader.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_pd_leader_tls.yml"
      when: enable_tls|default(false)

    - include_tasks: "common_tasks/transfer_pd_leader.yml"

    - name: stop PD by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_pd.sh
      when: process_supervision == 'supervise'

    - name: stop PD by systemd
      systemd:
        name: "pd-{{ pd_client_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the PD port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pd_client_port }}"
        state: stopped
        msg: "the PD port {{ pd_client_port }} is not down"

  roles:
    - pd

  post_tasks:
    - name: start PD by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_pd.sh
      when: process_supervision == 'supervise'

    - name: start PD by systemd
      systemd:
        name: "pd-{{ pd_client_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the PD port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pd_client_port }}"
        state: started
        msg: "the PD port {{ pd_client_port }} is not up"

    # Node-level health: poll /health until it answers 200 with 'true' in the body.
    - name: wait until the PD health page is available
      uri:
        url: "http://{{ ansible_host }}:{{ pd_client_port }}/health"
        return_content: true
      register: pd_http_result
      until: pd_http_result.status == 200 and 'true' in pd_http_result.content
      retries: 12
      delay: 5
      when: not enable_tls|default(false)

    - name: wait until the PD health page is available when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ pd_client_port }}/health"
        validate_certs: false
        client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
        client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
        return_content: true
      register: pd_https_result
      until: pd_https_result.status == 200 and 'true' in pd_https_result.content
      retries: 12
      delay: 5
      when: enable_tls|default(false)

    # Cluster-level health: poll /pd/health until no member reports 'false'.
    - name: wait until the PD cluster is available
      uri:
        url: "http://{{ ansible_host }}:{{ pd_client_port }}/pd/health"
        return_content: true
      register: pd_cluster_status
      until: pd_cluster_status.status == 200 and 'false' not in pd_cluster_status.content
      retries: 12
      delay: 5
      when: not enable_tls|default(false)

    - name: wait until the PD cluster is available when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ pd_client_port }}/pd/health"
        validate_certs: false
        client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem"
        client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem"
        return_content: true
      register: pd_cluster_status
      until: pd_cluster_status.status == 200 and 'false' not in pd_cluster_status.content
      retries: 12
      delay: 5
      when: enable_tls|default(false)

# Rolling update of TiKV, one host at a time (serial: 1). An evict-leader
# scheduler is added before the node is stopped and removed once the node is
# serving its status page again.
- name: rolling update TiKV cluster
  hosts: tikv_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - tikv

  pre_tasks:
    - include_tasks: "common_tasks/get_pd_tikv_addr.yml"

    - include_tasks: "common_tasks/get_store_id.yml"
      when: not enable_tls|default(false)

    - include_tasks: "common_tasks/get_store_id_tls.yml"
      when: enable_tls|default(false)

    - include_tasks: "common_tasks/add_evict_leader_scheduler.yml"

    - name: stop TiKV by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_tikv.sh
      when: process_supervision == 'supervise'

    - name: stop TiKV by systemd
      systemd:
        name: "tikv-{{ tikv_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiKV port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_port }}"
        state: stopped
        msg: "the TiKV port {{ tikv_port }} is not down"

    # Informational only: a missing pid file must not abort the update.
    - command: cat {{ deploy_dir }}/status/tikv.pid
      register: old_tikv_pid
      ignore_errors: true
      changed_when: false

    - name: display old tikv pid
      debug:
        msg: "tikv binary or docker pid: {{ old_tikv_pid.stdout }}"

  roles:
    - tikv

  post_tasks:
    # Fail fast (3s) if something still holds the ports before starting.
    - name: Check if tikv_port already in use
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_port }}"
        state: stopped
        timeout: 3
        msg: "{{ tikv_port }} already in use"

    - name: Check if tikv_status_port already in use
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_status_port }}"
        state: stopped
        timeout: 3
        msg: "{{ tikv_status_port }} already in use"

    - name: start TiKV by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_tikv.sh
      when: process_supervision == 'supervise'

    - name: start TiKV by systemd
      systemd:
        name: "tikv-{{ tikv_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiKV port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_port }}"
        state: started
        msg: "the TiKV port {{ tikv_port }} is not up"

    - name: wait until the TiKV status page is available
      uri:
        url: "http://{{ ansible_host }}:{{ tikv_status_port }}/status"
        return_content: true
      register: tikv_http_result
      until: tikv_http_result.status == 200
      retries: 12
      delay: 5
      when: not enable_tls|default(false)

    # retries matches the non-TLS check (was 10) so TLS clusters get the same
    # 12 x 5s startup window.
    - name: wait until the TiKV status page is available when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ tikv_status_port }}/status"
        validate_certs: false
        client_cert: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}.pem"
        client_key: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}-key.pem"
        return_content: true
      register: tikv_https_result
      until: tikv_https_result.status == 200
      retries: 12
      delay: 5
      when: enable_tls|default(false)

    - command: cat {{ deploy_dir }}/status/tikv.pid
      register: new_tikv_pid
      ignore_errors: true
      changed_when: false

    - name: display new tikv pid
      debug:
        msg: "tikv binary or docker pid: {{ new_tikv_pid.stdout }}"

    - include_tasks: "common_tasks/remove_evict_leader_scheduler.yml"

# Rolling update of pump, one host at a time. Every task and the role are
# skipped unless enable_binlog is set.
- name: rolling update pump cluster
  hosts: pump_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - pump

  pre_tasks:
    - name: stop pump by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - pump
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: stop pump by systemd
      systemd:
        name: "pump-{{ pump_port }}.service"
        state: stopped
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the pump port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pump_port }}"
        state: stopped
        msg: "the pump port {{ pump_port }} is not down"
      when:
        - enable_binlog|default(false)

  roles:
    - role: pump
      when: enable_binlog|default(false)

  post_tasks:
    - name: start pump by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'
      with_items:
        - pump

    - name: start pump by systemd
      systemd:
        name: "pump-{{ pump_port }}.service"
        state: started
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the pump port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pump_port }}"
        state: started
        msg: "the pump port {{ pump_port }} is not up"
      when:
        - enable_binlog|default(false)

# Rolling update of TiDB servers, one host at a time.
- name: rolling update TiDB cluster
  hosts: tidb_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - tidb

  pre_tasks:
    - name: stop TiDB by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_tidb.sh
      when: process_supervision == 'supervise'

    - name: stop TiDB by systemd
      systemd:
        name: "tidb-{{ tidb_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiDB port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_port }}"
        state: stopped
        msg: "the TiDB port {{ tidb_port }} is not down"

  roles:
    - role: tidb

  post_tasks:
    # Fail fast (3s) if something still holds the ports before starting.
    - name: Check if tidb_port already in use
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_port }}"
        state: stopped
        timeout: 3
        msg: "{{ tidb_port }} already in use"

    - name: Check if tidb_status_port already in use
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_status_port }}"
        state: stopped
        timeout: 3
        msg: "{{ tidb_status_port }} already in use"

    - name: start TiDB by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_tidb.sh
      when: process_supervision == 'supervise'

    - name: start TiDB by systemd
      systemd:
        name: "tidb-{{ tidb_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiDB port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_port }}"
        state: started
        msg: "the TiDB port {{ tidb_port }} is not up"

    - name: wait until the TiDB status page is available
      uri:
        url: "http://{{ ansible_host }}:{{ tidb_status_port }}/status"
        return_content: true
      register: tidb_http_result
      until: tidb_http_result.status == 200 and 'TiDB' in tidb_http_result.content
      retries: 12
      delay: 5
      when: not enable_tls|default(false)

    # retries matches the non-TLS check (was 10) so TLS clusters get the same
    # 12 x 5s startup window.
    - name: wait until the TiDB status page is available when enable_tls
      uri:
        url: "https://{{ ansible_host }}:{{ tidb_status_port }}/status"
        validate_certs: false
        client_cert: "{{ tidb_cert_dir }}/tidb-server-{{ ansible_host }}.pem"
        client_key: "{{ tidb_cert_dir }}/tidb-server-{{ ansible_host }}-key.pem"
        return_content: true
      register: tidb_https_result
      until: tidb_https_result.status == 200 and 'TiDB' in tidb_https_result.content
      retries: 12
      delay: 5
      when: enable_tls|default(false)

# Rolling update of TiFlash, one host at a time. Every task and the role are
# restricted to cpu_architecture == 'amd64'.
- name: rolling update TiFlash cluster
  hosts: tiflash_servers
  any_errors_fatal: true
  serial: 1
  tags:
    - tiflash

  pre_tasks:
    - name: stop TiFlash by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_tiflash.sh
      when: process_supervision == 'supervise' and cpu_architecture == 'amd64'

    - name: stop TiFlash by systemd
      systemd:
        name: "tiflash-{{ tcp_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd' and cpu_architecture == 'amd64'

    - name: wait until the TiFlash port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ http_port }}"
        state: stopped
        msg: "the TiFlash port {{ http_port }} is not down"
      when: cpu_architecture == 'amd64'

  roles:
    - role: tiflash
      when: cpu_architecture == 'amd64'

  post_tasks:
    - name: start TiFlash by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_tiflash.sh
      when: process_supervision == 'supervise' and cpu_architecture == 'amd64'

    - name: start TiFlash by systemd
      systemd:
        name: "tiflash-{{ tcp_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd' and cpu_architecture == 'amd64'

    - name: wait until the TiFlash port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ http_port }}"
        state: started
        msg: "the TiFlash port {{ http_port }} is not up"
      when: cpu_architecture == 'amd64'

    # NOTE(review): unlike the PD/TiKV/TiDB checks, no TLS variant of this
    # check is present here; with enable_tls only the port check above runs.
    - name: wait until the TiFlash status page is available
      uri:
        url: "http://{{ ansible_host }}:{{ http_port }}/?query=select%20version()"
        return_content: true
      register: tiflash_http_result
      until: tiflash_http_result.status == 200
      retries: 12
      delay: 5
      when: not enable_tls|default(false) and cpu_architecture == 'amd64'

# Always runs (tag "always"): applies the dashboard_topo role on the control host.
- hosts: localhost
  tags:
    - always
  roles:
    - role: dashboard_topo

================================================
FILE: rolling_update_monitor.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# The rolling update playbook of the TiDB monitoring components
# (node_exporter, blackbox_exporter, alertmanager, pushgateway, prometheus,
# grafana, kafka_exporter). Each play stops the service, re-applies its role,
# starts the service again and waits for it to come back.

- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

- name: gather all facts, and check dest
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

- name: rolling update node_exporter
  hosts: monitored_servers
  any_errors_fatal: true
  tags:
    - node_exporter

  pre_tasks:
    # The stop tasks only run when a previous deployment left the binary behind.
    - name: check node_exporter existed
      stat:
        path: "{{ deploy_dir }}/bin/node_exporter"
      register: node_exporter_binary_file

    - name: stop node_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_node_exporter.sh
      when:
        - process_supervision == 'supervise'
        - node_exporter_binary_file.stat.exists

    - name: stop node_exporter by systemd
      systemd:
        name: "node_exporter-{{ node_exporter_port }}.service"
        state: stopped
      become: true
      when:
        - process_supervision == 'systemd'
        - node_exporter_binary_file.stat.exists

    - name: wait until the node_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ node_exporter_port }}"
        state: stopped
        msg: "the node_exporter port {{ node_exporter_port }} is not down"

  roles:
    - node_exporter

  post_tasks:
    - name: start node_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_node_exporter.sh
      when: process_supervision == 'supervise'

    - name: start node_exporter by systemd
      systemd:
        name: "node_exporter-{{ node_exporter_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the node_exporter port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ node_exporter_port }}"
        state: started
        msg: "the node_exporter port {{ node_exporter_port }} is not up"

    - name: wait until the node_exporter metrics page is available
      uri:
        url: "http://{{ ansible_host }}:{{ node_exporter_port }}/metrics"
      register: node_exporter_http_result
      until: node_exporter_http_result.status == 200
      retries: 12
      delay: 5

- name: rolling update blackbox_exporter
  hosts: monitored_servers
  any_errors_fatal: true
  tags:
    - blackbox_exporter

  pre_tasks:
    # The stop tasks only run when a previous deployment left its config behind.
    - name: check blackbox_exporter existed
      stat:
        path: "{{ deploy_dir }}/conf/blackbox.yml"
      register: blackbox_exporter_configure_file

    - name: stop blackbox_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_blackbox_exporter.sh
      when:
        - process_supervision == 'supervise'
        - blackbox_exporter_configure_file.stat.exists

    - name: stop blackbox_exporter by systemd
      systemd:
        name: "blackbox_exporter-{{ blackbox_exporter_port }}.service"
        state: stopped
      become: true
      when:
        - process_supervision == 'systemd'
        - blackbox_exporter_configure_file.stat.exists

    - name: wait until the blackbox_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ blackbox_exporter_port }}"
        state: stopped
        msg: "the blackbox_exporter port {{ blackbox_exporter_port }} is not down"

  roles:
    - blackbox_exporter

  post_tasks:
    - name: start blackbox_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_blackbox_exporter.sh
      when: process_supervision == 'supervise'

    - name: start blackbox_exporter by systemd
      systemd:
        name: "blackbox_exporter-{{ blackbox_exporter_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the blackbox_exporter port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ blackbox_exporter_port }}"
        state: started
        msg: "the blackbox_exporter port {{ blackbox_exporter_port }} is not up"

    - name: wait until the blackbox_exporter metrics page is available
      uri:
        url: "http://{{ ansible_host }}:{{ blackbox_exporter_port }}/metrics"
      register: blackbox_exporter_http_result
      until: blackbox_exporter_http_result.status == 200
      retries: 12
      delay: 5

- name: rolling update alertmanager
  hosts: alertmanager_servers
  any_errors_fatal: true
  tags:
    - alertmanager

  pre_tasks:
    # The stop tasks only run when a previous deployment left its config behind.
    - name: check alertmanager existed
      stat:
        path: "{{ deploy_dir }}/conf/alertmanager.yml"
      register: alertmanager_configure_file

    - name: stop alertmanager by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_alertmanager.sh
      when:
        - process_supervision == 'supervise'
        - alertmanager_configure_file.stat.exists

    - name: stop alertmanager by systemd
      systemd:
        name: "alertmanager-{{ alertmanager_port }}.service"
        state: stopped
      become: true
      when:
        - process_supervision == 'systemd'
        - alertmanager_configure_file.stat.exists

    - name: wait until the alertmanager port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ alertmanager_port }}"
        state: stopped
        msg: "the alertmanager port {{ alertmanager_port }} is not down"

  roles:
    - alertmanager

  post_tasks:
    - name: start alertmanager by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_alertmanager.sh
      when: process_supervision == 'supervise'

    - name: start alertmanager by systemd
      systemd:
        name: "alertmanager-{{ alertmanager_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the alertmanager port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ alertmanager_port }}"
        state: started
        msg: "the alertmanager port {{ alertmanager_port }} is not up"

- name: rolling update pushgateway
  hosts: monitoring_servers
  any_errors_fatal: true
  tags:
    - pushgateway

  pre_tasks:
    - name: stop pushgateway by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - pushgateway
      when: process_supervision == 'supervise'

    - name: stop pushgateway by systemd
      systemd:
        name: "{{ item }}"
        state: stopped
      when: process_supervision == 'systemd'
      become: true
      with_items:
        - pushgateway-{{ pushgateway_port }}.service

    - name: wait until the pushgateway port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pushgateway_port }}"
        state: stopped
        msg: "the pushgateway port {{ pushgateway_port }} is not down"

  roles:
    - pushgateway

  post_tasks:
    - name: start pushgateway by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - pushgateway

    - name: start pushgateway by systemd
      systemd:
        name: "{{ item }}"
        state: started
        enabled: false
      when: process_supervision == 'systemd'
      become: true
      with_items:
        - pushgateway-{{ pushgateway_port }}.service

    - name: wait until the pushgateway port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pushgateway_port }}"
        state: started
        msg: "the pushgateway port {{ pushgateway_port }} is not up"

    - name: wait until the pushgateway metrics page is available
      uri:
        url: "http://{{ ansible_host }}:{{ pushgateway_port }}/metrics"
      register: pushgateway_http_result
      until: pushgateway_http_result.status == 200
      retries: 12
      delay: 5

- name: rolling update prometheus
  hosts: monitoring_servers
  any_errors_fatal: true
  tags:
    - prometheus

  pre_tasks:
    - name: stop prometheus by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - prometheus
      when: process_supervision == 'supervise'

    - name: stop prometheus by systemd
      systemd:
        name: "{{ item }}"
        state: stopped
      when: process_supervision == 'systemd'
      become: true
      with_items:
        - prometheus-{{ prometheus_port }}.service

    - name: wait until the prometheus port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ prometheus_port }}"
        state: stopped
        msg: "the prometheus port {{ prometheus_port }} is not down"

  roles:
    - prometheus

  post_tasks:
    - name: start prometheus by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - prometheus

    - name: start prometheus by systemd
      systemd:
        name: "{{ item }}"
        state: started
        enabled: false
      when: process_supervision == 'systemd'
      become: true
      with_items:
        - prometheus-{{ prometheus_port }}.service

    - name: wait until the prometheus port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ prometheus_port }}"
        state: started
        msg: "the prometheus port {{ prometheus_port }} is not up"

    - name: wait until the prometheus metrics page is available
      uri:
        url: "http://{{ ansible_host }}:{{ prometheus_port }}/metrics"
      register: prometheus_http_result
      until: prometheus_http_result.status == 200
      retries: 12
      delay: 5

# Besides restarting grafana, this play re-creates the API keys, posts the data
# source and re-imports the dashboards from the control host.
- name: rolling update grafana
  hosts: grafana_servers
  any_errors_fatal: true
  tags:
    - grafana

  pre_tasks:
    - name: stop grafana by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - grafana

    - name: stop grafana by systemd
      systemd:
        name: "grafana-{{ grafana_port }}.service"
        state: stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the grafana port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ grafana_port }}"
        state: stopped
        msg: "the grafana port {{ grafana_port }} is not down"

  roles:
    - grafana

  post_tasks:
    - name: start grafana by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - grafana

    - name: start grafana by systemd
      systemd:
        name: "grafana-{{ grafana_port }}.service"
        state: started
        enabled: false
      when: process_supervision == 'systemd'
      become: true

    - name: wait until the grafana port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ grafana_port }}"
        state: started
        msg: "the grafana port {{ grafana_port }} is not up"

    - name: wait until the grafana login page is available
      uri:
        url: "http://{{ ansible_host }}:{{ grafana_port }}/login"
      register: grafana_http_result
      until: grafana_http_result.status == 200
      retries: 12
      delay: 5

    # grafana_host is read by the dashboard import config below.
    - set_fact:
        grafana_host: "{{ ansible_host }}"

    - include_tasks: "common_tasks/create_grafana_api_keys.yml"

    # Runs on the grafana host itself, hence 127.0.0.1. chdir/warn are passed
    # as leading key=value arguments of the shell module.
    - name: import grafana data source
      shell: >
        chdir={{ grafana_data_dir }} warn=no
        curl -q -X POST -d @data_source.json --header 'Content-Type: application/json'
        "http://{{ grafana_admin_user }}:{{ grafana_admin_password }}@127.0.0.1:{{ grafana_port }}/api/datasources"

    # Task vars are written as one mapping (previously a list of one-key
    # dictionaries): same variables, but accepted by old and new Ansible alike.
    - name: import grafana dashboards - prepare config
      delegate_to: localhost
      template:
        src: grafana.dest.json.j2
        dest: "{{ playbook_dir }}/scripts/dests-{{ inventory_hostname }}.json"
      vars:
        ansible_become: false
        ansible_connection: local
        grafana_dest_config:
          name: "{{ cluster_name | title }}"
          url: "http://{{ grafana_host }}:{{ grafana_port }}/"
          user: "{{ grafana_admin_user }}"
          password: "{{ grafana_admin_password }}"
          apikey: "{{ lookup('file', grafana_api_keys_dir + '/grafana_apikey.key') }}"
          datasource: "{{ cluster_name }}"
          titles:
            br: "{{ cluster_name | title }}-Backup-Restore"
            node: "{{ cluster_name | title }}-Node_exporter"
            pd: "{{ cluster_name | title }}-PD"
            tidb: "{{ cluster_name | title }}-TiDB"
            tidb_summary: "{{ cluster_name | title }}-TiDB-Summary"
            tikv_summary: "{{ cluster_name | title }}-TiKV-Summary"
            tikv_details: "{{ cluster_name | title }}-TiKV-Details"
            tikv_trouble_shot: "{{ cluster_name | title }}-TiKV-Trouble-Shooting"
            tiflash_summary: "{{ cluster_name | title }}-TiFlash-Summary"
            tiflash_proxy_summary: "{{ cluster_name | title }}-TiFlash-Proxy-Summary"
            tiflash_proxy_details: "{{ cluster_name | title }}-TiFlash-Proxy-Details"
            binlog: "{{ cluster_name | title }}-Binlog"
            overview: "{{ cluster_name | title }}-Overview"
            disk_performance: "{{ cluster_name | title }}-Disk-Performance"
            blackbox_exporter: "{{ cluster_name | title }}-Blackbox_exporter"
            kafka_overview: "{{ cluster_name | title }}-Kafka-Overview"
            lightning: "{{ cluster_name | title }}-Lightning"
            performance_read: "{{ cluster_name | title }}-Performance-Read"
            performance_write: "{{ cluster_name | title }}-Performance-Write"

    - name: import grafana dashboards - run import script
      delegate_to: localhost
      shell: "python grafana-config-copy.py dests-{{ inventory_hostname }}.json"
      args:
        chdir: "{{ playbook_dir }}/scripts"
      vars:
        ansible_become: false
        ansible_connection: local

# All kafka_exporter tasks are skipped unless enable_binlog is set.
- name: rolling update kafka_exporter
  hosts: kafka_exporter_servers
  any_errors_fatal: true
  tags:
    - kafka_exporter

  pre_tasks:
    - name: check kafka_exporter existed
      stat:
        path: "{{ deploy_dir }}/bin/kafka_exporter"
      register: kafka_exporter_binary_file
      when: enable_binlog|default(false)

    # enable_binlog is listed first so the stat result is only inspected when
    # the stat task above actually ran.
    - name: stop kafka_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_kafka_exporter.sh
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'
        - kafka_exporter_binary_file.stat.exists

    - name: stop kafka_exporter by systemd
      become: true
      systemd:
        name: "kafka_exporter-{{ kafka_exporter_port }}.service"
        state: stopped
        enabled: false
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'
        - kafka_exporter_binary_file.stat.exists

    - name: wait until the kafka_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ kafka_exporter_port }}"
        state: stopped
        msg: "the kafka_exporter port {{ kafka_exporter_port }} is not down"
      when: enable_binlog|default(false)

  roles:
    - role: kafka_exporter
      when: 'enable_binlog|default(false) and kafka_addrs|default("") != ""'

  # NOTE(review): the role above also requires a non-empty kafka_addrs, but the
  # start/wait tasks below only check enable_binlog - confirm this is intended.
  post_tasks:
    - name: start kafka_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_kafka_exporter.sh
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: start kafka_exporter by systemd
      become: true
      systemd:
        name: "kafka_exporter-{{ kafka_exporter_port }}.service"
        state: started
        enabled: false
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the kafka_exporter port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ kafka_exporter_port }}"
        state: started
        msg: "the kafka_exporter port {{ kafka_exporter_port }} is not up"
      when: enable_binlog|default(false)

================================================
FILE: scripts/binlog.json
================================================
{ "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" } ], "annotations": { "list": [ { "builtIn":
1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": null, "iteration": 1569404109122, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 73, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 1 }, "hideTimeOverride": false, "id": 68, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "binlog_pump_storage_storage_size_bytes", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}} : {{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage Size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 1 }, "id": 63, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": false, "min": false, 
"rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "binlog_pump_storage_gc_ts", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} : gc_tso", "refId": "A" }, { "expr": "binlog_pump_storage_max_commit_ts", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} : max_commit_tso", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Metadata", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dateTimeAsIso", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 8 }, "id": 7, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(binlog_pump_rpc_duration_seconds_count{method=\"WriteBinlog\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} :: {{label}}", "metric": "binlog_cistern_rpc_duration_seconds_bucket", "refId": 
"A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write Binlog QPS by Instance", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 8 }, "id": 3, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, rate(binlog_pump_rpc_duration_seconds_bucket{method=\"WriteBinlog\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} : {{method}}:99", "refId": "A" }, { "expr": "histogram_quantile(0.95, rate(binlog_pump_rpc_duration_seconds_bucket{method=\"WriteBinlog\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} : {{method}} : 95", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write Binlog Latency", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": 
null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 15 }, "id": 44, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, rate(binlog_pump_storage_write_binlog_size_bucket[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} : {{type}} : 99", "refId": "B" }, { "expr": "histogram_quantile(0.95, rate(binlog_pump_storage_write_binlog_size_bucket[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} : {{type}} : 95", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage Write Binlog Size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 15 }, "id": 66, "legend": { "alignAsTable": true, "avg": false, 
"current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, rate(binlog_pump_storage_write_binlog_duration_time_bucket[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} : {{type}}:99", "refId": "A" }, { "expr": "histogram_quantile(0.95, rate(binlog_pump_storage_write_binlog_duration_time_bucket[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} : {{type}}:95", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage Write Binlog Latency", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 22 }, "id": 48, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(binlog_pump_storage_error_count[1m])", "format": "time_series", 
"instant": false, "intervalFactor": 2, "legendFormat": "{{instance}}:{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Pump Storage Error By Type", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 22 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "binlog_pump_storage_query_tikv_count", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Query Tikv", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, 
"gridPos": { "h": 7, "w": 12, "x": 0, "y": 29 }, "id": 76, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tidb_server_critical_error_total", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TiDB Server Skip Binlog Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "pump", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 74, "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TEST-CLUSTER}", "format": "dateTimeAsIso", "gauge": { "maxValue": null, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 23 }, "hideTimeOverride": false, "id": 70, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { 
"from": "null", "text": "N/A", "to": "null" } ], "repeat": null, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "__name__", "targets": [ { "expr": "binlog_drainer_checkpoint_tso{instance = \"$drainer_instance\"}", "format": "time_series", "instant": true, "intervalFactor": 2, "legendFormat": "checkpoint tso", "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Checkpoint TSO", "transparent": false, "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 16, "x": 8, "y": 23 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "binlog_drainer_pump_position{instance = \"$drainer_instance\"}", "format": "time_series", "hide": false, "instant": false, "intervalFactor": 2, "legendFormat": "{{nodeID}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Pump Handle TSO", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "transparent": false, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dateTimeAsIso", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, 
"dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "id": 62, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(binlog_drainer_read_binlog_size_count{instance = \"$drainer_instance\"}[1m])) by (nodeID)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{nodeID}}", "metric": "binlog_drainer_event", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Pull Binlog QPS by Pump NodeID", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "id": 53, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"histogram_quantile(0.95, rate(binlog_drainer_binlog_reach_duration_time_bucket{instance = \"$drainer_instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{nodeID}}", "metric": "binlog_drainer_event", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "95% Binlog Reach Duration By Pump", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 37 }, "id": 58, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "binlog_drainer_error_count{instance = \"$drainer_instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "binlog_drainer_position", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Error By Type", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 37 }, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(binlog_drainer_event{instance = \"$drainer_instance\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "binlog_drainer_event", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Drainer Event", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 44 }, "id": 15, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"histogram_quantile(0.99, rate(binlog_drainer_execute_duration_time_bucket{instance = \"$drainer_instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}}", "metric": "binlog_drainer_txn_duration_time_bucket", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "99% Execute Time", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 44 }, "id": 71, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, rate(binlog_drainer_query_duration_time_bucket{instance = \"$drainer_instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "binlog_drainer_txn_duration_time_bucket", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "99% sql query Time", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true 
}, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 51 }, "id": 55, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, rate(binlog_drainer_read_binlog_size_bucket{instance = \"$drainer_instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "pump: {{nodeID}}", "metric": "binlog_drainer_event", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "95% Binlog Size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 51 }, "id": 52, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": 
"flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "binlog_drainer_ddl_jobs_total{instance = \"$drainer_instance\"}", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "ddl job count", "metric": "binlog_drainer_position", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "DDL Job Count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 58 }, "id": 72, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "binlog_drainer_queue_size{instance = \"$drainer_instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "queue size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "title": 
"drainer", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 75, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 59 }, "id": 9, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "go_goroutines{job=~\"binlog|pump|drainer\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "go_goroutines", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Goroutine", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 59 }, "id": 39, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": 
false, "steppedLine": false, "targets": [ { "expr": "go_memstats_heap_inuse_bytes{job=~\"binlog|pump|drainer\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "go_goroutines", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Memory", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "title": "node", "type": "row" } ], "refresh": "10s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "drainer_instance", "options": [], "query": "label_values(binlog_drainer_ddl_jobs_total, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-Binlog", "uid": "RDdDTFvZz", "version": 9 } ================================================ FILE: scripts/blackbox_exporter.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { 
"type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "table", "name": "Table", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": null, "iteration": 1564734554417, "links": [], "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 30, "panels": [], "repeat": null, "title": "Network Status", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 1 }, "id": 1, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "probe_duration_seconds{job=\"$PingJobHost\"}", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ping Latency", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "collapsed": false, "gridPos": { "h": 
1, "w": 24, "x": 0, "y": 8 }, "id": 31, "panels": [], "repeat": null, "title": "Services Port Status", "type": "row" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "hideTimeOverride": true, "id": 28, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": null, "desc": false }, "styles": [ { "alias": "Time", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "type": "date" }, { "alias": "Service", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "pattern": "Metric", "thresholds": [], "type": "string", "unit": "short" }, { "alias": "Up", "colorMode": "cell", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "pattern": "Current", "thresholds": [ "0", "1" ], "type": "number", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/.*/", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "count(probe_success{group=\"tidb\"} == 1)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "TiDB", "refId": "A" }, { "expr": "count(probe_success{group=\"pd\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "PD", "refId": "B" }, { "expr": "count(probe_success{group=\"tikv\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "TiKV", "refId": "C" }, { "expr": "count(probe_success{group=\"pump\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pump", "refId": "D" }, { "expr": "count(probe_success{group=\"drainer\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"Drainer", "refId": "E" }, { "expr": "count(probe_success{group=\"kafka\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Kafka", "refId": "F" }, { "expr": "count(probe_success{group=\"zookeeper\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Zookeeper", "refId": "G" }, { "expr": "count(probe_success{group=\"node_exporter\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Node_exporter", "refId": "H" }, { "expr": "count(probe_success{group=\"blackbox_exporter\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blackbox_exporter", "refId": "I" }, { "expr": "count(probe_success{group=\"grafana\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Grafana", "refId": "J" }, { "expr": "count(probe_success{job=\"blackbox_exporter_http\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pushgateway", "refId": "K" }, { "expr": "count(probe_success{group=\"kafka_exporter\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Kafka_exporter", "refId": "L" }, { "expr": "count(probe_success{group=\"tiflash\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "TiFlash", "refId": "M" } ], "timeFrom": "1s", "title": "", "transform": "timeseries_aggregations", "type": "table" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "hideTimeOverride": true, "id": 29, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": null, "desc": false }, "styles": [ { "alias": "Time", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "type": "date" }, { "alias": "Service", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "pattern": "Metric", "thresholds": [], 
"type": "string", "unit": "short" }, { "alias": "Down", "colorMode": "cell", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "pattern": "Current", "thresholds": [ "100", "200" ], "type": "number", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/.*/", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "count(probe_success{group=\"tidb\"} == 0)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "TiDB", "refId": "A" }, { "expr": "count(probe_success{group=\"pd\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "PD", "refId": "B" }, { "expr": "count(probe_success{group=\"tikv\"} == 0)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "TiKV", "refId": "C" }, { "expr": "count(probe_success{group=\"pump\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pump", "refId": "D" }, { "expr": "count(probe_success{group=\"drainer\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Drainer", "refId": "E" }, { "expr": "count(probe_success{group=\"kafka\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Kafka", "refId": "F" }, { "expr": "count(probe_success{group=\"zookeeper\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Zookeeper", "refId": "G" }, { "expr": "count(probe_success{group=\"node_exporter\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Node_exporter", "refId": "H" }, { "expr": "count(probe_success{group=\"blackbox_exporter\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blackbox_exporter", "refId": "I" }, { "expr": "count(probe_success{group=\"grafana\"} == 0)", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "Grafana", "refId": "J" }, { "expr": "count(probe_success{job=\"blackbox_exporter_http\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pushgateway", "refId": "K" }, { "expr": "count(probe_success{group=\"kafka_exporter\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Kafka_exporter", "refId": "L" }, { "expr": "count(probe_success{group=\"tiflash\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "TiFlash", "refId": "M" } ], "timeFrom": "1s", "title": "", "transform": "timeseries_aggregations", "type": "table" } ], "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": false, "label": "PingJobHost", "multi": false, "name": "PingJobHost", "options": [], "query": "label_values(probe_duration_seconds,job)", "refresh": 1, "regex": "/blackbox_exporter.*icmp/", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-5m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "Test-Cluster-Blackbox_exporter", "uid": "DaODoKDZk", "version": 2 } ================================================ FILE: scripts/br.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "panel", "id": "heatmap", "name": "Heatmap", "version": "" }, { "type": 
"datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": 16, "iteration": 1577953179687, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 15, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 1 }, "id": 4, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"backup_worker.*\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "backup-worker", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"backup_endpoint\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "backup-endpoint", "metric": "tikv_thread_cpu_seconds_total", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Backup CPU Utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" 
}, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 1 }, "id": 13, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": false, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IO Utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "", "fill": 1, 
"gridPos": { "h": 7, "w": 7, "x": 0, "y": 8 }, "id": 10, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tikv_backup_error_counter[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "{{error}} {{instance}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Backup Errors", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 9, "x": 7, "y": 8 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_backup_range_size_bytes_sum{instance=~\"$instance\"}[1m]))", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "backup-flow", "metric": "", 
"refId": "A", "step": 4 }, { "expr": "rate(tikv_backup_range_size_bytes_sum[1m])", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "BackupSST Generation Throughput", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 8 }, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_backup_range_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}} - 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_backup_range_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}} - 95%", "refId": "B", "step": 4 }, { "expr": 
"sum(rate(tikv_backup_range_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_backup_range_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}} - avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "One Backup Range Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 15 }, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": false, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_backup_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_backup_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": 
"sum(rate(tikv_backup_request_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_backup_request_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "One Backup Subtask Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 15 }, "id": 12, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-100%", "refId": "E" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"{{req}}-99%", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Checksum Request Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 1, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Backup", "type": "row" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 17, "panels": [], "title": "Restore", "type": "row" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 2 }, "id": 21, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", 
"label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 2 }, "id": 19, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IO Utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe number of leaders on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, "id": 25, 
"legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 }, { "expr": "delta(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[30s]) < -10", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Regions on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }, "id": 29, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, 
"links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"region\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }, "id": 33, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_import_download_duration_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}-99%", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_import_download_duration_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}-95%", "refId": "B" }, { "expr": 
"sum(rate(tikv_import_download_duration_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_import_download_duration_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "{{type}}-avg", "refId": "C" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_import_ingest_duration_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}-99%", "refId": "D" }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_import_ingest_duration_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}-95%", "refId": "E" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Process SST Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }, "id": 31, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_import_download_bytes_sum{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "download-flow", 
"refId": "A" }, { "expr": "rate(tikv_import_download_bytes_sum[1m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "DownLoad SST Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 }, "id": 27, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tikv_import_error_counter[1m])", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "{{error}}-{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Restore Errors", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, 
"dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 }, "id": 23, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-100%", "refId": "E" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Checksum Request Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 1, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "refresh": false, "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": ".*", "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "label_values(tikv_engine_size_bytes, instance)", "hide": 0, 
def get_file_content(path, default=None, strip=True):
    """Return the text content of ``path``, or ``default`` if it is unusable.

    Args:
        path: File to read (this script passes sysfs entries).
        default: Value returned when the file does not exist, is not
            readable, cannot be read, or turns out to be empty.
        strip: When true, leading/trailing whitespace is removed before the
            emptiness check.

    Returns:
        The (optionally stripped) file content, or ``default``.
    """
    if not (os.path.exists(path) and os.access(path, os.R_OK)):
        return default

    try:
        # ``with`` closes the handle on every path.  The previous
        # try/finally called ``datafile.close()`` even when ``open()`` itself
        # had failed, raising a NameError that was then silently swallowed.
        with open(path) as datafile:
            data = datafile.read()
    except (IOError, OSError, ValueError):
        # Best effort: some entries may exist but still refuse to be read
        # (or decoded); callers only want "content or default".
        return default

    if strip:
        data = data.strip()
    if len(data) == 0:
        return default
    return data
/* Probe whether the running kernel supports EPOLLEXCLUSIVE.
 *
 * The probe registers an eventfd with EPOLLEXCLUSIVE | EPOLLONESHOT:
 *   - epoll_ctl() failing with EINVAL  -> returns true;
 *   - epoll_ctl() succeeding           -> returns false (reported as
 *     evidence of no EPOLLEXCLUSIVE support);
 *   - any other failure (epoll_create1, eventfd, or epoll_ctl with a
 *     different errno) -> returns false.
 *
 * The reason for a negative result is printed at most once per process.
 * Both descriptors are closed before returning on every path. */
bool grpc_is_epollexclusive_available(void) {
  static bool logged_why_not = false;

  int fd = epoll_create1(EPOLL_CLOEXEC);
  if (fd < 0) {
    if (!logged_why_not) {
      /* Report errno: the return value is always -1 on failure and says
         nothing about the cause. */
      printf(
          "epoll_create1 failed with error: %d. Not using epollex polling "
          "engine.",
          errno);
      logged_why_not = true;
    }
    return false;
  }

  int evfd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
  if (evfd < 0) {
    if (!logged_why_not) {
      /* Previously printed `fd` (the valid epoll descriptor), which is
         unrelated to why eventfd() failed. */
      printf(
          "eventfd failed with error: %d. Not using epollex polling "
          "engine.",
          errno);
      logged_why_not = true;
    }
    close(fd);
    return false;
  }

  struct epoll_event ev;
  /* choose events that should cause an error on
     EPOLLEXCLUSIVE enabled kernels - specifically the combination of
     EPOLLONESHOT and EPOLLEXCLUSIVE */
  ev.events = (uint32_t)(EPOLLET | EPOLLIN | EPOLLEXCLUSIVE | EPOLLONESHOT);
  ev.data.ptr = NULL;

  if (epoll_ctl(fd, EPOLL_CTL_ADD, evfd, &ev) != 0) {
    if (errno != EINVAL) {
      if (!logged_why_not) {
        printf(
            "epoll_ctl with EPOLLEXCLUSIVE | EPOLLONESHOT failed with error: "
            "%d. Not using epollex polling engine.",
            errno);
        logged_why_not = true;
      }
      close(fd);
      close(evfd);
      return false;
    }
    /* errno == EINVAL: the flag combination was rejected, fall through to
       the success path below. */
  } else {
    if (!logged_why_not) {
      printf(
          "epoll_ctl with EPOLLEXCLUSIVE | EPOLLONESHOT succeeded. This is "
          "evidence of no EPOLLEXCLUSIVE support. Not using "
          "epollex polling engine.");
      logged_why_not = true;
    }
    close(fd);
    close(evfd);
    return false;
  }

  close(evfd);
  close(fd);
  return true;
}
if __name__ == '__main__':
    # Script entry point: load the fio JSON report named by --target and
    # print either one requested metric or a human-readable summary.
    opts = parse_opts()

    if not opts.target:
        print("Please add `--target` flag to specify file path of fio JSON output.")
        sys.exit(1)

    with open(opts.target) as report:
        fio = json.load(report)

    # Only the first job of the report is inspected.
    first_job = fio['jobs'][0]
    job_name = first_job['jobname']
    rw_mode = fio['global options']['rw']

    rd = first_job['read']
    wr = first_job['write']
    rd_lat = rd['lat_ns']
    wr_lat = wr['lat_ns']
    rd_pct = rd['clat_ns']['percentile']
    wr_pct = wr['clat_ns']['percentile']

    # All figures are truncated to integers before being reported.
    rd_iops = int(rd['iops'])
    rd_min, rd_avg, rd_max = [int(rd_lat[k]) for k in ('min', 'mean', 'max')]
    rd_p95, rd_p99 = [int(rd_pct[k]) for k in ('95.000000', '99.000000')]

    wr_iops = int(wr['iops'])
    wr_min, wr_avg, wr_max = [int(wr_lat[k]) for k in ('min', 'mean', 'max')]
    wr_p95, wr_p99 = [int(wr_pct[k]) for k in ('95.000000', '99.000000')]

    read_summary = "read: IOPS={}\nlat (ns): min={}, max={}, avg={}\nclat percentiles (ns): 95.00th={}, 99.00th={}".format(
        rd_iops, rd_min, rd_max, rd_avg, rd_p95, rd_p99)
    write_summary = "write: IOPS={}\nlat (ns): min={}, max={}, avg={}\nclat percentiles (ns): 95.00th={}, 99.00th={}".format(
        wr_iops, wr_min, wr_max, wr_avg, wr_p95, wr_p99)

    # Single-metric flags are checked in a fixed order; the first one that
    # is set wins and ends the script.
    single_metrics = (
        (opts.read_iops, rd_iops),
        (opts.read_lat, rd_avg),
        (opts.write_iops, wr_iops),
        (opts.write_lat, wr_avg),
    )
    for requested, value in single_metrics:
        if requested:
            print(value)
            sys.exit()

    if opts.summary:
        print("jobname: {}".format(job_name))
        # Print only the directions that the job's rw mode exercises.
        if rw_mode in ("read", "randread", "readwrite", "rw", "randrw"):
            print(read_summary)
        if rw_mode in ("write", "randwrite", "readwrite", "rw", "randrw", "trimwrite"):
            print(write_summary)
        sys.exit()
# Run a command over ssh on every node of the selected group.
#
# Usage: cls_run [--tikv|--tidb|--pd] [--background] COMMAND [ARG...]
#   --tikv / --tidb / --pd  use NODE_LIST_TIKV / NODE_LIST_TIDB / NODE_LIST_PD
#                           instead of NODE_LIST
#   --background            start all ssh sessions in parallel, then wait
#
# Returns 1 (without running anything) when NODE_LIST is empty.
function cls_run() {
    if [ -z "$NODE_LIST" ]; then
        echo
        echo Error: NODE_LIST environment variable must be set in .bash_profile
        # `return`, not `exit`: this file defines aliases and exports, so it
        # appears to be meant for sourcing; `exit` here would terminate the
        # caller's shell instead of just failing the command.
        return 1
    fi

    case "$1" in
        --tikv) shift; HOST_LIST=$NODE_LIST_TIKV ;;
        --tidb) shift; HOST_LIST=$NODE_LIST_TIDB ;;
        --pd)   shift; HOST_LIST=$NODE_LIST_PD ;;
        *)      HOST_LIST=$NODE_LIST ;;
    esac

    local i
    if [[ $1 = '--background' ]]; then
        shift
        # $HOST_LIST and $sshauth are left unquoted on purpose: both are
        # whitespace-separated word lists that must be split.
        for i in $HOST_LIST; do
            # -n keeps backgrounded sessions from reading our stdin.
            ssh $sshauth -oStrictHostKeyChecking=no -n "$i" "$@" &
        done
        wait
    else
        for i in $HOST_LIST; do
            ssh $sshauth -oStrictHostKeyChecking=no "$i" "$@"
        done
    fi
}
""" parser = argparse.ArgumentParser(description="Parse output.") # pd is involved because we need to send http request for target in ComponentToRegister: parser.add_argument("--{}".format(target), help="the address list of {}".format(target)) args, unknown = parser.parse_known_args() return args def etcd_write(etcd_url, key, value): encoded_key = base64.b64encode(key) encoded_value = base64.b64encode(value) data = json.dumps({ "key": encoded_key, "value": encoded_value, }) req = urllib2.Request('http://' + etcd_url + '/v3/kv/put', data=data, headers={'Content-Type': 'application/json'}) try: resp = urllib2.urlopen(req) data = json.load(resp) return data except urllib2.HTTPError as error: raise error def parse_address(con): """ con: str for argument like "127.0.0.1:2379/deploy" return: Tuple[str, str] like ("127.0.0.1:2379", "/deploy") """ pos = con.find('/') return (con[:pos], con[pos:]) def request_topo(comp, topo, etcd_target): """ Sending request to etcd v3, and leave: under {pd_target}: write: /topology/{comp}: {ip: ip, address: address} comp: str for component name, which will be like "tidb" topo: str for topology address, like "127.0.0.1:4000" pd_target: the place to send etcd request, like "127.0.0.1:2379" """ if topo is None: # if topo is None, do nothing return if ',' in topo: topo = topo.split(',')[0] ip, add = parse_address(topo) ip, port = ip.split(':') message = json.dumps({ 'ip': ip, 'binary_path': add, 'port': int(port), }) etcd_write(etcd_target, "/topology/" + comp, message) def concat_to_address(ip, port): """ ip: str for address to concat, like "127.0.0.1" port: str for port, like "2379" return: str like "127.0.0.1:2379" return None if ip or port is None """ if ip is None or port is None: return None return ip.strip() + ":" + port.strip() if __name__ == '__main__': args = parse_opts() # parse from args pd_address = args.pd pd_address_zero, _ = parse_address(pd_address.split(',')[0]) alertmanager_address = args.alertmanager grafana_address = 
args.grafana prometheus_address = args.prometheus mapping = { 'alertmanager': alertmanager_address, 'grafana': grafana_address, 'prometheus': prometheus_address, } for comp in ComponentToRegister: if comp == 'pd': continue request_topo(comp, mapping[comp], pd_address_zero) ================================================ FILE: scripts/disk_performance.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1564734594079, "links": [], "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows average latency for Reads and Writes IO Devices. Higher than typical latency for highly loaded storage indicates saturation (overload) and is frequent cause of performance problems. 
Higher than normal latency also can indicate internal storage problems.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 0 }, "id": 11, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": false, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "(rate(node_disk_read_time_seconds_total{device=~\"$device\", instance=\"$host\"}[$interval]) / rate(node_disk_reads_completed_total{device=~\"$device\", instance=\"$host\"}[$interval])) or (irate(node_disk_read_time_seconds_total{device=~\"$device\", instance=\"$host\"}[5m]) / irate(node_disk_reads_completed_total{device=~\"$device\", instance=\"$host\"}[5m]))", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Read: {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" }, { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "(rate(node_disk_write_time_seconds_total{device=~\"$device\", instance=\"$host\"}[$interval]) / rate(node_disk_writes_completed_total{device=~\"$device\", instance=\"$host\"}[$interval])) or (irate(node_disk_write_time_seconds_total{device=~\"$device\", instance=\"$host\"}[5m]) / irate(node_disk_writes_completed_total{device=~\"$device\", instance=\"$host\"}[5m]))", "format": "time_series", "hide": true, "interval": "$interval", "intervalFactor": 1, "legendFormat": "Write: {{ device }}", "metric": "", "refId": "B", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Latency", 
"tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": "", "logBase": 2, "max": null, "min": null, "show": true }, { "format": "s", "label": "", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows amount of physical IOs (reads and writes) different devices are serving. Spikes in number of IOs served often corresponds to performance problems due to IO subsystem overload.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 7 }, "id": 15, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_reads_completed_total{device=~\"$device\", instance=\"$host\"}[$interval]) or irate(node_disk_reads_completed_total{device=~\"$device\", instance=\"$host\"}[5m])", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Read: {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" }, { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_writes_completed_total{device=~\"$device\", instance=\"$host\"}[$interval]) or irate(node_disk_writes_completed_total{device=~\"$device\", 
instance=\"$host\"}[5m])", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Write: {{ device }}", "metric": "", "refId": "B", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Operations", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "iops", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows volume of reads and writes the storage is handling. This can be better measure of IO capacity usage for network attached and SSD storage as it is often bandwidth limited. 
Amount of data being written to the disk can be used to estimate Flash storage life time.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 14 }, "id": 16, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_read_bytes_total{device=~\"$device\", instance=\"$host\"}[$interval]) or irate(node_disk_read_bytes_total{device=~\"$device\", instance=\"$host\"}[5m])", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Read: {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" }, { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_written_bytes_total{device=~\"$device\", instance=\"$host\"}[$interval]) or irate(node_disk_written_bytes_total{device=~\"$device\", instance=\"$host\"}[5m])", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Write: {{ device }}", "metric": "", "refId": "B", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Bandwidth", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], 
"yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows how much disk was loaded for reads or writes as average number of outstanding requests at different period of time. High disk load is a good measure of actual storage utilization. Different storage types handle load differently - some will show latency increases on low loads others can handle higher load with no problems.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 21 }, "id": 14, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": false, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 1, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_read_time_seconds_total{device=~\"$device\", instance=\"$host\"}[$interval]) or irate(node_disk_read_time_seconds_total{device=~\"$device\", instance=\"$host\"}[5m])", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Read: {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" }, { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_write_time_seconds_total{device=~\"$device\", instance=\"$host\"}[$interval]) or irate(node_disk_write_time_seconds_total{device=~\"$device\", instance=\"$host\"}[5m])", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Write: {{ device }}", "metric": "", "refId": "B", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, "title": "Disk Load", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows disk Utilization as percent of the time when there was at least one IO request in flight. It is designed to match utilization available in iostat tool. It is not very good measure of true IO Capacity Utilization. Consider looking at IO latency and Disk Load Graphs instead.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 28 }, "id": 17, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_io_time_seconds_total{device=~\"$device\", instance=\"$host\"}[$interval]) or irate(node_disk_io_time_seconds_total{device=~\"$device\", instance=\"$host\"}[5m])", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "{{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk IO Utilization", 
"tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows how effectively Operating System is able to merge logical IO requests into physical requests. This is a good measure of the IO locality which can be used for workload characterization.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 35 }, "id": 18, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": false, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 1, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "(1 + rate(node_disk_reads_merged_total{device=~\"$device\", instance=\"$host\"}[$interval]) / rate(node_disk_reads_completed_total{device=~\"$device\", instance=\"$host\"}[$interval])) or (1 + irate(node_disk_reads_merged_total{device=~\"$device\", instance=\"$host\"}[5m]) / irate(node_disk_reads_completed_total{device=~\"$device\", instance=\"$host\"}[5m]))", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Read Ratio: {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" }, { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": 
{}, "expr": "(1 + rate(node_disk_writes_merged_total{device=~\"$device\", instance=\"$host\"}[$interval]) / rate(node_disk_writes_completed_total{device=~\"$device\", instance=\"$host\"}[$interval])) or (1 + irate(node_disk_writes_merged_total{device=~\"$device\", instance=\"$host\"}[5m]) / irate(node_disk_writes_completed_total{device=~\"$device\", instance=\"$host\"}[5m]))", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Write Ratio: {{ device }}", "metric": "", "refId": "B", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Operations Merge Ratio", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": { "Read IO size: sdb": "#2F575E", "Read: sdb": "#3F6833" }, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows average size of a single disk operation.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 42 }, "id": 20, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": false, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 1, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": 
"rate(node_disk_read_bytes_total{instance=\"$host\", device=~\"$device\"}[$interval]) * 512 / rate(node_disk_reads_completed_total{instance=\"$host\", device=~\"$device\"}[$interval]) or irate(node_disk_read_bytes_total{instance=\"$host\", device=~\"$device\"}[5m]) * 512 / irate(node_disk_reads_completed_total{instance=\"$host\", device=~\"$device\"}[5m]) ", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Read size: {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" }, { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_written_bytes_total{instance=\"$host\", device=~\"$device\"}[$interval]) * 512 / rate(node_disk_writes_completed_total{instance=\"$host\", device=~\"$device\"}[$interval]) or irate(node_disk_written_bytes_total{instance=\"$host\", device=~\"$device\"}[5m]) * 512 / irate(node_disk_writes_completed_total{instance=\"$host\", device=~\"$device\"}[5m]) ", "format": "time_series", "interval": "$interval", "intervalFactor": 1, "legendFormat": "Write size: {{ device }}", "metric": "", "refId": "B", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk IO Size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "refresh": "30s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allFormat": "glob", "auto": true, "auto_count": 200, "auto_min": "1s", "current": { "text": "auto", "value": "$__auto_interval_interval" }, "datasource": "test-cluster", "hide": 0, "includeAll": false, "label":
"Interval", "multi": false, "multiFormat": "glob", "name": "interval", "options": [ { "selected": true, "text": "auto", "value": "$__auto_interval_interval" }, { "selected": false, "text": "1s", "value": "1s" }, { "selected": false, "text": "5s", "value": "5s" }, { "selected": false, "text": "1m", "value": "1m" }, { "selected": false, "text": "5m", "value": "5m" }, { "selected": false, "text": "1h", "value": "1h" }, { "selected": false, "text": "6h", "value": "6h" }, { "selected": false, "text": "1d", "value": "1d" } ], "query": "1s,5s,1m,5m,1h,6h,1d", "refresh": 2, "skipUrlSync": false, "type": "interval" }, { "allFormat": "glob", "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "label_values(node_disk_reads_completed_total, instance)", "hide": 0, "includeAll": false, "label": "Host", "multi": false, "multiFormat": "regex values", "name": "host", "options": [], "query": "label_values(node_disk_reads_completed_total, instance)", "refresh": 1, "refresh_on_load": false, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "instance", "tags": [], "tagsQuery": "up", "type": "query", "useTags": false }, { "allFormat": "glob", "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "label_values(node_disk_reads_completed_total{instance=\"$host\", device!~\"dm-.+\"}, device)", "hide": 0, "includeAll": true, "label": "Device", "multi": true, "multiFormat": "regex values", "name": "device", "options": [], "query": "label_values(node_disk_reads_completed_total{instance=\"$host\", device!~\"dm-.+\"}, device)", "refresh": 1, "refresh_on_load": false, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "instance", "tags": [], "tagsQuery": "up", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "collapse": false, "enable": true, "notice": false, "now": true, "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], 
"status": "Stable", "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ], "type": "timepicker" }, "timezone": "browser", "title": "Test-Cluster-Disk-Performance", "uid": "q2MgvJVWk", "version": 2 } ================================================ FILE: scripts/funcslower ================================================ #!/bin/bash # # funcslower - trace kernel functions slower than a threshold (microseconds). # Uses Linux ftrace. # # This uses the Linux ftrace function graph profiler to time kernel functions # and filter them based on a latency threshold. This is a proof of concept using # Linux ftrace capabilities on older kernels. # # USAGE: funcslower [-aChHPt] [-p PID] [-d secs] funcstring latency_us # # Run "funcslower -h" for full usage. # # REQUIREMENTS: FTRACE function graph, which you may already have available # and enabled in recent kernels. And awk. # # The output format is the same as the ftrace function graph trace format, # described in the kernel source under Documentation/trace/ftrace.txt. # Note that the output may be shuffled when different CPU buffers are read; # check the CPU column for changes, or include timestamps (-t) and post sort. # # WARNING: This uses dynamic tracing of kernel functions, and could cause # kernel panics or freezes. Test, and know what you are doing, before use. # # OVERHEADS: Timing and filtering is performed in-kernel context, costing # lower overheads than post-processing in user space. If you trace frequent # events (eg, pick a common function and a low threshold), you might want to # try the "-d secs" option, which buffers events in-kernel instead of printing # them live. # # From perf-tools: https://github.com/brendangregg/perf-tools # # COPYRIGHT: Copyright (c) 2014 Brendan Gregg. 
# # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # (http://www.gnu.org/copyleft/gpl.html) # # 12-Jul-2014 Brendan Gregg Created this. ### default variables tracing=/sys/kernel/debug/tracing flock=/var/tmp/.ftrace-lock opt_duration=0; duration=; opt_pid=0; pid=; opt_tid=0; tid= pidtext=; opt_headers=0; opt_proc=0; opt_time=0; opt_cpu=0 trap ':' INT QUIT TERM PIPE HUP # sends execution to end tracing section function usage { cat <<-END >&2 USAGE: funcslower [-aChHPt] [-p PID] [-L TID] [-d secs] funcstring latency_us -a # all info (same as -HPt) -C # measure on-CPU time only -d seconds # trace duration, and use buffers -h # this usage message -H # include column headers -p PID # trace when this pid is on-CPU -L TID # trace when this thread is on-CPU -P # show process names & PIDs -t # show timestamps eg, funcslower vfs_read 10000 # trace vfs_read() slower than 10 ms See the man page and example file for more info. END exit } function warn { if ! eval "$@"; then echo >&2 "WARNING: command failed \"$@\"" fi } function end { # disable tracing echo 2>/dev/null echo "Ending tracing..." 
2>/dev/null cd $tracing (( opt_time )) && warn "echo nofuncgraph-abstime > trace_options" (( opt_proc )) && warn "echo nofuncgraph-proc > trace_options" (( opt_cpu )) && warn "echo sleep-time > trace_options" warn "echo nop > current_tracer" (( opt_pid )) && warn "echo > set_ftrace_pid" warn "echo > set_ftrace_filter" warn "echo > set_graph_function" warn "echo 0 > tracing_thresh" warn "echo > trace" (( wroteflock )) && warn "rm $flock" } function die { echo >&2 "$@" exit 1 } function edie { # die with a quiet end() echo >&2 "$@" exec >/dev/null 2>&1 end exit 1 } ### process options while getopts aCd:hHp:L:Pt opt do case $opt in a) opt_headers=1; opt_proc=1; opt_time=1 ;; C) opt_cpu=1; ;; d) opt_duration=1; duration=$OPTARG ;; p) opt_pid=1; pid=$OPTARG ;; L) opt_tid=1; tid=$OPTARG ;; H) opt_headers=1; ;; P) opt_proc=1; ;; t) opt_time=1; ;; h|?) usage ;; esac done shift $(( $OPTIND - 1 )) ### option logic (( $# < 2 )) && usage (( opt_pid && opt_tid )) && edie "ERROR: You can use -p or -L but not both." funcs="$1" shift thresh=$1 (( opt_pid )) && pidtext=" for PID $pid" (( opt_tid )) && pidtext=" for TID $tid" printf "Tracing \"$funcs\"$pidtext slower than $thresh us" if (( opt_duration )); then echo " for $duration seconds..." else echo "... Ctrl-C to end." fi ## select awk if (( opt_duration )); then [[ -x /usr/bin/mawk ]] && awk=mawk || awk=awk else # workarounds for mawk/gawk fflush behavior if [[ -x /usr/bin/gawk ]]; then awk=gawk elif [[ -x /usr/bin/mawk ]]; then awk="mawk -W interactive" else awk=awk fi fi ### check permissions cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)" ### ftrace lock [[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock" echo $$ > $flock || die "ERROR: unable to write $flock." 
wroteflock=1 ### setup and commence tracing sysctl -q kernel.ftrace_enabled=1 # doesn't set exit status read mode < current_tracer [[ "$mode" != "nop" ]] && edie "ERROR: ftrace active (current_tracer=$mode)" if ! echo $thresh > tracing_thresh; then edie "ERROR: setting tracing_thresh to $thresh. Exiting." fi if (( opt_pid )); then echo '' > set_ftrace_pid # ftrace expects kernel pids, which are thread ids for tid in /proc/$pid/task/*; do if ! echo ${tid##*/} >> set_ftrace_pid; then edie "ERROR: setting -p $pid (PID exist?). Exiting." fi done fi if (( opt_tid )); then if ! echo $tid > set_ftrace_pid; then edie "ERROR: setting -L $tid (TID exist?). Exiting." fi fi if ! echo "$funcs" > set_ftrace_filter; then edie "ERROR: enabling \"$funcs\" filter. Function exist? Exiting." fi if ! echo "$funcs" > set_graph_function; then edie "ERROR: enabling \"$funcs\" graph. Exiting." fi if ! echo function_graph > current_tracer; then edie "ERROR: setting current_tracer to \"function_graph\". Exiting." fi if (( opt_cpu )); then if ! echo nosleep-time > trace_options; then edie "ERROR: setting -C (nosleep-time). Exiting." fi fi # the following must be done after setting current_tracer if (( opt_time )); then if ! echo funcgraph-abstime > trace_options; then edie "ERROR: setting -t (funcgraph-abstime). Exiting." fi fi if (( opt_proc )); then if ! echo funcgraph-proc > trace_options; then edie "ERROR: setting -P (funcgraph-proc). Exiting." fi fi ### setup output filter cat=cat if (( opt_proc )); then # remove proc change entries, since PID is included. 
example: # ------------------------------------------ # 0) supervi-1699 => supervi-1693 # ------------------------------------------ # cat=$awk' "/(^ ---|^$)/ || \$3 == \"=>\" { next } { print \$0 }"' fi ### print trace buffer warn "echo > trace" if (( opt_duration )); then sleep $duration if (( opt_headers )); then $cat trace else $cat trace | grep -v '^#' fi else # trace_pipe lack headers, so fetch them from trace (( opt_headers )) && cat trace eval $cat trace_pipe fi ### end tracing end ================================================ FILE: scripts/grafana-config-copy.py ================================================ #!/usr/bin/env python2 from __future__ import print_function, \ unicode_literals import sys import urllib import urllib2 import base64 import json # from pprint import pprint try: input = raw_input except: pass ############################################################ ################## CONFIGURATION ########################### ############################################################ # use a viewer key dests = [ ] if not dests: with open(sys.argv[1]) as fp: dests = json.load(fp) src = dict( dashboards={"node": 'node.json', "pd" : 'pd.json', "tidb": 'tidb.json', "tidb_summary": 'tidb_summary.json', "tikv_summary": 'tikv_summary.json', "tikv_details": 'tikv_details.json', "tikv_trouble_shot": 'tikv_trouble_shooting.json', "tiflash_summary": 'tiflash_summary.json', "tiflash_proxy_summary": 'tiflash_proxy_summary.json', "tiflash_proxy_details": 'tiflash_proxy_details.json', "binlog": "binlog.json", "overview": 'overview.json', "disk_performance": 'disk_performance.json', "blackbox_exporter": 'blackbox_exporter.json', "kafka_overview": 'kafka.json', "lightning": 'lightning.json', "br": "br.json", "performance_read": 'performance_read.json', "performance_write": 'performance_write.json'}) ############################################################ ################## CONFIGURATION ENDS ###################### 
############################################################ def export_dashboard(api_url, api_key, dashboard_name): req = urllib2.Request(api_url + 'api/dashboards/db/' + dashboard_name, headers={'Authorization': "Bearer {}".format(api_key)}) resp = urllib2.urlopen(req) data = json.load(resp) return data['dashboard'] def fill_dashboard_with_dest_config(dashboard, dest, type_='node'): dashboard['title'] = dest['titles'][type_] dashboard['id'] = None # pprint(dashboard) if 'rows' in dashboard: panels = dashboard['rows'] else: panels = dashboard['panels'] for row in panels: if 'panels' in row: for panel in row['panels']: panel['datasource'] = dest['datasource'] else: row['datasource'] = dest['datasource'] if 'templating' in dashboard: for templating in dashboard['templating']['list']: if templating['type'] == 'query': templating['current'] = {} templating['options'] = [] templating['datasource'] = dest['datasource'] if 'annotations' in dashboard: for annotation in dashboard['annotations']['list']: annotation['datasource'] = dest['datasource'] if 'links' in dashboard: for link in dashboard['links']: if 'title' in link and link['title'] == 'Report': link['icon'] = "doc" link['includeVars'] = True link['keepTime'] = True link['targetBlank'] = True link['tooltip'] = "Open a pdf report for the current dashboard" link['type'] = "link" return dashboard def import_dashboard(api_url, api_key, dashboard): payload = {'dashboard': dashboard, 'overwrite': True} headers = {'Authorization': "Bearer {}".format(api_key), 'Content-Type': 'application/json'} req = urllib2.Request(api_url + 'api/dashboards/db', headers=headers, data=json.dumps(payload)) try: resp = urllib2.urlopen(req) data = json.load(resp) return data except urllib2.HTTPError, error: data = json.load(error) return data def import_dashboard_via_user_pass(api_url, user, password, dashboard): payload = {'dashboard': dashboard, 'overwrite': True} auth_string = base64.b64encode('%s:%s' % (user, password)) headers = 
{'Authorization': "Basic {}".format(auth_string), 'Content-Type': 'application/json'} req = urllib2.Request(api_url + 'api/dashboards/db', headers=headers, data=json.dumps(payload)) try: resp = urllib2.urlopen(req) data = json.load(resp) return data except urllib2.URLError, error: return error.reason if __name__ == '__main__': for type_ in src['dashboards']: print("[load] from <{}>:{}".format( src['dashboards'][type_], type_)) dashboard = json.load(open(src['dashboards'][type_])) for dest in dests: dashboard = fill_dashboard_with_dest_config(dashboard, dest, type_) print("[import] <{}> to [{}]".format( dashboard['title'], dest['name']), end='\t............. ') if 'user' in dest: ret = import_dashboard_via_user_pass(dest['url'], dest['user'], dest['password'], dashboard) else: ret = import_dashboard(dest['url'], dest['key'], dashboard) if isinstance(ret,dict): if ret['status'] != 'success': print('ERROR: ', ret) raise RuntimeError else: print(ret['status']) else: print('ERROR: ', ret) raise RuntimeError ================================================ FILE: scripts/grafana_pdf.py ================================================ #!/usr/bin/env python from __future__ import print_function, \ unicode_literals import argparse import os import time import json import tarfile import shutil try: # For Python 2 import urllib2 as urlreq from urllib2 import HTTPError, URLError except ImportError: # For Python 3 import urllib.request as urlreq from urllib.error import HTTPError, URLError dests = [] download_dir = "grafana_pdf" if not dests: with open("./dests.json") as fp: dests = json.load(fp) def make_tarfile(output_filename, source_dir): with tarfile.open(output_filename, "w:gz") as tar: tar.add(source_dir, arcname=os.path.basename(source_dir)) def read_url(url): try: f = urlreq.urlopen(url) return f.read() except HTTPError as e: print("HTTP Error: %s" % e.getcode()) return e.read() except URLError as e: print("Reading URL %s error: %s" % (url, e)) return None def 
parse_opts(): parser = argparse.ArgumentParser( description="Export Grafana charts to PDF") parser.add_argument("-t", "--time", action="store", default=None, help="Relative time to now, supported format is like: 2h, 4h. If not set, assume 3h by default.") parser.add_argument("--time-from", action="store", default=None, help="Start timestamp of time range, format: '%%Y-%%m-%%d %%H:%%M:%%S'.") parser.add_argument("--time-to", action="store", default=None, help="End timestamp of time range, format: '%%Y-%%m-%%d %%H:%%M:%%S'.") return parser.parse_args() def parse_timestamp(time_string): format_guess = [ "%Y-%m-%d %H:%M:%S", "%Y-%m-%d %H:%M", "%Y-%m-%d %H", "%Y-%m-%d", "%m-%d", "%H:%M:%S", "%H:%M", "%H" ] for time_format in format_guess: try: # Grafana API's timestamp is in ms return time.mktime(time.strptime(time_string, time_format)) * 1000 except ValueError: pass raise ValueError( "time data '%s' does not match any supported format." % time_string) if __name__ == '__main__': args = parse_opts() if not os.path.isdir(download_dir): os.makedirs(download_dir) if args.time: time_args = "&from=now-{0}&to=now".format(args.time) elif args.time_from: start_time = int(parse_timestamp(args.time_from)) end_time = "now" if args.time_to: end_time = int(parse_timestamp(args.time_to)) time_args = "&from={0}&to={1}".format(start_time, end_time) else: time_args = "&from=now-3h&to=now" for dest in dests: report_url = dest['report_url'] apikey = dest['apikey'] for dashboard in dest['titles']: url = "{0}api/report/{1}?apitoken={2}{3}".format( report_url, dest['titles'][dashboard].lower(), apikey, time_args) filename = "{0}.pdf".format(dest['titles'][dashboard]) print("Downloading: ", filename) data = read_url(url) with open(os.path.join(download_dir, filename), "wb") as pdf: pdf.write(data) tar_filename = "{0}.tar.gz".format(download_dir) print("Compressing: ", tar_filename) make_tarfile(tar_filename, download_dir) print("Clean up download directory") shutil.rmtree(download_dir) 
================================================ FILE: scripts/inventory_check.py ================================================ # coding: utf-8 import sys from ansible.vars.manager import VariableManager from ansible.parsing.dataloader import DataLoader from ansible.inventory.manager import InventoryManager def parse_inventory(inventory): loader = DataLoader() inv = InventoryManager(loader=loader, sources=[inventory]) vars = VariableManager(loader=loader, inventory=inv) all_groups = inv.get_groups_dict() tidb_nodes = all_groups['tidb_servers'] tikv_nodes = all_groups['tikv_servers'] tidb_servers = {} tikv_servers = {} for tidb in tidb_nodes: var = vars.get_vars(host=inv.get_host(hostname=str(tidb))) ip = var['ansible_host'] if 'ansible_host' in var else var['inventory_hostname'] tidb_port = var.get('tidb_port', 4000) tidb_status_port = var.get('tidb_status_port', 10080) deploy_dir = var['deploy_dir'] if ip in tidb_servers: tidb_servers[ip].append([tidb_port, tidb_status_port, deploy_dir]) else: tidb_servers[ip] = [[tidb_port, tidb_status_port, deploy_dir]] for tikv in tikv_nodes: var = vars.get_vars(host=inv.get_host(hostname=str(tikv))) ip = var['ansible_host'] if 'ansible_host' in var else var['inventory_hostname'] tikv_port = var.get('tikv_port', 20160) tikv_status_port = var.get('tikv_status_port', 20180) deploy_dir = var['deploy_dir'] if ip in tikv_servers: tikv_servers[ip].append([tikv_port, tikv_status_port, deploy_dir]) else: tikv_servers[ip] = [[tikv_port, tikv_status_port, deploy_dir]] return [tidb_servers, tikv_servers] def check_conflict(server_list): conflict_ip = [] for ip, node_vars in server_list.iteritems(): length = len(node_vars) if length > 1: port_list = [var[0] for var in node_vars] sts_port_list = [var[1] for var in node_vars] dir_list = [var[2] for var in node_vars] if len(set(port_list)) < length \ or len(set(sts_port_list)) < length \ or len(set(dir_list)) < length: conflict_ip.append(ip) return conflict_ip if __name__ == '__main__': 
tidb_servers, tikv_servers = parse_inventory(sys.argv[1]) tidb_conf_conflict = check_conflict(tidb_servers) tikv_conf_conflict = check_conflict(tikv_servers) if tidb_conf_conflict: print('\n TiDB port or deployment directory conflicts on {} machine.' .format(','.join(tidb_conf_conflict))) if tikv_conf_conflict and not tidb_conf_conflict: print('\n TiKV port or deployment directory conflicts on {} machine.' .format(','.join(tikv_conf_conflict))) elif tikv_conf_conflict and tidb_conf_conflict: print(' TiKV port or deployment directory conflicts on {} machine.' .format(','.join(tikv_conf_conflict))) if tidb_conf_conflict or tikv_conf_conflict: print(' Please recheck the port, status_port, deploy_dir or other configuration in inventory.ini.') else: print('Check ok.') ================================================ FILE: scripts/iosnoop ================================================ #!/bin/bash # # iosnoop - trace block device I/O. # Written using Linux ftrace. # # This traces disk I/O at the block device interface, using the block: # tracepoints. This can help characterize the I/O requested for the storage # devices and their resulting performance. I/O completions can also be studied # event-by-event for debugging disk and controller I/O scheduling issues. # # USAGE: ./iosnoop [-hQst] [-d device] [-i iotype] [-p pid] [-n name] [duration] # # Run "iosnoop -h" for full usage. # # REQUIREMENTS: FTRACE CONFIG, block:block_rq_* tracepoints (you may # already have these on recent kernels). # # OVERHEAD: By default, iosnoop works without buffering, printing I/O events # as they happen (uses trace_pipe), context switching and consuming CPU to do # so. This has a limit of about 10,000 IOPS (depending on your platform), at # which point iosnoop will be consuming 1 CPU. The duration mode uses buffering, # and can handle much higher IOPS rates, however, the buffer has a limit of # about 50,000 I/O, after which events will be dropped. 
You can tune this with # bufsize_kb, which is per-CPU. Also note that the "-n" option is currently # post-filtered, so all events are traced. # # This was written as a proof of concept for ftrace. It would be better written # using perf_events (after some capabilities are added), which has a better # buffering policy, or a tracer such as SystemTap or ktap. # # From perf-tools: https://github.com/brendangregg/perf-tools # # See the iosnoop(8) man page (in perf-tools) for more info. # # COPYRIGHT: Copyright (c) 2014 Brendan Gregg. # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU General Public License # as published by the Free Software Foundation; either version 2 # of the License, or (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with this program; if not, write to the Free Software Foundation, # Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. # # (http://www.gnu.org/copyleft/gpl.html) # # 12-Jul-2014 Brendan Gregg Created this. 
### default variables
tracing=/sys/kernel/debug/tracing
flock=/var/tmp/.ftrace-lock
bufsize_kb=4096
opt_duration=0; duration=; opt_name=0; name=; opt_pid=0; pid=; ftext=
opt_start=0; opt_end=0; opt_device=0; device=; opt_iotype=0; iotype=
opt_queue=0
# The no-op handler keeps the shell alive on these signals, so the script
# carries on to the final "end" call and cleans up after itself.
trap ':' INT QUIT TERM PIPE HUP  # sends execution to end tracing section

# Print the usage message to stderr and exit.
# The here-document body is kept flush-left on purpose: "<<-" only strips
# leading tabs, so the leading spaces below are part of the printed layout.
function usage {
    cat <<-END >&2
USAGE: iosnoop [-hQst] [-d device] [-i iotype] [-p PID] [-n name]
               [duration]
                 -d device       # device string (eg, "202,1")
                 -i iotype       # match type (eg, '*R*' for all reads)
                 -n name         # process name to match on I/O issue
                 -p PID          # PID to match on I/O issue
                 -Q              # use queue insert as start time
                 -s              # include start time of I/O (s)
                 -t              # include completion time of I/O (s)
                 -h              # this usage message
                 duration        # duration seconds, and use buffers
  eg,
       iosnoop                   # watch block I/O live (unbuffered)
       iosnoop 1                 # trace 1 sec (buffered)
       iosnoop -Q                # include queueing time in LATms
       iosnoop -ts               # include start and end timestamps
       iosnoop -i '*R*'          # trace reads
       iosnoop -p 91             # show I/O issued when PID 91 is on-CPU
       iosnoop -Qp 91            # show I/O queued by PID 91, queue time

See the man page and example file for more info.
END
    exit
}

# Run the given command string through eval; on failure print a warning to
# stderr and keep going (best-effort, never fatal).
function warn {
    if ! eval "$@"; then
        # "$*" joins the arguments into one word; "$@" inside a larger
        # string would split the message into several words.
        echo >&2 "WARNING: command failed \"$*\""
    fi
}

# Undo everything the tracing setup changed: disable both block tracepoints,
# reset their filters if any filter option was used, clear the trace buffer
# and remove the lock file (only when wroteflock is set).
function end {
    # disable tracing
    echo 2>/dev/null
    echo "Ending tracing..." 2>/dev/null
    cd $tracing
    warn "echo 0 > events/block/$b_start/enable"
    warn "echo 0 > events/block/block_rq_complete/enable"
    if (( opt_device || opt_iotype || opt_pid )); then
        warn "echo 0 > events/block/$b_start/filter"
        warn "echo 0 > events/block/block_rq_complete/filter"
    fi
    warn "echo > trace"
    (( wroteflock )) && warn "rm $flock"
}

# Print the message to stderr and exit 1 WITHOUT running end(); for use
# before any tracing state has been modified.
function die {
    echo >&2 "$@"
    exit 1
}

# Print the message to stderr, then run end() with all output silenced, and
# exit 1; for use once tracing state may have been modified.
function edie {
    # die with a quiet end()
    echo >&2 "$@"
    exec >/dev/null 2>&1
    end
    exit 1
}

### process options
while getopts d:hi:n:p:Qst opt
do
    case $opt in
    d)  opt_device=1; device=$OPTARG ;;
    i)  opt_iotype=1; iotype=$OPTARG ;;
    n)  opt_name=1; name=$OPTARG ;;
    p)  opt_pid=1; pid=$OPTARG ;;
    Q)  opt_queue=1 ;;
    s)  opt_start=1 ;;
    t)  opt_end=1 ;;
    h|?) usage ;;
    esac
done
shift $(( $OPTIND - 1 ))
# An optional positional argument selects buffered mode for that many seconds.
if (( $# )); then
    opt_duration=1
    duration=$1
    shift
fi
if (( opt_device )); then
    # "major,minor" is folded into the single number used in the
    # "dev == ..." tracepoint filter: major shifted left by 20 bits, plus minor.
    major=${device%,*}
    minor=${device#*,}
    dev=$(( (major << 20) + minor ))
fi

### option logic
(( opt_pid && opt_name )) && die "ERROR: use either -p or -n."
(( opt_pid )) && ftext=" issued by PID $pid"
(( opt_name )) && ftext=" issued by process name \"$name\""
if (( opt_duration )); then
    echo "Tracing block I/O$ftext for $duration seconds (buffered)..."
else
    echo "Tracing block I/O$ftext. Ctrl-C to end."
fi
# -Q measures from queue insertion instead of from issue to the device.
if (( opt_queue )); then
    b_start=block_rq_insert
else
    b_start=block_rq_issue
fi

### select awk
(( opt_duration )) && use=mawk || use=gawk  # workaround for mawk fflush()
[[ -x /usr/bin/$use ]] && awk=$use || awk=awk
# Set before the lock is taken; the die() calls below exit without running
# end(), so an existing lock owned by someone else is never removed here.
wroteflock=1

### check permissions
cd $tracing || die "ERROR: accessing tracing. Root user? Kernel has FTRACE? debugfs mounted? (mount -t debugfs debugfs /sys/kernel/debug)"

### ftrace lock
[[ -e $flock ]] && die "ERROR: ftrace may be in use by PID $(cat $flock) $flock"
echo $$ > $flock || die "ERROR: unable to write $flock."
### setup and begin tracing echo nop > current_tracer warn "echo $bufsize_kb > buffer_size_kb" filter= if (( opt_iotype )); then filter="rwbs ~ \"$iotype\"" fi if (( opt_device )); then [[ "$filter" != "" ]] && filter="$filter && " filter="${filter}dev == $dev" fi filter_i=$filter if (( opt_pid )); then [[ "$filter_i" != "" ]] && filter_i="$filter_i && " filter_i="${filter_i}common_pid == $pid" [[ "$filter" == "" ]] && filter=0 fi if (( opt_iotype || opt_device || opt_pid )); then if ! echo "$filter_i" > events/block/$b_start/filter || \ ! echo "$filter" > events/block/block_rq_complete/filter then edie "ERROR: setting -d or -t filter. Exiting." fi fi if ! echo 1 > events/block/$b_start/enable || \ ! echo 1 > events/block/block_rq_complete/enable; then edie "ERROR: enabling block I/O tracepoints. Exiting." fi (( opt_start )) && printf "%-15s " "STARTs" (( opt_end )) && printf "%-15s " "ENDs" printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n" \ "COMM" "PID" "TYPE" "DEV" "BLOCK" "BYTES" "LATms" # # Determine output format. It may be one of the following (newest first): # TASK-PID CPU# |||| TIMESTAMP FUNCTION # TASK-PID CPU# TIMESTAMP FUNCTION # To differentiate between them, the number of header fields is counted, # and an offset set, to skip the extra column when needed. 
# offset=$($awk 'BEGIN { o = 0; } $1 == "#" && $2 ~ /TASK/ && NF == 6 { o = 1; } $2 ~ /TASK/ { print o; exit }' trace) ### print trace buffer warn "echo > trace" ( if (( opt_duration )); then # wait then dump buffer sleep $duration cat trace else # print buffer live cat trace_pipe fi ) | $awk -v o=$offset -v opt_name=$opt_name -v name=$name \ -v opt_duration=$opt_duration -v opt_start=$opt_start -v opt_end=$opt_end \ -v b_start=$b_start ' # common fields $1 != "#" { # task name can contain dashes comm = pid = $1 sub(/-[0-9][0-9]*/, "", comm) sub(/.*-/, "", pid) time = $(3+o); sub(":", "", time) dev = $(5+o) } # block I/O request $1 != "#" && $0 ~ b_start { if (opt_name && match(comm, name) == 0) next # # example: (fields1..4+o) 202,1 W 0 () 12862264 + 8 [tar] # The cmd field "()" might contain multiple words (hex), # hence stepping from the right (NF-3). # loc = $(NF-3) starts[dev, loc] = time comms[dev, loc] = comm pids[dev, loc] = pid next } # block I/O completion $1 != "#" && $0 ~ /rq_complete/ { # # example: (fields1..4+o) 202,1 W () 12862256 + 8 [0] # dir = $(6+o) loc = $(NF-3) nsec = $(NF-1) if (starts[dev, loc] > 0) { latency = sprintf("%.2f", 1000 * (time - starts[dev, loc])) comm = comms[dev, loc] pid = pids[dev, loc] if (opt_start) printf "%-15s ", starts[dev, loc] if (opt_end) printf "%-15s ", time printf "%-12.12s %-6s %-4s %-8s %-12s %-6s %8s\n", comm, pid, dir, dev, loc, nsec * 512, latency if (!opt_duration) fflush() delete starts[dev, loc] delete comms[dev, loc] delete pids[dev, loc] } next } $0 ~ /LOST.*EVENTS/ { print "WARNING: " $0 > "/dev/stderr" } ' ### end tracing end ================================================ FILE: scripts/kafka.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": 
"6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": null, "iteration": 1564734780838, "links": [], "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TEST-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 0 }, "id": 7, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "kafka_brokers", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Kafka Brokers", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TEST-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, 
"thresholdMarkers": true }, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 0 }, "id": 8, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "kafka_topic_partition_oldest_offset{topic=\"$topic\"}", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Topic Partition Oldest Offset", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TEST-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 0 }, "id": 9, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum(kafka_topic_partition_current_offset{topic=\"$topic\"})", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Topic Partition Current 
Offset", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "Number of partitions for this Topic", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }, "id": 1, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum (kafka_topic_partitions) by (topic)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Topic Partitions Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "Leader Broker ID of Topic/Partition", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 
5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(kafka_topic_partition_leader) by (topic, partition)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{topic=\"{{ topic }}\", partition=\"{{ partition}}\"}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader Broker ID", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Number of Replicas for this Topic/Partition", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }, "id": 5, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum (kafka_topic_partition_replicas) by (topic, partition)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{topic=\"{{ topic }}\", partition=\"{{ partition}}\"}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Replicas", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, 
"values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Number of In-Sync Replicas for this Topic/Partition", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(kafka_topic_partition_in_sync_replica) by (topic, partition)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{topic=\"{{ topic }}\", partition=\"{{ partition}}\"}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "In Sync Replica", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "refresh": false, "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": false, "label": "", "multi": false, "name": "topic", "options": [], "query": "label_values(kafka_topic_partition_leader, topic)", 
"refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-5m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "Test-Cluster-Kafka-Overview", "uid": "hydvTFDWk", "version": 2 } ================================================ FILE: scripts/lightning.json ================================================ { "__inputs": [ { "name": "DS_LIGHTNING", "label": "lightning", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "4.6.3" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" }, { "type": "panel", "id": "table", "name": "Table", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "refresh": false, "rows": [ { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 1, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], 
"spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tikv_import_write_chunk_bytes_sum[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "write from lightning", "refId": "B" }, { "expr": "sum(rate(tikv_import_upload_chunk_bytes_sum[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "upload to tikv", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Import speed", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 10, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "1/rate(lightning_chunks{state=\"finished\"}[1m]) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Chunk process duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, 
"repeatRowId": null, "showTitle": false, "title": "Import Speed", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#d44a3a", "rgba(237, 129, 40, 0.89)", "#299c46" ], "datasource": "${DS_LIGHTNING}", "decimals": null, "format": "percentunit", "gauge": { "maxValue": 1, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": false }, "id": 4, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "lightning_chunks{state=\"finished\"} / ignoring(state) lightning_chunks{state=\"estimated\"}", "format": "time_series", "instant": false, "intervalFactor": 2, "refId": "A" } ], "thresholds": "0,0", "title": "Import Progress", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#d44a3a", "rgba(237, 129, 40, 0.89)", "#299c46" ], "datasource": "${DS_LIGHTNING}", "format": "percentunit", "gauge": { "maxValue": 1, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": false }, "id": 12, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { 
"from": "null", "text": "N/A", "to": "null" } ], "span": 3, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "lightning_tables{state=\"completed\"} / ignoring(state) lightning_tables{state=\"pending\"}", "format": "time_series", "instant": false, "intervalFactor": 1, "refId": "A" } ], "thresholds": "0,0", "title": "Checksum progress", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "columns": [ { "text": "Max", "value": "max" } ], "datasource": "${DS_LIGHTNING}", "fontSize": "100%", "id": 8, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": 0, "desc": true }, "span": 6, "styles": [ { "alias": "Time", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "type": "date" }, { "alias": "Step", "colorMode": "cell", "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "decimals": 0, "pattern": "Metric", "thresholds": [ "1", "2" ], "type": "number", "unit": "none" }, { "alias": "Tables", "colorMode": "cell", "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "pattern": "Max", "thresholds": [ "0", "0" ], "type": "number", "unit": "none" } ], "targets": [ { "expr": "lightning_tables{result=\"failure\"}", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{state}}", "refId": "A" } ], "title": "Failures", "transform": "timeseries_aggregations", "type": "table" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Import Progress", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 7, "legend": { 
"avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "process_resident_memory_bytes{job=\"tikv-importer\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "importer RSS", "refId": "A" }, { "expr": "go_memstats_heap_inuse_bytes{job=\"lightning\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "lightning heap", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Memory usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 9, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "go_goroutines{job=\"lightning\"}", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Number of Lightning Goroutines", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": 
"time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 3, "legend": { "alignAsTable": false, "avg": true, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 4, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(process_cpu_seconds_total{job=\"lightning\"}[30s])*100", "format": "time_series", "intervalFactor": 2, "legendFormat": "Lightning", "refId": "A" }, { "expr": "rate(process_cpu_seconds_total{job=\"tikv-importer\"}[30s])*100", "format": "time_series", "intervalFactor": 2, "legendFormat": "Importer", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "CPU%", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "percent", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Resource usage", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 5, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": 
false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 5, "stack": false, "steppedLine": true, "targets": [ { "expr": "lightning_idle_workers", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Idle workers", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 6, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 5, "stack": false, "steppedLine": true, "targets": [ { "expr": "lightning_kv_encoder{type=\"open\"} - ignoring(type) lightning_kv_encoder{type=\"closed\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "KV Encoder", "refId": "A" }, { "expr": "lightning_importer_engine{type=\"open\"} - ignoring(type) lightning_importer_engine{type=\"closed\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Importer Engines (via Lightning)", "refId": "B" }, { "expr": "tikv_import_rpc_duration_count{request=\"open_engine\",result=\"ok\"} - ignoring(request) tikv_import_rpc_duration_count{request=\"close_engine\",result=\"ok\"}", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "Importer Engines (via Importer)", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "External resources", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_LIGHTNING}", "fontSize": "100%", "id": 21, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": 0, "desc": true }, "span": 2, "styles": [ { "alias": "TiKV", "pattern": "Metric" }, { "alias": "", "colorMode": "cell", "colors": [ "#E0B400", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "link": false, "mappingType": 2, "pattern": "Current", "rangeMaps": [ { "from": "0", "text": "Import", "to": "0" }, { "from": "1", "text": "Normal", "to": "Infinity" } ], "thresholds": [ "1", "1" ], "type": "string", "unit": "short" } ], "targets": [ { "expr": "min(tikv_config_rocksdb{name=\"hard_pending_compaction_bytes_limit\"}) by (instance)", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "title": "Import/Normal mode", "transform": "timeseries_aggregations", "type": "table" } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Resource usage", "titleSize": "h6" }, { "collapse": false, "height": 223, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 13, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, 
"links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(lightning_chunk_parser_read_block_seconds_sum[30s]) / rate(lightning_chunk_parser_read_block_seconds_count[30s])", "format": "time_series", "intervalFactor": 2, "legendFormat": "read block", "refId": "A" }, { "expr": "rate(lightning_apply_worker_seconds_sum{name = \"io\"}[30s]) /rate(lightning_apply_worker_seconds_count{name = \"io\"}[30s]) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "apply worker", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Chunk parser read block duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 15, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(lightning_row_encode_seconds_sum[30s]) / rate(lightning_row_encode_seconds_count[30s])", "format": "time_series", "intervalFactor": 2, "legendFormat": "row encode", "refId": "A" }, { "expr": "rate(lightning_block_deliver_seconds_sum[30s]) / rate(lightning_block_deliver_seconds_count[30s])", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "block deliver", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "SQL process duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 235, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 16, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(lightning_block_deliver_bytes_sum[30s])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{kind}} deliver rate", "refId": "B" }, { "expr": "sum(rate(lightning_block_deliver_bytes_sum[30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "total deliver rate", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "SQL process rate", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": 
false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 17, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "lightning_row_read_bytes_sum", "format": "time_series", "intervalFactor": 2, "legendFormat": "parser read size", "refId": "A" }, { "expr": "lightning_block_deliver_bytes_sum", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{kind}} deliver size", "refId": "B" }, { "expr": "pd_cluster_status{type=\"storage_size\"} / ignoring(type) pd_config_status{type=\"max_replicas\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "storage_size / replicas", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Total bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 243, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 18, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tikv_import_range_delivery_duration_sum[30s]) / rate(tikv_import_range_delivery_duration_count[30s])", "format": "time_series", "intervalFactor": 2, "legendFormat": "range deliver", "refId": "A" }, { "expr": "rate(tikv_import_sst_delivery_duration_sum[30s]) / rate(tikv_import_sst_delivery_duration_count[30s])", "format": "time_series", "intervalFactor": 2, "legendFormat": "SST file deliver", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Deliver duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_LIGHTNING}", "fill": 1, "id": 19, "legend": { "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "SST size", "yaxis": 2 } ], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tikv_import_split_sst_duration_sum[30s]) / rate(tikv_import_split_sst_duration_count[30s])", "format": "time_series", "intervalFactor": 2, "legendFormat": "Split SST", "refId": "C" }, { "expr": "rate(tikv_import_sst_upload_duration_sum[30s]) / rate(tikv_import_sst_upload_duration_count[30s])", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "SST upload", "refId": "D" }, { "expr": 
"rate(tikv_import_sst_ingest_duration_sum[30s]) / rate(tikv_import_sst_ingest_duration_count[30s])", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "SST ingest", "refId": "E" }, { "expr": "rate(tikv_import_sst_chunk_bytes_sum[30s])", "format": "time_series", "intervalFactor": 2, "legendFormat": "SST size", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "SST process duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-Lightning", "version": 4 } ================================================ FILE: scripts/loader.json ================================================ { "__inputs": [ { "name": "DS_TIDB-CLUSTER", "label": "tidb-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "4.6.3" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" } ], "annotations": { "list": [ { "builtIn": 
1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "refresh": "5s", "rows": [ { "collapse": false, "height": 250, "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 1, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "loader_database_count", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Number of Databases", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 2, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, 
"postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "loader_table_count", "format": "time_series", "hide": false, "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Number of Tables", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 3, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "loader_data_file_count", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Number of Data Files", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, 
"thresholdLabels": false, "thresholdMarkers": true }, "id": 4, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 2, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "loader_data_size_count", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Data Size", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "format": "percentunit", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "id": 5, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "span": 4, "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "loader_progress", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Progess", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" } ], "repeat": 
null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 6, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.98, sum(rate(loader_txn_duration_time_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Txn duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 7, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "loader_tidb_unknown_error", "format": 
"time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "TiDB Unknown Error Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 8, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "loader_progress", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Progress", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-15m", "to": "now" 
}, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "TiDB-Loader", "version": 11 } ================================================ FILE: scripts/metrics-delete.py ================================================ #!/usr/bin/env python2 from __future__ import print_function, \ unicode_literals, division import urllib import urllib2 import json from pprint import pprint from urlparse import urljoin import time import re import calendar import os SECOND = 1 MINUTE = 60 * SECOND HOUR = 60 * MINUTE DAY = 24 * HOUR PROMETHEUS_URL = "http://127.0.0.1:9090/" PUSHGATEWAY_URL = "http://127.0.0.1:9091/" TIMEOUT = 5 * MINUTE if os.getenv('PROMETHEUS_URL'): PROMETHEUS_URL = os.getenv('PROMETHEUS_URL') if os.getenv('PUSHGATEWAY_URL'): PUSHGATEWAY_URL = os.getenv('PUSHGATEWAY_URL') if os.getenv('TIMEOUT'): TIMEOUT = os.getenv('TIMEOUT') current_ts = time.time() def query_metadata(): url = urljoin(PROMETHEUS_URL, '/api/v1/series?match[]=up') print(url) resp = urllib2.urlopen(url) payload = json.load(resp) pprint(payload) def query_all_series(): url = urljoin(PROMETHEUS_URL, r'/api/v1/query?query={instance=~"[\\S].*"}') print(url) resp = urllib2.urlopen(url) payload = json.load(resp) pprint(payload) for item in payload['data']['result']: name = item['metric']['__name__'] value_ts = item['value'][0] if value_ts - current_ts > 2: print(name, time.ctime(value_ts)) def delete_series_by_job_instance(job, instance): url = urljoin(PROMETHEUS_URL, r'/api/v1/series?match[]={job="%s",instance="%s"}' % (job, instance)) req = urllib2.Request(url) req.get_method = lambda : 'DELETE' resp = urllib2.urlopen(req) payload = json.load(resp) pprint(payload) return True def query_out_dated_job_from_pushgateway(timeout): html = urllib2.urlopen(PUSHGATEWAY_URL).read() pattern = re.compile( r'\s+' 'job="(.*?)"\s+' 'instance="(.*?)"\s+' '.*?last 
pushed: (.*?)\s+', re.DOTALL ) ret = [] for job, instance, last_update in pattern.findall(html): last_update = last_update.replace('+', '+') last_update = last_update.replace('-', '-') tz_offset = 0 # parse TZ offset into seconds if not last_update.endswith('+0000 UTC'): offset = int(last_update.split()[-2], 10) tz_offset += offset % 100 * MINUTE tz_offset += offset // 100 * HOUR last_update = time.strptime(last_update.split('.')[0], "%Y-%m-%d %H:%M:%S") # local # update_ts = time.mktime(last_update) # gmt update_ts = calendar.timegm(last_update) - tz_offset diff = abs(current_ts - update_ts) print("%s@%s is %s seconds behind." % (job, instance, int(diff))) if diff > timeout: print(" MARKED as OUTDATED!") ret.append((job, instance)) return ret def delete_job_from_pushgateway(job, instance): url = urljoin(PUSHGATEWAY_URL, '/metrics/job/%s/instance/%s' % (job, instance)) req = urllib2.Request(url) req.get_method = lambda : 'DELETE' resp = urllib2.urlopen(req) resp.read() if __name__ == '__main__': #query_metadata() #query_all_series() print('prometheus url: {}'.format(PROMETHEUS_URL)) print('pushgateway url: {}'.format(PUSHGATEWAY_URL)) for job, instance in query_out_dated_job_from_pushgateway(timeout=TIMEOUT): print("deleting", job, instance) delete_job_from_pushgateway(job, instance) delete_series_by_job_instance(job, instance) ================================================ FILE: scripts/montidb.sh ================================================ #!/bin/bash #export ANS_HOME=/home/pingcap/tidb-ansible PDIP=172.16.10.12:2379 # get command. obj=$1 case "$obj" in "stores") echo "######### Get the information of tikv stores . #########" curl -s http://$PDIP/pd/api/v1/$obj | egrep '(id|address|state_name|capacity|available|leader_count|region_count)' | awk '{if(NR%7!=0)ORS="\t"; else ORS="\n"}1' | sed 's/[ ][ ]*/ /g' | sed 's/"//g' | sort -k 2 -t ‘:’ ;; "members") echo "######### Get the information of pd member leader. 
#########" curl -s http://$PDIP/pd/api/v1/leader | egrep '(name|member_id|http)' | awk '{if(NR%4!=0)ORS="\t"; else ORS="\n"}1' | sed 's/[ ][ ]*/ /g' | sed 's/"//g' | sort -k 2 -t ‘:’ echo echo "######### Get the information of pd member . #########" curl -s http://$PDIP/pd/api/v1/$obj | egrep '(name|member_id|http)' | awk '{if(NR%4!=0)ORS="\t"; else ORS="\n"}1' | sed 's/[ ][ ]*/ /g' | sed 's/"//g' | sort -k 2 -t ‘:’ ;; "regions") echo "######### Get the information of regions . #########" curl -s http://$PDIP/pd/api/v1/$obj | egrep '(id|store_id)' | awk '{if(NR%7!=0)ORS="\t"; else ORS="\n"}1' | sed 's/[ ][ ]*/ /g' | sed 's/"//g' | sort -k 2 -t ‘:’ ;; "config") echo "######### Get the information of config . #########" curl -s http://$PDIP/pd/api/v1/$obj ;; "labels") echo "######### Get the information of labels . #########" curl -s http://$PDIP/pd/api/v1/labels/stores | egrep '(id|address|state_name|value)' | awk '{if(NR%4!=0)ORS="\t"; else ORS="\n"}1' | sed 's/[ ][ ]*/ /g' | sed 's/"//g' | sort -k 2 -t ‘:’ ;; *) #其它输入 echo "Usage: $0 stores|members|regions|config|labels" ;; esac ================================================ FILE: scripts/node.json ================================================ { "__inputs": [ { "name": "DS_TIDB-CLUSTER", "label": "tidb-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "panel", "id": "gauge", "name": "Gauge", "version": "" }, { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" }, { "type": "panel", "id": "table", "name": "Table", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TIDB-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 
1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "description": "Prometheus for system metrics. \r\nLoad, CPU, RAM, network, process ... ", "editable": true, "gnetId": 159, "graphTooltip": 1, "id": null, "iteration": 1566457258708, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 45, "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TIDB-CLUSTER}", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 0, "y": 2 }, "height": "55px", "id": 25, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "80%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "count(node_cpu_seconds_total{instance=\"$host\", mode=\"user\"})", "format": "time_series", "interval": "5m", "intervalFactor": 1, "refId": "A", "step": 300 } ], "thresholds": "", "title": "Virtual CPUs", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "decimals": 1, "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, 
"gridPos": { "h": 4, "w": 4, "x": 4, "y": 2 }, "id": 116, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "node_memory_MemTotal_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Total RAM", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "format": "bytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 2 }, "id": 118, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "node_memory_SwapTotal_bytes{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Total 
Swap", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TIDB-CLUSTER}", "decimals": 1, "editable": true, "error": false, "format": "s", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 6, "x": 12, "y": 2 }, "height": "50px", "id": 19, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "s", "postfixFontSize": "80%", "prefix": "", "prefixFontSize": "80%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "calculatedInterval": "10m", "datasourceErrors": {}, "errors": {}, "expr": "node_time_seconds{instance=\"$host\"} - node_boot_time_seconds{instance=\"$host\"}", "format": "time_series", "interval": "5m", "intervalFactor": 1, "legendFormat": "", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_time%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20node_boot_time%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%2243200s%22%2C%22end_input%22%3A%222015-9-18%2013%3A25%22%2C%22step_input%22%3A%22%22%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 300 } ], "thresholds": "300,3600", "title": "System Uptime", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], 
"datasource": "${DS_TIDB-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 6, "x": 18, "y": 2 }, "id": 52, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "version", "targets": [ { "expr": "node_exporter_build_info", "format": "table", "interval": "", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Node_exporter version", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "datasource": "${DS_TIDB-CLUSTER}", "gridPos": { "h": 4, "w": 4, "x": 0, "y": 6 }, "id": 74, "links": [], "options": { "maxValue": 100, "minValue": 0, "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true, "thresholds": [ { "color": "green", "index": 0, "value": null }, { "color": "red", "index": 1, "value": 80 } ], "valueMappings": [], "valueOptions": { "decimals": 2, "prefix": "", "stat": "last", "suffix": "", "unit": "percentunit" } }, "pluginVersion": "6.1.6", "targets": [ { "expr": "1 - (sum(rate(node_cpu_seconds_total{instance=\"$host\", mode=\"idle\"}[$interval])) / count(node_cpu_seconds_total{instance=\"$host\", mode=\"idle\"}) or sum(irate(node_cpu_seconds_total{instance=\"$host\", mode=\"idle\"}[30s])) / count(node_cpu_seconds_total{instance=\"$host\", mode=\"idle\"}))", "format": "time_series", "intervalFactor": 1, 
"legendFormat": "percent", "refId": "A" } ], "timeFrom": null, "timeShift": null, "title": "CPU Used", "type": "gauge" }, { "cacheTimeout": null, "datasource": "${DS_TIDB-CLUSTER}", "gridPos": { "h": 4, "w": 4, "x": 4, "y": 6 }, "id": 72, "links": [], "options": { "maxValue": 100, "minValue": 0, "orientation": "auto", "showThresholdLabels": false, "showThresholdMarkers": true, "thresholds": [ { "color": "green", "index": 0, "value": null }, { "color": "red", "index": 1, "value": 80 } ], "valueMappings": [], "valueOptions": { "decimals": 2, "prefix": "", "stat": "last", "suffix": "", "unit": "percentunit" } }, "pluginVersion": "6.1.6", "targets": [ { "expr": "1 - (node_memory_MemAvailable_bytes{instance=\"$host\"} or (node_memory_MemFree_bytes{instance=\"$host\"} + node_memory_Buffers_bytes{instance=\"$host\"} + node_memory_Cached_bytes{instance=\"$host\"})) / node_memory_MemTotal_bytes{instance=\"$host\"}", "format": "time_series", "instant": false, "interval": "", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "timeFrom": null, "timeShift": null, "title": "Memory Used", "type": "gauge" }, { "cacheTimeout": null, "colorBackground": false, "colorPostfix": false, "colorPrefix": false, "colorValue": true, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "description": "When the result is `False`, the swap is off.", "format": "percent", "gauge": { "maxValue": 100, "minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 8, "y": 6 }, "id": 114, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "pluginVersion": "6.1.6", "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { 
"fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "((node_memory_SwapTotal_bytes{instance=~\"$node:$port\",job=~\"$job\"} - node_memory_SwapFree_bytes{instance=~\"$node:$port\",job=~\"$job\"}) / (node_memory_SwapTotal_bytes{instance=~\"$node:$port\",job=~\"$job\"} )) * 100", "format": "time_series", "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "0.0001,1", "timeFrom": null, "timeShift": null, "title": "Swap Used", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "False", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "format": "short", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 12, "y": 6 }, "id": 120, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "node_load1{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Load1", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", 
"rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "format": "short", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 16, "y": 6 }, "id": 122, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "node_load5{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Load5", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "format": "short", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 4, "x": 20, "y": 6 }, "id": 124, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": 
"", "targets": [ { "expr": "node_load15{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Load15", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" } ], "repeat": null, "title": "Overview", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 54, "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 7, "x": 0, "y": 8 }, "id": 58, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "sysname", "targets": [ { "expr": "node_uname_info{instance=~\"$host\"}", "format": "table", "instant": true, "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "OS", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, 
"thresholdMarkers": true }, "gridPos": { "h": 4, "w": 7, "x": 7, "y": 8 }, "id": 56, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "machine", "targets": [ { "expr": "node_uname_info{instance=~\"$host\"}", "format": "table", "instant": true, "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Machine", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#299c46", "rgba(237, 129, 40, 0.89)", "#d44a3a" ], "datasource": "${DS_TIDB-CLUSTER}", "decimals": null, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 4, "w": 10, "x": 14, "y": 8 }, "id": 60, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "release", "targets": [ { "expr": "node_uname_info{instance=~\"$host\"}", "format": "table", "instant": true, "interval": "", "intervalFactor": 1, 
"legendFormat": "", "refId": "A" } ], "thresholds": "", "timeFrom": null, "timeShift": null, "title": "Kernel version", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "avg" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 12 }, "id": 28, "instanceColors": {}, "legend": { "alignAsTable": false, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": true, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "color": "#D683CE", "instance": "Interrupts" } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_intr_total{instance=\"$host\"}[$interval]) or irate(node_intr_total{instance=\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Interrupts", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_procs_running%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%2243200s%22%2C%22end_input%22%3A%222015-9-18%2013%3A46%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Interrupts", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "none", 
"logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 12 }, "id": 24, "instanceColors": {}, "legend": { "alignAsTable": false, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": true, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "color": "#EF843C", "instance": "Forks" } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_forks_total{instance=\"$host\"}[$interval]) or irate(node_forks_total{instance=\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Forks", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_procs_running%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%2243200s%22%2C%22end_input%22%3A%222015-9-18%2013%3A46%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Forks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "none", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 
10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 17 }, "id": 27, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_context_switches_total{instance=\"$host\"}[$interval]) or irate(node_context_switches_total{instance=\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Context Switches", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_procs_running%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%2243200s%22%2C%22end_input%22%3A%222015-9-18%2013%3A46%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Context Switches", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "none", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Kernel", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 68, "panels": [ { "aliasColors": {}, "bars": false, 
"dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 6, "grid": {}, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 9 }, "height": "260px", "id": 2, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "sum(rate(node_cpu_seconds_total{instance=\"$host\"}[$interval])) by (mode) * 100 / count(node_cpu_seconds_total{instance=\"$host\"}) by (mode) or sum(irate(node_cpu_seconds_total{instance=\"$host\"}[5m])) by (mode) * 100 / count(node_cpu_seconds_total{instance=\"$host\"}) by (mode)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{mode}}", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22sum(rate(node_cpu%7Binstance%3D%5C%22%24host%5C%22%7D%5B%24interval%5D))%20by%20(mode)%20*%20100%22%2C%22range_input%22%3A%223600s%22%2C%22end_input%22%3A%222015-10-22%2015%3A27%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU Usage", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percent", "label": "", "logBase": 1, "max": 100, "min": 0, "show": true }, { "format": "short", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, 
"alignLevel": null } } ], "repeat": null, "title": "CPU", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 70, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 6, "grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 5 }, "height": "", "id": 6, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "sort": "min", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "color": "#0A437C", "instance": "Used" }, { "color": "#5195CE", "instance": "Available" }, { "color": "#052B51", "instance": "Total", "legend": false, "stack": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_MemTotal_bytes{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Total", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "C", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_MemTotal_bytes{instance=\"$host\"} - (node_memory_MemAvailable_bytes{instance=\"$host\"} or (node_memory_MemFree_bytes{instance=\"$host\"} + 
node_memory_Buffers_bytes{instance=\"$host\"} + node_memory_Cached_bytes{instance=\"$host\"}))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_MemAvailable_bytes{instance=\"$host\"} or (node_memory_MemFree_bytes{instance=\"$host\"} + node_memory_Buffers_bytes{instance=\"$host\"} + node_memory_Cached_bytes{instance=\"$host\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "Available", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "bytes", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { 
"align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 6, "grid": {}, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 9 }, "height": "", "id": 29, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_MemTotal_bytes{instance=\"$host\"} - (node_memory_MemFree_bytes{instance=\"$host\"} + node_memory_Buffers_bytes{instance=\"$host\"} + node_memory_Cached_bytes{instance=\"$host\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_MemFree_bytes{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Free", "metric": "", "prometheusLink": 
"/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_Buffers_bytes{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Buffers", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "D", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_Cached_bytes{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Cached", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "E", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Distribution", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": 
null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "bytes", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "Inactive: Memory that has been used less recently and is reclaimed first\n\nActive: Memory that has been used recently and is usually not reclaimed unless absolutely necessary", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 14 }, "id": 142, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_Inactive_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Inactive", "refId": "A" }, { "expr": "node_memory_Active_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Active", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Active / Inactive", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "Writeback: Preparing to
actively write back to the cache page of the hard disk\n\nWritebackTmp: Memory used to temporarily write back to the buffer\n\nDirty: The data size that needs to be written back to disk", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 18 }, "id": 146, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_Writeback_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Writeback", "refId": "A" }, { "expr": "node_memory_WritebackTmp_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "WritebackTmp", "refId": "B" }, { "expr": "node_memory_Dirty_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Dirty", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Writeback and Dirty", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "Mapped: The memory occupied by a mapped page\n\nShared: The Shared memory", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 22 }, "id": 148, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": 
false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_Mapped_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Mapped", "refId": "A" }, { "expr": "node_memory_Shmem_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Shared", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Shared and Mapped", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 1, "description": "Kernel stack size (resident memory, non-recyclable)", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 26 }, "id": 152, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_KernelStack_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "KernelStack", "refId": "A" } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "KernelStack", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 1, "description": "AnonHugePages: The memory footprint of AnonHugePages\n\nAnonPages: Anonymous memory page size in the user process", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 30 }, "id": 150, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_AnonHugePages_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "AnonHugePages", "refId": "A" }, { "expr": "node_memory_AnonPages_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "AnonPages", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Anonymous", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": 
{ "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 1, "description": "", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 34 }, "id": 156, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_Hugepagesize_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Hugepagesize", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "HugePages Size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "HugePages_Free: The total number of idle HugePages that the system currently has\n\nHugePages_Rsvd: The total number of HugePages currently retained by the system\n\nHugePages_Surp: Exceeds the number of resident HugePages set by the system\n\nHugePages: The total number of HugePages currently owned by the system", "fill": 1, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 38 }, "id": 154, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": 
true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_HugePages_Free{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "HugePages_Free", "refId": "A" }, { "expr": "node_memory_HugePages_Rsvd{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "HugePages_Rsvd", "refId": "B" }, { "expr": "node_memory_HugePages_Surp{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "HugePages_Surp", "refId": "C" }, { "expr": "node_memory_HugePages_Total{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "HugePages", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "HugePages Counter", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 1, "description": "Committed_AS: The amount of memory that has been allocated by the current system, including the size of the memory that has been allocated but is not yet used\n\nCommitLimit: The amount of memory that can be allocated to the current system", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 43 }, "id": 144, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, 
"values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_Committed_AS_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Committed_AS", "refId": "A" }, { "expr": "node_memory_CommitLimit_bytes{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "CommitLimit", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 6, "grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 47 }, "id": 23, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "color": "#584477", "instance": "Used" }, { "color": "#AEA2E0", "instance": "Free" } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_SwapTotal_bytes{instance=\"$host\"} - 
node_memory_SwapFree_bytes{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Used", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "node_memory_SwapFree_bytes{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Free", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Swap", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "bytes", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 2, 
"grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 51 }, "id": 30, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_vmstat_pswpin{instance=\"$host\"}[$interval]) * 4096 or irate(node_vmstat_pswpin{instance=\"$host\"}[5m]) * 4096", "format": "time_series", "intervalFactor": 1, "legendFormat": "Swap In", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_vmstat_pswpout{instance=\"$host\"}[$interval]) * 4096 or irate(node_vmstat_pswpout{instance=\"$host\"}[5m]) * 4096", "format": "time_series", "intervalFactor": 1, "legendFormat": "Swap Out", "metric": "", "prometheusLink": 
"/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Swap Activity", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "bytes", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 55 }, "id": 42, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_vmstat_pgpgin{instance=\"$host\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "page in", "refId": "A" }, { "expr": "rate(node_vmstat_pgpgout{instance=\"$host\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "page out", "refId": "B" }, { "expr": "rate(node_vmstat_pswpout{instance=\"$host\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "swap out", "refId": "C" 
}, { "expr": "rate(node_vmstat_pswpin{instance=\"$host\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "swap in", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Page/Swap in/out", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": null, "fill": 1, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 59 }, "id": 44, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_vmstat_pgmajfault{instance=\"$host\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "major fault", "refId": "A" }, { "expr": "rate(node_vmstat_pgfault{instance=\"$host\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "minor fault", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Mem Fault", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Memory", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 134, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "The rate at which data is read from the hard disk to physical memory (within 5 minutes)\n\nRate at which data is written from physical memory to hard disk (within 5 minutes)", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 2 }, "id": 138, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_pgpgin{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgpgin", "refId": "A" }, { "expr": "irate(node_vmstat_pgpgout{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgpgout", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Pages In / Out", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "Rate 
at which data is loaded into memory from disk swap (within 5 minutes)\n\nRate at which data is dumped from memory to disk swap (within 5 minutes)", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 6 }, "id": 140, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_pswpin{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pswpin", "refId": "A" }, { "expr": "irate(node_vmstat_pswpout{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Pswpout", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Pages Swap In / Out", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "pgdeactivate: Average number of pages deactivated (within 5 minutes)\n\npgfree: Average number of pages released (within 5 minutes)\n\npgactivate: Average number of pages activated (within 5 minutes)", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 10 }, "id": 158, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true
}, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_pgdeactivate{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgdeactivate", "refId": "A" }, { "expr": "irate(node_vmstat_pgfree{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgfree", "refId": "B" }, { "expr": "irate(node_vmstat_pgactivate{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgactivate", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Page Operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "description": "Average number of pages requested to be recycled directly (within 5 minutes)", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 14 }, "id": 136, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_allocstall{instance=\"$host\"}[5m])", 
"format": "time_series", "intervalFactor": 1, "legendFormat": "allocstall", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Allocstall", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 2, "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "drop_pagecache: Average number of pages called to release the cache (within 5 minutes)\n\ndrop_slab: Average number of pages to call to release the slab cache (in 5 minutes)", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 18 }, "id": 160, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_vmstat_drop_pagecache{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "drop_pagecache", "refId": "A" }, { "expr": "node_vmstat_drop_slab{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "drop_slab", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Page Drop", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": 
"short", "label": "Cells", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "pgalloc_dma: Average number of pages allocated by DMA storage area (within 5 minutes)\n\npgalloc_dma32: Average number of pages allocated in DMA32 storage (within 5 minutes)\n\npgalloc_movable: Movable storage allocated average number of pages (within 5 minutes)\n\npgalloc_normal: Average number of pages allocated by normal storage (within 5 minutes)", "fill": 1, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 22 }, "id": 162, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_pgalloc_dma{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgalloc_dma", "refId": "A" }, { "expr": "irate(node_vmstat_pgalloc_dma32{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgalloc_dma32", "refId": "B" }, { "expr": "irate(node_vmstat_pgalloc_movable{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgalloc_movable", "refId": "C" }, { "expr": "irate(node_vmstat_pgalloc_normal{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgalloc_normal", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Page Allocation", "tooltip": { "shared": true, "sort": 0, "value_type": 
"individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Vmstat - Page", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 176, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "foreign: Plan to use other node memory but use local memory\n\nhit: Use this node memory times\n\ninterleave: The number of times the memory of this node is used in the memory used for cross-allocation\n\nlocal: The program running on this node uses the memory of this node\n\nmiss: The number of times that you plan to use this node memory to be dispatched to other nodes\n\nother: Programs running on other nodes use the memory of this node", "fill": 1, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 1 }, "id": 164, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_numa_foreign{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "foreign", "refId": "A" }, { "expr": "irate(node_vmstat_numa_hit{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "hit", "refId": "B" }, { "expr": "irate(node_vmstat_numa_interleave{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, 
"legendFormat": "interleave", "refId": "C" }, { "expr": "irate(node_vmstat_numa_local{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "local", "refId": "D" }, { "expr": "irate(node_vmstat_numa_miss{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "miss", "refId": "E" }, { "expr": "irate(node_vmstat_numa_other{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "other", "refId": "F" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Numa Allocations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Allocations", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "pages_migrated: NUMA page number\n\nPgmigrate_fail: Number of pages failed to migrate\n\nPgmigrate_success: Number of pages successfully migrated", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 7 }, "id": 166, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_numa_pages_migrated{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pages_migrated", "refId": "A" }, { "expr": 
"irate(node_vmstat_pgmigrate_fail{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pgmigrate_fail", "refId": "B" }, { "expr": "irate(node_vmstat_pgmigrate_success{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Pgmigrate_success", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Numa Page Migrations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "hint_faults: NUMA hint faults trapped\n\nhint_faults_local: Hinting faults to local nodes", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 11 }, "id": 168, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_numa_hint_faults{instance=~\"$host\"}[5m])", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "hint_faults", "refId": "A" }, { "expr": "irate(node_vmstat_numa_hint_faults_local{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "hint_faults_local", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, "title": "Numa Hints", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Hints", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "pte_updates: NUMA page table entry updates\n\nhuge_pte_updates: NUMA huge page table entry updates", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 15 }, "id": 170, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_numa_pte_updates{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "pte_updates", "refId": "A" }, { "expr": "irate(node_vmstat_numa_huge_pte_updates{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "huge_pte_updates", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Numa Table Updates", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Updates", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": 
false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "description": "", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 19 }, "id": 172, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_numa_MemUsed{instance=\"$host\"} / node_memory_numa_MemTotal{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Numa node: {{ node }}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Numa Mem Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 2, "format": "percentunit", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "fill": 1, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 23 }, "id": 174, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"node_memory_numa_MemFree{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Numa node: {{ node }}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Numa Mem Free", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Vmstat - Numa", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 178, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 1 }, "id": 180, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_thp_split{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "split", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "THP Splits", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Splits", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, 
"logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "collapse_alloc: Transparent huge page collapse allocations\n\ncollapse_alloc_failed: Transparent huge page collapse allocation failures\n\nzero_page_alloc: Transparent huge page zeroed page allocations\n\nzero_page_alloc_failed: Transparent huge page zeroed page allocation failures\n\nfault_alloc: Transparent huge page fault allocations\n\nfault_fallback: Transparent huge page fault fallbacks", "fill": 1, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 5 }, "id": 182, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_thp_collapse_alloc{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "collapse_alloc", "refId": "A" }, { "expr": "irate(node_vmstat_thp_collapse_alloc_failed{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "collapse_alloc_failed", "refId": "B" }, { "expr": "irate(node_vmstat_thp_zero_page_alloc{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "zero_page_alloc", "refId": "C" }, { "expr": "irate(node_vmstat_thp_zero_page_alloc_failed{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "zero_page_alloc_failed", "refId": "D" }, { "expr": "irate(node_vmstat_thp_fault_alloc{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "fault_alloc", "refId": "E" }, { "expr": 
"irate(node_vmstat_thp_fault_fallback{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "fault_fallback", "refId": "F" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "THP Allocations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Allocations", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Vmstat - THP", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 184, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 1 }, "id": 186, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_compact_fail{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "fail", "refId": "A" }, { "expr": "irate(node_vmstat_compact_success{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "success", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compact Status", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", 
"name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Compactions", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 5 }, "id": 188, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_compact_stall{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Compact_stall", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compact Stall", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Compactions", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 9 }, "id": 190, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, 
"pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_compact_isolated{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "isolated", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compact Isolated", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 13 }, "id": 192, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_compact_free_scanned{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "free_scanned", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compact Free Scanned", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, 
"show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 17 }, "id": 194, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_vmstat_compact_migrate_scanned{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "migrate_scanned", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compact Migrate Scanned", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "Pages", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Vmstat - Compact", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, "id": 62, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 5 }, "id": 66, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "max": true, "min": true, "rightSide": true, 
"show": true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "color": "#E24D42", "instance": "Load 1m" }, { "color": "#E0752D", "instance": "Load 5m" }, { "color": "#E5AC0E", "instance": "Load 15m" } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "10s", "datasourceErrors": {}, "errors": {}, "expr": "node_load1{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Load 1m", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_load1%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%223601s%22%2C%22end_input%22%3A%222015-10-22%2015%3A27%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Afalse%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 2, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load: 1m", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "none", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 9 }, "id": 64, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], 
"nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "color": "#E24D42", "instance": "Load 1m" }, { "color": "#E0752D", "instance": "Load 5m" }, { "color": "#E5AC0E", "instance": "Load 15m" } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "10s", "datasourceErrors": {}, "errors": {}, "expr": "node_load5{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Load 5m", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_load5%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%223600s%22%2C%22end_input%22%3A%222015-10-22%2015%3A27%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Afalse%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 2, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load: 5m", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "none", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 13 }, "id": 18, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [ { "color": "#E24D42", "instance": "Load 1m" }, { "color": "#E0752D", "instance": "Load 5m" }, { "color": "#E5AC0E", "instance": "Load 15m" } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "10s", "datasourceErrors": {}, "errors": {}, "expr": "node_load15{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Load 15m", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_load15%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%223600s%22%2C%22end_input%22%3A%222015-10-22%2015%3A27%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Afalse%2C%22tab%22%3A0%7D%5D", "refId": "C", "step": 2, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load: 15m", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "none", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Load", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, "id": 76, "panels": [ { "columns": [], "datasource": "${DS_TIDB-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 4, "w": 16, "x": 0, "y": 7 }, "id": 128, "interval": "", "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": 0, "desc": true }, "styles": [ { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "device", "preserveFormat": false, "sanitize": false, "thresholds": [], "type": "string", "unit": "short" }, 
{ "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "link": false, "mappingType": 1, "pattern": "Time", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "__name__", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "instance", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "job", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": "value", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "Value", "thresholds": [ "0", "1" ], "type": "number", "unit": "bytes" } ], "targets": [ { "expr": "node_filesystem_size_bytes{instance=\"$host\",device=~'^/.*'}", "format": "table", "instant": true, "interval": "", "intervalFactor": 1, "legendFormat": "Disk Size: {{ device }} {{ fstype }} {{ mountpoint }}", "refId": "A" } ], "timeFrom": null, "timeShift": null, "title": "Total Disk Size", "transform": "table", "type": "table" }, { "columns": [], "datasource": "${DS_TIDB-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 4, "w": 8, "x": 16, "y": 7 }, "id": 132, "links": [], "pageSize": 4, "scroll": true, "showHeader": true, "sort": { "col": 0, "desc": true }, 
"styles": [ { "alias": "Time", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "type": "hidden" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "__name__", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "device", "thresholds": [], "type": "string", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "fstype", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "instance", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "job", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "mountpoint", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "", "colorMode": "value", "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "Value", "thresholds": [ 
"0.5", "0.9" ], "type": "string", "unit": "short", "valueMaps": [ { "text": "Normal", "value": "0" }, { "text": "Read Only", "value": "1" } ] } ], "targets": [ { "expr": "node_filesystem_readonly{instance=\"$host\", device=~\"/.*\"}", "format": "table", "instant": true, "intervalFactor": 1, "legendFormat": "", "refId": "A" } ], "timeFrom": null, "timeShift": null, "title": "Disk State", "transform": "table", "type": "table" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 11 }, "id": 130, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "1 - node_filesystem_avail_bytes{instance=\"$host\", device=~'^/.*'} / node_filesystem_size_bytes{instance=\"$host\", device=~'^/.*'}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ device }} - {{ fstype }} - {{ mountpoint }}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Space Utilization", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 2, "format": "percentunit", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "decimals": 2, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, 
"editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 15 }, "id": 35, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 200, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_disk_io_time_seconds_total{instance=\"$host\"}[$interval]) or irate(node_disk_io_time_seconds_total{instance=\"$host\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "{{ device }}", "metric": "node_disk_io_time_ms", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "I/O Util", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "bytes", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": 
false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 19 }, "id": 78, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(node_disk_reads_completed_total{instance=~\"$host\"}[5m]) + rate(node_disk_writes_completed_total{instance=~\"$host\"}[5m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "IOPs", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IOPs", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "DISK seconds Read/ Write Latency.\n- Critical:\n - Recommended performance value is < 10ms as avg value of the Avg Disk sec/Read,Write.\n - Critical value of the Avg Disk sec/Read,Write is > 50ms, should not exceed this value.", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 23 }, "id": 80, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, 
"points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(rate(node_disk_write_time_seconds_total{instance=~\"$host\"}[5m])/ rate(node_disk_writes_completed_total{instance=~\"$host\"}[5m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Disk Write Latency: [{{ device }}]", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Write Latency (ms)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "description": "Disk seconds Read Latency.\n- Critical:\n - Recommended performance value is < 10ms as avg value of the Avg Disk sec/Read,Write.\n - Critical value of the Avg Disk sec/Read,Write is > 50ms, should not exceed this value.", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 27 }, "id": 82, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(rate(node_disk_read_time_seconds_total{instance=~\"$host\"}[5m])/ rate(node_disk_reads_completed_total{instance=~\"$host\"}[5m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Disk Read Latency: [{{ device }}]", 
"refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Read Latency (ms)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "description": "", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 31 }, "id": 86, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_disk_read_bytes_total{instance=~\"$host\"}[5m]) + irate(node_disk_written_bytes_total{instance=~\"$host\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Disk Troughput: [{{ device }}]", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Disk", "type": "row" }, { "collapsed": 
true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, "id": 88, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "description": "Whether an error occurred while getting statistics for the given device.\n> A value > 0 Means that there are some problems with that device.", "fill": 1, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 18 }, "id": 90, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_filesystem_device_error{instance=~\"$host\"} ", "format": "time_series", "intervalFactor": 1, "legendFormat": "Filesystem Device Error: {{fstype}} {{device}} {{mountpoint}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Filesystem Device Error", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "description": "Filesystem used space.\n> If is > 80% then is Critical.", "fill": 1, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 23 }, "id": 92, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": 
false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "((node_filesystem_size_bytes{instance=~\"$host\"} - node_filesystem_avail_bytes{instance=~\"$host\"}) / node_filesystem_size_bytes{instance=~\"$host\"}) * 100", "format": "time_series", "intervalFactor": 1, "legendFormat": "Filesystem Space Used: {{fstype}} {{device}} {{mountpoint}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Filesystem Space Used", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "percent", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "description": "Filesystem used file nodes.\n> If it is > 85% then it is Critical.", "fill": 1, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 28 }, "id": 94, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_filesystem_files_free{instance=~\"$host\"} / node_filesystem_files{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Filesystem
Inodes: {{fstype}} {{device}} {{mountpoint}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Filesystem Inodes Used", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "percent", "label": null, "logBase": 1, "max": "100", "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Filesystem", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, "id": 48, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": null, "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 8 }, "id": 33, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_filefd_allocated{instance=\"$host\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Allocated File Descriptor", "metric": "node_filefd_allocated", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Allocated File Descriptor", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": 
null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 8 }, "id": 34, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_filefd_maximum{instance=\"$host\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "Maximum File Descriptor", "metric": "node_filefd_maximum", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Maximum File Descriptor", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "description": "A file descriptor is a data structure used by a program to get a handle on a file, the most well known being 0,1,2 for standard in, standard out, and standard error.\n\nThe maximum number of file handles denotes the maximum number of open files on a Linux system.\n\nThe kernel dynamically allocates file handles whenever a file handle is requested by an application but the kernel does not free these file handles when they are released by the application.
The kernel recycles these file handles instead. This means that over time the total number of allocated file handles will increase even though the number of currently used file handles may be low.\n\n>$ cat /proc/sys/fs/file-nr\n\n 1376 0 785623\n\n- 1376: total allocated file descriptors (the number of file descriptors allocated since boot)\n- 0: total free allocated file descriptors\n- 785623: maximum open file descriptors [the maximum file handles that can be allocated (also found in /proc/sys/fs/file-max)]\n\n1376 - 0 = 1376 (being used)", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 13 }, "id": 96, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(node_filefd_allocated{instance=~\"$host\"} / node_filefd_maximum{instance=~\"$host\"}) * 100", "format": "time_series", "intervalFactor": 1, "legendFormat": "File Descriptors Used", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "File Descriptors Used", "tooltip": { "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 2, "format": "percent", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "decimals": 2, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "description": "Open Files Limits.\n- Critical:\n - If > 85% then there is 
High Usage.\n\nprocess_open_fds\t\n>Number of open file descriptors.\t\n\nprocess_max_fds\t\n> Maximum number of open file descriptors.", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 17 }, "id": 98, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(process_open_fds{instance=~\"$host\"} / process_max_fds{instance=~\"$host\"}) * 100", "format": "time_series", "intervalFactor": 1, "legendFormat": "Open Files Used", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Process Open Files Used", "tooltip": { "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 2, "format": "percent", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "decimals": 2, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Descriptors", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }, "id": 100, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "description": "Thresholds:\n- Critical if state = DOWN (0)", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 10 }, "id": 106, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": 
"null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_network_up{instance=~\"$host\",interface!=\"lo\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Interface: [{{interface}}] {{operstate}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network Interface State", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "description": "Network Drops.\n- Critical:\n - If is > 0 then are Drops on that interface (IN/OUT) and is not ok.", "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 14 }, "id": 102, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_network_receive_drop_total{instance=~\"$host\",device!=\"lo\"}[5m]) ", "format": "time_series", "intervalFactor": 1, "legendFormat": "IN: {{device}}", "refId": "A" }, { "expr": "rate(node_network_transmit_drop_total{instance=~\"$host\",device!=\"lo\"}[5m]) ", "format": "time_series", "intervalFactor": 1, "legendFormat": "OUT: {{device}}", 
"refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network IN/OUT Drops", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "description": "Network Errors on IN/OUT.\n- Critical:\n - If Errors > 0 then there are some problems on that interface.", "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 14 }, "id": 104, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_network_receive_errs_total{instance=~\"$host\",device!=\"lo\"}[5m]) ", "format": "time_series", "intervalFactor": 1, "legendFormat": "IN: {{device}}", "refId": "A" }, { "expr": "rate(node_network_transmit_errs_total{instance=~\"$host\",device!=\"lo\"}[5m]) ", "format": "time_series", "intervalFactor": 1, "legendFormat": "OUT: {{device}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network IN/OUT Errors", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": 
null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 6, "grid": {}, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 19 }, "id": 21, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_network_receive_bytes_total{instance=\"$host\", device!=\"lo\"}[$interval]) or irate(node_network_receive_bytes_total{instance=\"$host\", device!=\"lo\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Inbound: {{ device }}", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_network_transmit_bytes_total{instance=\"$host\", device!=\"lo\"}[$interval]) or irate(node_network_transmit_bytes_total{instance=\"$host\", device!=\"lo\"}[5m])", "format": "time_series", "intervalFactor": 1, 
"legendFormat": "Outbound: {{ device }}", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network IN/OUT Traffic", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "bytes", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "description": "Network Packets for IN/OUT.\n- Critical:\n - If is = 0 for IN/OUT.", "editable": true, "error": false, "fill": 6, "grid": {}, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 24 }, "id": 108, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": 
"rate(node_network_receive_packets_total{instance=~\"$host\",device!=\"lo\"}[$interval]) or irate(node_network_receive_packets_total{instance=~\"$host\",device!=\"lo\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Inbound: {{ device }}", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 5, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "rate(node_network_transmit_packets_total{instance=~\"$host\",device!=\"lo\"}[$interval]) or irate(node_network_transmit_packets_total{instance=~\"$host\",device!=\"lo\"}[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "Outbound: {{ device }}", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network IN/OUT Packets", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "pps", "label": "", "logBase": 1, "max": null, 
"min": null, "show": true }, { "format": "bytes", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "description": "- node_network_transmit_queue_length = transmit_queue_length value of /sys/class/net/.", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 29 }, "id": 110, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_network_transmit_queue_length{instance=~\"$host\",interface!=\"lo\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "MTU: [{{interface}}]", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network Interface Speed", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "decmbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 6, "grid": {}, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 33 }, "id": 22, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": true, 
"rightSide": true, "show": true, "sort": "min", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "sum(increase(node_network_receive_bytes_total{instance=\"$host\", device!=\"lo\"}[1h]))", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Received", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 3600, "target": "" }, { "calculatedInterval": "2s", "datasourceErrors": {}, "errors": {}, "expr": "sum(increase(node_network_transmit_bytes_total{instance=\"$host\", device!=\"lo\"}[1h]))", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Sent", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_memory_MemTotal%7Binstance%3D%5C%22%24host%5C%22%7D%20-%20(node_memory_MemFree%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Buffers%7Binstance%3D%5C%22%24host%5C%22%7D%20%2B%20node_memory_Cached%7Binstance%3D%5C%22%24host%5C%22%7D)%22%2C%22range_input%22%3A%22900s%22%2C%22end_input%22%3A%222015-10-22%2015%3A25%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 3600, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, "title": "Network Utilization Hourly", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "bytes", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Network", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }, "id": 46, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "description": "Number of TCP sockets in state inuse.", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 10 }, "id": 32, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_sockstat_TCP_inuse{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "TCP In Use", "metric": "", "refId": "A", "step": 5 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TCP In Use", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": 
"${DS_TIDB-CLUSTER}", "decimals": 0, "description": "", "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 14 }, "id": 39, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_netstat_Tcp_RetransSegs{instance=~\"$host\"}[$interval]) or irate(node_netstat_Tcp_RetransSegs{instance=~\"$host\"}[5m])", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "RetransSegs - Segments retransmitted", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Segments retransmitted", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 0, "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 14 }, "id": 126, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"node_netstat_Tcp_CurrEstab{instance=~\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "CurrEstab", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TCP Connections", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "TCP", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, "id": 50, "panels": [ { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "decimals": 2, "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 11 }, "id": 20, "instanceColors": {}, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": false, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": false, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "color": "#E24D42", "instance": "Processes blocked waiting for I/O to complete" }, { "color": "#6ED0E0", "instance": "Processes in runnable state" } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "node_procs_running{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Processes in runnable state", "metric": "", "prometheusLink": 
"/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_procs_running%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%2243200s%22%2C%22end_input%22%3A%222015-9-18%2013%3A46%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "A", "step": 5, "target": "" }, { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "node_procs_blocked{instance=\"$host\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "Processes blocked waiting for I/O to complete", "metric": "", "prometheusLink": "/api/datasources/proxy/1/graph#%5B%7B%22expr%22%3A%22node_procs_blocked%7Binstance%3D%5C%22%24host%5C%22%7D%22%2C%22range_input%22%3A%2243200s%22%2C%22end_input%22%3A%222015-9-18%2013%3A46%22%2C%22step_input%22%3A%22%22%2C%22stacked%22%3Atrue%2C%22tab%22%3A0%7D%5D", "refId": "B", "step": 5, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Processes", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "none", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Processes", "type": "row" } ], "refresh": "30s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allFormat": "glob", "auto": true, "auto_count": 200, "auto_min": "1s", "current": { "text": "1m", "value": "1m" }, "datasource": "tidb-cluster", "hide": 0, "includeAll": false, "label": "Interval", "multi": false, "multiFormat": "glob", "name": "interval", "options": [ { "selected": false, "text": "auto", "value": "$__auto_interval_interval" }, { "selected": false, "text": "1s", "value": "1s" }, { "selected": false, "text": "5s", "value": 
"5s" }, { "selected": true, "text": "1m", "value": "1m" }, { "selected": false, "text": "5m", "value": "5m" }, { "selected": false, "text": "1h", "value": "1h" }, { "selected": false, "text": "6h", "value": "6h" }, { "selected": false, "text": "1d", "value": "1d" } ], "query": "1s,5s,1m,5m,1h,6h,1d", "refresh": 2, "skipUrlSync": false, "type": "interval" }, { "allFormat": "glob", "allValue": null, "current": {}, "datasource": "${DS_TIDB-CLUSTER}", "definition": "label_values(node_boot_time_seconds,instance)", "hide": 0, "includeAll": false, "label": "Host", "multi": false, "multiFormat": "regex values", "name": "host", "options": [], "query": "label_values(node_boot_time_seconds,instance)", "refresh": 1, "refresh_on_load": false, "regex": "", "skipUrlSync": false, "sort": 3, "tagValuesQuery": "instance", "tags": [], "tagsQuery": "up", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "collapse": false, "enable": true, "notice": false, "now": true, "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "status": "Stable", "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ], "type": "timepicker" }, "timezone": "browser", "title": "Tidb-Cluster-Node_exporter", "uid": "000000001", "version": 9 } ================================================ FILE: scripts/overview.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" }, { "type": "panel", "id": "table", "name": "Table", 
"version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1577357354898, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 80, "panels": [ { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, "hideTimeOverride": true, "id": 76, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": null, "desc": false }, "styles": [ { "alias": "Time", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "type": "date" }, { "alias": "Service", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "pattern": "Metric", "thresholds": [], "type": "string", "unit": "short" }, { "alias": "Up", "colorMode": "cell", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "pattern": "Current", "thresholds": [ "0", "1" ], "type": "number", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/.*/", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "\ncount(probe_success{group=\"tidb\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "TiDB", "refId": "A" }, { "expr": "\ncount(probe_success{group=\"pd\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "PD", "refId": "B" }, { "expr": "\ncount(probe_success{group=\"tikv\"} == 1)", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "TiKV", "refId": "C" }, { "expr": "\ncount(probe_success{group=\"pump\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pump", "refId": "D" }, { "expr": "\ncount(probe_success{group=\"drainer\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Drainer", "refId": "E" }, { "expr": "\ncount(probe_success{group=\"kafka\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Kafka", "refId": "F" }, { "expr": "\ncount(probe_success{group=\"zookeeper\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Zookeeper", "refId": "G" }, { "expr": "\ncount(probe_success{group=\"node_exporter\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Node_exporter", "refId": "H" }, { "expr": "\ncount(probe_success{group=\"blackbox_exporter\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blackbox_exporter", "refId": "I" }, { "expr": "\ncount(probe_success{group=\"grafana\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Grafana", "refId": "J" }, { "expr": "\ncount(probe_success{job=\"blackbox_exporter_http\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pushgateway", "refId": "K" }, { "expr": "\ncount(probe_success{group=\"kafka_exporter\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Kafka_exporter", "refId": "L" }, { "expr": "\ncount(probe_success{group=\"tiflash\"} == 1)", "format": "time_series", "intervalFactor": 2, "legendFormat": "TiFlash", "refId": "M" } ], "timeFrom": "1s", "title": "", "transform": "timeseries_aggregations", "type": "table" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, "hideTimeOverride": true, "id": 77, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": null, "desc": false }, "styles": [ 
{ "alias": "Time", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "type": "date" }, { "alias": "Service", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "pattern": "Metric", "thresholds": [], "type": "string", "unit": "short" }, { "alias": "Down", "colorMode": "cell", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "pattern": "Current", "thresholds": [ "100", "200" ], "type": "number", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/.*/", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "\ncount(probe_success{group=\"tidb\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "TiDB", "refId": "A" }, { "expr": "\ncount(probe_success{group=\"pd\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "PD", "refId": "B" }, { "expr": "\ncount(probe_success{group=\"tikv\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "TiKV", "refId": "C" }, { "expr": "\ncount(probe_success{group=\"pump\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pump", "refId": "D" }, { "expr": "\ncount(probe_success{group=\"drainer\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Drainer", "refId": "E" }, { "expr": "\ncount(probe_success{group=\"kafka\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Kafka", "refId": "F" }, { "expr": "\ncount(probe_success{group=\"zookeeper\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Zookeeper", "refId": "G" }, { "expr": "\ncount(probe_success{group=\"node_exporter\"} == 0)", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "Node_exporter", "refId": "H" }, { "expr": "\ncount(probe_success{group=\"blackbox_exporter\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Blackbox_exporter", "refId": "I" }, { "expr": "\ncount(probe_success{group=\"grafana\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Grafana", "refId": "J" }, { "expr": "\ncount(probe_success{job=\"blackbox_exporter_http\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pushgateway", "refId": "K" }, { "expr": "\ncount(probe_success{group=\"kafka_exporter\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Kafka_exporter", "refId": "L" }, { "expr": "\ncount(probe_success{group=\"tiflash\"} == 0)", "format": "time_series", "intervalFactor": 2, "legendFormat": "TiFlash", "refId": "M" } ], "timeFrom": "1s", "title": "", "transform": "timeseries_aggregations", "type": "table" } ], "repeat": null, "title": "Services Port Status", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 81, "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 4, "x": 0, "y": 2 }, "id": 29, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": 
"delta(pd_tso_events{type=\"save\",instance=\"$instance\"}[1m]) > bool 0", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "", "metric": "pd_server_tso", "refId": "A", "step": 60 } ], "thresholds": "0,2", "title": "PD role", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "Leader", "value": "1" }, { "op": "=", "text": "Follower", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "editable": true, "error": false, "format": "decbytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": false }, "gridPos": { "h": 7, "w": 4, "x": 4, "y": 2 }, "id": 27, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(77, 135, 25, 0.18)", "full": true, "lineColor": "rgb(21, 179, 65)", "show": true }, "tableColumn": "", "targets": [ { "expr": "pd_cluster_status{instance=\"$instance\",type=\"storage_capacity\"}", "format": "time_series", "intervalFactor": 2, "refId": "A", "step": 60 } ], "thresholds": "", "title": "Storage capacity", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, 
"format": "decbytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 4, "x": 8, "y": 2 }, "hideTimeOverride": false, "id": 28, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "pd_cluster_status{instance=\"$instance\",type=\"storage_size\"}", "intervalFactor": 2, "refId": "A", "step": 60 } ], "thresholds": "", "title": "Current storage size", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 4, "x": 12, "y": 2 }, "id": 30, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": 
"pd_cluster_status{instance=\"$instance\", type=\"leader_count\"}", "intervalFactor": 2, "refId": "A", "step": 60 } ], "thresholds": "", "title": "Number of Regions", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgb(255, 255, 255)", "rgba(255, 255, 255, 0.89)", "rgb(255, 255, 255)" ], "datasource": "${DS_TEST-CLUSTER}", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 4, "x": 16, "y": 2 }, "id": 65, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_up_count\"})", "format": "time_series", "interval": "15s", "intervalFactor": 2, "refId": "A" } ], "thresholds": "", "title": "Normal stores", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fontSize": "100%", "gridPos": { "h": 7, "w": 4, "x": 20, "y": 2 }, "hideTimeOverride": true, "id": 18, "links": [], "pageSize": null, "scroll": false, "showHeader": true, "sort": { "col": null, "desc": false }, "styles": [ { "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Metric", "sanitize": false, "type": 
"string" }, { "colorMode": "cell", "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "decimals": 0, "pattern": "Current", "thresholds": [ "1", "2" ], "type": "number", "unit": "short" } ], "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_disconnected_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Disconnect Stores", "refId": "B", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_unhealth_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Unhealth Stores", "refId": "C", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_low_space_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "LowSpace Stores", "refId": "D", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_down_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Down Stores", "refId": "E", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_offline_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Offline Stores", "refId": "F", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_tombstone_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Tombstone Stores", "refId": "G", "step": 20 } ], "timeFrom": "1s", "title": "Abnormal stores", "transform": "timeseries_aggregations", "type": "table" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 24, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sortDesc": true, "total": false, "values": 
true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{instance=\"$instance\"}[5m])) by (grpc_method, le))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{grpc_method}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% completed cmds duration seconds", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 32, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.98, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket[30s])) by (type, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-98%", "refId": "A", "step": 10 }, { "expr": 
"avg(rate(pd_client_request_handle_requests_duration_seconds_sum[30s])) by (type) / avg(rate(pd_client_request_handle_requests_duration_seconds_count[30s])) by (type)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{type}}-average", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Handle requests duration seconds", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 16 }, "id": 66, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_regions_status{instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" }, { "expr": "sum(pd_regions_status) by (instance, type)", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region health", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, 
"values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 68, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{instance=\"$instance\",type=\"hot_write_region_as_leader\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Hot write Region's leader distribution", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": 
false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{instance=\"$instance\",type=\"hot_read_region_as_leader\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Hot read Region's leader distribution", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "id": 33, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_scheduler_region_heartbeat{instance=\"$instance\", type=\"report\", status=\"ok\"}[1m])) by (store)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region 
heartbeat report", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(pd_scheduler_region_heartbeat_latency_seconds_bucket[5m])) by (store, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Region heartbeat latency", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ms", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "ms", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "PD", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 82, "panels": [ { "aliasColors": {}, "bars": 
false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 3 }, "id": 2, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_executor_statement_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Statement OPS", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 2, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 3 }, "id": 34, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, 
"targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 3, "legendFormat": "99", "refId": "B", "step": 15 }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 10 }, "id": 35, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"rate(tidb_server_query_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{type}} {{result}}", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS By Instance", "tooltip": { "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 10 }, "id": 72, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_server_execute_error_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": " {{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Failed Query OPM", "tooltip": { "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 2, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } 
}, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 17 }, "id": 4, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": true, "steppedLine": true, "targets": [ { "expr": "tidb_server_connections", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 }, { "expr": "sum(tidb_server_connections)", "format": "time_series", "intervalFactor": 2, "legendFormat": "total", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Connection Count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 17 }, "id": 36, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 
5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "go_memstats_heap_inuse_bytes{job=~\"tidb.*\"}", "intervalFactor": 2, "legendFormat": "{{instance}}-{{job}}", "metric": "go_memstats_heap_inuse_bytes", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Heap Memory Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 24 }, "id": 70, "legend": { "avg": false, "current": true, "max": true, "min": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_session_transaction_duration_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, 
"max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 24 }, "id": 71, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }, "id": 37, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": 
true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Cmd OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }, "id": 38, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, "title": "KV Cmd Duration 99", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }, "id": 39, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(pd_client_cmd_handle_cmds_duration_seconds_count{type=\"tso\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "cmd", "refId": "A", "step": 10 }, { "expr": "sum(rate(pd_client_request_handle_requests_duration_seconds_count{type=\"tso\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "request", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, 
"dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }, "id": 40, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO Wait Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 45 }, "id": 41, "legend": { "alignAsTable": true, "avg": false, "current": 
true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_region_err_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tidb_server_session_execute_parse_duration_count", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TiClient Region Error OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 45 }, "id": 42, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_lock_resolver_actions_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tidb_tikvclient_lock_resolver_actions_total", "refId": 
"A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Lock Resolve OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 52 }, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_domain_load_schema_duration_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load Schema Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, 
"alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 52 }, "id": 43, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": true, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_backoff_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Backoff OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "TiDB", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 83, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "id": 20, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, 
"points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{type=\"leader\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_raftstore_region_count", "refId": "A", "step": 10 }, { "expr": "delta(tikv_raftstore_region_count{type=\"leader\"}[30s]) < -10", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "total", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "leader", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "id": 21, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{type=\"region\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": 
[], "timeShift": null, "title": "region", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, "id": 75, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{job=\"tikv\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, "id": 74, "legend": { "alignAsTable": true, 
"avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(process_resident_memory_bytes{job=\"tikv\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 5, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, "id": 44, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_engine_size_bytes) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "store size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, 
"type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, "id": 73, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_engine_size_bytes) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "cf size", "tooltip": { "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 }, "id": 17, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, 
"show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_channel_full_total[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "channel full", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 }, "id": 11, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_server_report_failure_msg_total[1m])) by (type,instance,store_id)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}} - to - {{store_id}}", "metric": 
"tikv_server_raft_store_msg_total", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "server report failures", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 36 }, "id": 46, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_scheduler_contex_total) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "scheduler pending commands", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 36 }, "id": 51, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_executor_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_coprocessor_request_error", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "coprocessor executor count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 5, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 44 }, "id": 47, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": null, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tikv_coprocessor_request_duration_seconds_bucket[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_duration_seconds_bucket[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-95%", "refId": "B", "step": 10 }, { "expr": " sum(rate(tikv_coprocessor_request_duration_seconds_sum{req=\"select\"}[1m])) / sum(rate(tikv_coprocessor_request_duration_seconds_count{req=\"select\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "select-avg", "refId": "C", "step": 10 }, { "expr": " sum(rate(tikv_coprocessor_request_duration_seconds_sum{req=\"index\"}[1m])) / sum(rate(tikv_coprocessor_request_duration_seconds_count{req=\"index\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "index-avg", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "coprocessor request duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 44 }, "id": 48, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": 
"flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{name=~\"raftstore_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "raft store CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 52 }, "id": 49, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{name=~\"cop_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_coprocessor_request_duration_seconds_bucket", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ 
{ "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "TiKV", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 84, "panels": [ { "columns": [], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 5, "w": 6, "x": 0, "y": 5 }, "hideTimeOverride": false, "id": 57, "links": [], "pageSize": 4, "scroll": true, "showHeader": true, "sort": { "col": null, "desc": false }, "styles": [ { "alias": "CPU Num", "colorMode": "value", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(30, 232, 22, 0.97)" ], "decimals": 0, "link": false, "pattern": "Value", "thresholds": [ "0", "1" ], "type": "number", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "Time", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "Host", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "instance", "thresholds": [], "type": "string", "unit": "short" } ], "targets": [ { "expr": "count(node_cpu_seconds_total{mode=\"user\"}) by (instance)", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "{{ instance }}", "refId": "A", "step": 2 } ], "timeFrom": null, "title": "Vcores", "transform": "table", "type": "table" }, { "columns": [], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 5, "w": 6, "x": 6, "y": 5 }, "hideTimeOverride": true, "id": 59, "links": [], "pageSize": 4, "scroll": true, "showHeader": true, "sort": { "col": null, 
"desc": false }, "styles": [ { "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Time", "sanitize": false, "type": "hidden" }, { "alias": "Host", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "pattern": "__name__", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "Host", "colorMode": "value", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "pattern": "instance", "thresholds": [ "0", "1" ], "type": "string", "unit": "bytes" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "job", "thresholds": [], "type": "hidden", "unit": "short" }, { "alias": "Total Memory", "colorMode": "value", "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "mappingType": 1, "pattern": "Value", "thresholds": [ "0", "1" ], "type": "number", "unit": "bytes" } ], "targets": [ { "expr": "node_memory_MemTotal_bytes", "format": "table", "instant": true, "intervalFactor": 2, "legendFormat": "{{ instance }}", "refId": "A", "step": 2 } ], "timeFrom": "1s", "title": "Memory", "transform": "table", "type": "table" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 5 }, "id": 55, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "100 - avg by (instance) (irate(node_cpu_seconds_total{mode=\"idle\"}[1m]) ) * 100", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "percent", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 10 }, "id": 78, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_load1", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load [1m]", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 10 }, "id": 58, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "node_memory_MemAvailable_bytes", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ instance }}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory Available", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 15 }, "id": 79, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_network_receive_bytes_total{device!=\"lo\"}[5m])", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "Inbound: {{instance}}", "refId": "A" }, { "expr": "irate(node_network_transmit_bytes_total{device!=\"lo\"}[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "Outbound: {{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Network Traffic", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 15 }, "id": 60, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_netstat_Tcp_RetransSegs[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - TCPSlowStartRetrans", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TCP Retrans", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": 
"short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "fill": 1, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 20 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IO Util", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "System Info", "type": "row" } ], "refresh": "30s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "instance", "options": [], "query": "label_values(pd_cluster_status, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": 
"now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-Overview", "uid": "eDbRZpnWk", "version": 1 } ================================================ FILE: scripts/pd.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" }, { "type": "panel", "id": "table", "name": "Table", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1564738811479, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 118, "panels": [ { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "description": "It indicates whether the current PD is the leader or a follower.", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 6, "w": 4, "x": 0, "y": 1 }, "id": 55, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": 
"range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "1", "text": "Leader", "to": "100000" }, { "from": "0", "text": "Follower", "to": "1" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "delta(pd_tso_events{type=\"save\",instance=\"$instance\"}[1m]) > bool 0", "format": "time_series", "intervalFactor": 2, "legendFormat": "", "metric": "pd_server_tso", "refId": "A", "step": 40 } ], "thresholds": "", "title": "PD role", "type": "singlestat", "valueFontSize": "50%", "valueMaps": [ { "op": "=", "text": "Follower", "value": "null" }, { "op": "=", "text": "Leader", "value": "1" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The total capacity size of the cluster", "editable": true, "error": false, "format": "decbytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": false }, "gridPos": { "h": 6, "w": 4, "x": 4, "y": 1 }, "id": 10, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(77, 135, 25, 0.18)", "full": true, "lineColor": "rgb(21, 179, 65)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\",type=\"storage_capacity\"})", "format": 
"time_series", "intervalFactor": 2, "refId": "A", "step": 40 } ], "thresholds": "", "title": "Storage capacity", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The current storage size of the cluster", "editable": true, "error": false, "format": "decbytes", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 6, "w": 4, "x": 8, "y": 1 }, "hideTimeOverride": false, "id": 38, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\",type=\"storage_size\"})", "format": "time_series", "intervalFactor": 2, "refId": "A", "step": 40 } ], "thresholds": "", "title": "Current storage size", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": true, "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "datasource": "${DS_TEST-CLUSTER}", "description": "The current storage size and used ratio of the cluster", "editable": true, "error": false, "format": "percentunit", "gauge": { "maxValue": 1, 
"minValue": 0, "show": true, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 6, "w": 4, "x": 12, "y": 1 }, "hideTimeOverride": false, "id": 37, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\",type=\"storage_size\"}) / sum(pd_cluster_status{instance=\"$instance\",type=\"storage_capacity\"})", "format": "time_series", "intervalFactor": 2, "refId": "A", "step": 40 } ], "thresholds": "0.01,0.5", "title": "Current storage used", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "#d44a3a", "rgba(237, 129, 40, 0.89)", "#299c46" ], "datasource": "${DS_TEST-CLUSTER}", "description": "The count of healthy stores", "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 6, "w": 4, "x": 16, "y": 1 }, "id": 97, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "connected", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": 
false }, "tableColumn": "", "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_up_count\"})", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": "0,1", "title": "Normal stores", "type": "singlestat", "valueFontSize": "100%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "description": "The total number of Regions without replicas", "editable": true, "error": false, "format": "none", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": false }, "gridPos": { "h": 6, "w": 4, "x": 20, "y": 1 }, "id": 20, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": true, "lineColor": "rgb(31, 120, 193)", "show": true }, "tableColumn": "", "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\",type=\"leader_count\"})", "format": "time_series", "intervalFactor": 2, "refId": "A", "step": 40 } ], "thresholds": "", "title": "Number of Regions", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "90%", "gridPos": { "h": 7, "w": 6, "x": 0, "y": 7 }, "hideTimeOverride": true, "id": 116, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { 
"col": 1, "desc": true }, "styles": [ { "alias": "Option", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Metric", "preserveFormat": false, "type": "string" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/.*/", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "pd_config_status", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "timeFrom": "1s", "title": "PD scheduler config", "transform": "timeseries_aggregations", "type": "table" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 7, "w": 5, "x": 6, "y": 7 }, "hideTimeOverride": true, "id": 103, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": 0, "desc": false }, "styles": [ { "alias": "Type", "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "type", "type": "date" }, { "alias": "Numbers", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "pattern": "Current", "thresholds": [], "type": "number", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/.*/", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "pd_regions_label_level{instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "timeFrom": "1s", "timeShift": null, "title": "Region label isolation level", "transform": "timeseries_aggregations", "type": "table" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "100%", "gridPos": { "h": 7, "w": 6, "x": 11, "y": 7 }, 
"hideTimeOverride": true, "id": 117, "links": [], "pageSize": null, "scroll": true, "showHeader": true, "sort": { "col": 0, "desc": true }, "styles": [ { "alias": "Label : address", "dateFormat": "YYYY-MM-DD HH:mm:ss", "link": false, "linkUrl": "", "pattern": "Metric", "thresholds": [ "un" ], "type": "string" }, { "alias": "count number", "colorMode": null, "colors": [ "#bf1b00", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 0, "pattern": "Current", "thresholds": [ "0" ], "type": "number", "unit": "short" }, { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "pattern": "/.*/", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "pd_cluster_placement_status", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A" } ], "timeFrom": "1s", "title": "Label distribution", "transform": "timeseries_aggregations", "type": "table" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fontSize": "100%", "gridPos": { "h": 7, "w": 7, "x": 17, "y": 7 }, "hideTimeOverride": true, "id": 96, "links": [], "pageSize": null, "scroll": false, "showHeader": true, "sort": { "col": null, "desc": false }, "styles": [ { "dateFormat": "YYYY-MM-DD HH:mm:ss", "pattern": "Metric", "sanitize": false, "type": "string" }, { "colorMode": "cell", "colors": [ "rgba(50, 172, 45, 0.97)", "rgba(237, 129, 40, 0.89)", "rgba(245, 54, 54, 0.9)" ], "decimals": 0, "pattern": "Current", "thresholds": [ "1", "2" ], "type": "number", "unit": "short" } ], "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_disconnected_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Disconnect Stores", "refId": "B", "step": 20 }, { "expr": 
"sum(pd_cluster_status{instance=\"$instance\", type=\"store_unhealth_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Unhealth Stores", "refId": "C", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_low_space_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "LowSpace Stores", "refId": "D", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_down_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Down Stores", "refId": "E", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_offline_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Offline Stores", "refId": "F", "step": 20 }, { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"store_tombstone_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "Tombstone Stores", "refId": "G", "step": 20 } ], "timeFrom": "1s", "title": "Abnormal stores", "transform": "timeseries_aggregations", "type": "table" }, { "columns": [ { "text": "Current", "value": "current" } ], "datasource": "${DS_TEST-CLUSTER}", "fontSize": "90%", "gridPos": { "h": 7, "w": 4, "x": 0, "y": 14 }, "hideTimeOverride": true, "id": 115, "links": [], "pageSize": null, "scroll": false, "showHeader": true, "sort": { "col": 0, "desc": true }, "styles": [ { "alias": "", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "decimals": 2, "link": false, "pattern": "Value", "thresholds": [], "type": "number", "unit": "none" }, { "alias": "Meta", "colorMode": null, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "dateFormat": "YYYY-MM-DD HH:mm:ss", "decimals": 2, "pattern": "Metric", "thresholds": [], "type": "number", "unit": "short" } ], "targets": [ { "expr": "pd_cluster_metadata{instance=\"$instance\"}", "format": "time_series", 
"instant": true, "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "timeFrom": "1s", "title": "pd_cluster_metadata", "transform": "timeseries_aggregations", "type": "table" }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The current peer count of the cluster", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 4, "y": 14 }, "id": 18, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 3, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum(pd_cluster_status{instance=\"$instance\", type=\"region_count\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "count", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Current peer count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 100 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "B", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "message": "Regions are unhealthy", "name": "region health alert", "noDataState": "keep_state", "notifications": [] }, 
"aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "It records the unusual Regions' count which may include pending peers, down peers, extra peers, offline peers, missing peers or learner peers", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }, "id": 72, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_regions_status{instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" }, { "expr": "sum(pd_regions_status) by (instance, type)", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "B" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 100 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region health", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Cluster", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 119, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of different operators that are newly created", "fill": 0, "gridPos": { 
"h": 7, "w": 12, "x": 0, "y": 2 }, "id": 45, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_schedule_operators_count{instance=\"$instance\", event=\"create\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schedule operator create", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of different operators that have been checked. 
It mainly checks if the current step is finished; if yes, it returns the next step to be executed.", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 2 }, "id": 79, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_schedule_operators_count{instance=\"$instance\", event=\"check\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schedule operator check", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of different operators that are finished", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 77, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, 
"steppedLine": false, "targets": [ { "expr": "sum(delta(pd_schedule_operators_count{instance=\"$instance\", event=\"finish\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schedule operator finish", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 78, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_schedule_operators_count{instance=\"$instance\", event=\"timeout\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schedule operator timeout", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { 
"format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of different operators that are replaced or canceled", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 80, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_schedule_operators_count{instance=\"$instance\", event=\"cancel\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 }, { "expr": "sum(delta(pd_schedule_operators_count{instance=\"$instance\", event=\"replace\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schedule operator replaced or canceled", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of operators in different status", "fill": 0, "gridPos": { 
"h": 7, "w": 12, "x": 12, "y": 16 }, "id": 47, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_schedule_operators_count{instance=\"$instance\"}[1m])) by (event)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{event}}", "metric": "pd_scheduler_status", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schedule operators count by state", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when the operator is finished in .99", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(pd_schedule_finish_operators_duration_seconds_bucket[5m])) by (type, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Operator finish duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when the operator is finished in .50", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 68, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.5, sum(rate(pd_schedule_finish_operators_duration_seconds_bucket[5m])) by (type, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "50% Operator finish duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": 
null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when the operator step is finished in .99", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "id": 81, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(pd_schedule_finish_operator_steps_duration_seconds_bucket[5m])) by (type, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Operator step finish duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dtdurations", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when the operator step is finished in .50", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "id": 82, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, 
"max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.5, sum(rate(pd_schedule_finish_operator_steps_duration_seconds_bucket[5m])) by (type, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "50% Operator step finish duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dtdurations", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Operator", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 120, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The capacity size of each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 3 }, "id": 83, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"store_capacity\"}", "format": "time_series", "hide": false, "instant": false, "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store capacity", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "percent", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The available capacity size of each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 3 }, "id": 91, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "{store=~\"$store\", instance=\"$instance\", type=\"store_available\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store available", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, 
"min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The used capacity size of each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 9 }, "id": 90, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"store_used\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store used", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "It is equal to Store available capacity size over Store capacity size for each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 9 }, "id": 84, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": 
false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"store_available\"}) by (address, store) / sum(pd_scheduler_store_status{instance=\"$instance\", type=\"store_capacity\"}) by (address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store available ratio", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The size amplification, which is equal to Store Region size over Store used capacity size, of each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 15 }, "id": 85, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 3, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"region_size\"}) by (address, store) / sum(pd_scheduler_store_status{instance=\"$instance\", type=\"store_used\"}) by (address, store) * 2^20", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Size amplification", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The Region score of each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 15 }, "id": 41, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"region_score\"}", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 }, { "expr": "pd_scheduler_op_influence{instance=\"$instance\",scheduler=\"balance-region-scheduler\"}", 
"format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "op-influence-{{store}}-{{type}}", "refId": "C" }, { "expr": "pd_scheduler_tolerant_resource{instance=\"$instance\",scheduler=\"balance-region-scheduler\"}", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "tolerant-resource-{{source}}-{{target}}", "refId": "E" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1000000000 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store Region score", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The leader score of each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 21 }, "id": 40, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"leader_score\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 
4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1000000000 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store leader score", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The total Region size of each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 21 }, "id": 57, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"region_size\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store Region size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decmbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": 
null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The total leader size of each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 27 }, "id": 56, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"leader_size\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store leader size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decmbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The Region count of each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 27 }, "id": 59, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort":
"current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"region_count\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store Region count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The leader count of each TiKV instance", "fill": 0, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 33 }, "id": 58, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", instance=\"$instance\", type=\"leader_count\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": 
null, "timeRegions": [], "timeShift": null, "title": "Store leader count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Statistics - balance", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 121, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "The total number of leader Regions under hot write on each TiKV instance", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 83 }, "id": 50, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{store=~\"$store\", instance=\"$instance\", type=\"hot_write_region_as_leader\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Hot write Region's leader distribution", "tooltip": { "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, 
"logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "The total number of Regions which are not leader under hot write on each TiKV instance", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 83 }, "id": 51, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{store=~\"$store\", instance=\"$instance\", type=\"hot_write_region_as_peer\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Hot write Region's peer distribution", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The total bytes of hot write on leader Regions for each TiKV instance", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 90 }, "id": 48, "legend": { "alignAsTable": true, "avg": false, "current": true, 
"hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{store=~\"$store\", instance=\"$instance\", type=\"total_written_bytes_as_leader\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "metric": "pd_hotspot_status", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Hot write Region's leader written bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The total bytes of hot write on Regions which are not leader for each TiKV instance", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 90 }, "id": 49, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{store=~\"$store\", 
instance=\"$instance\", type=\"total_written_bytes_as_peer\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Hot write Region's peer written bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "The total number of leader Regions under hot read on each TiKV instance", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 97 }, "id": 60, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{store=~\"$store\", instance=\"$instance\", type=\"hot_read_region_as_leader\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Hot read Region's leader distribution", "tooltip": { "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short",
"label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The total bytes of hot read on leader Regions for each TiKV instance", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 97 }, "id": 62, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{store=~\"$store\", instance=\"$instance\", type=\"total_read_bytes_as_leader\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "metric": "pd_hotspot_status", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Hot read Region's leader read bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 104 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, 
"rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", type=\"store_write_rate_bytes\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store write rate bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 104 }, "id": 105, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", type=\"store_read_rate_bytes\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, "title": "Store read rate bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 111 }, "id": 106, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", type=\"store_write_rate_keys\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store write rate keys", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 111 }, "id": 107, "legend": { 
"alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_scheduler_store_status{store=~\"$store\", type=\"store_read_rate_keys\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store read rate keys", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "title": "Statistics - hotspot", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 122, "panels": [ { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The current running schedulers", "fill": 0, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 112 }, "id": 46, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": false, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": 
"pd_scheduler_status{type=\"allow\",instance=\"$instance\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{kind}}", "metric": "pd_scheduler_status", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler is running", "tooltip": { "shared": true, "sort": 1, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The leader movement details among TiKV instances", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 120 }, "id": 87, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "total", "sortDesc": true, "total": true, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "- sum(delta(pd_scheduler_balance_leader{store=~\"$store\", address=~\".*out\",instance=\"$instance\", type=\"move-leader\"}[30s])) by (address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "A" }, { "expr": "sum(delta(pd_scheduler_balance_leader{store=~\"$store\", address=~\".*in\",instance=\"$instance\", type=\"move-leader\"}[30s])) by (address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": 
"Balance leader movement", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The Region movement details among TiKV instances", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 120 }, "hideTimeOverride": false, "id": 86, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sort": "total", "sortDesc": true, "total": true, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "-sum(delta(pd_scheduler_balance_region{store=~\"$store\", address=~\".*out\",instance=\"$instance\", type=\"move-peer\"}[1m])) by (address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "A" }, { "expr": "sum(delta(pd_scheduler_balance_region{store=~\"$store\", address=~\".*in\",instance=\"$instance\", type=\"move-peer\"}[1m])) by (address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Balance Region movement", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 
1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The count of balance leader events", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 128 }, "id": 89, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": true, "values": true }, "lines": false, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_scheduler_balance_leader{store=~\"$store\", instance=\"$instance\", type!=\"move-leader\"}[30s])) by (type, address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Balance leader event", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The count of balance Region events", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 128 }, "id": 88, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "sortDesc": false, "total": true, "values": true }, "lines": 
false, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_scheduler_balance_region{store=~\"$store\", instance=\"$instance\", type!=\"move-peer\"}[30s])) by (type, address, store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-store-{{store}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Balance Region event", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The inner status of balance leader scheduler", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 136 }, "id": 52, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_scheduler_event_count{instance=\"$instance\", type=\"balance-leader-scheduler\"}[5m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "metric": "pd_scheduler_event_count", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], 
"timeShift": null, "title": "Balance leader scheduler", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The inner status of balance Region scheduler", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 136 }, "id": 53, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_scheduler_event_count{instance=\"$instance\", type=\"balance-region-scheduler\"}[5m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "metric": "pd_scheduler_event_count", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Balance Region scheduler", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", 
"description": "The replica checker's status", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 144 }, "id": 70, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_checker_event_count{instance=\"$instance\", type=\"replica_checker\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Replica checker", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The merge checker's status", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 144 }, "id": 71, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_checker_event_count{instance=\"$instance\", type=\"merge_checker\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", 
"refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Region merge checker", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 152 }, "id": 110, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_schedule_filter{store=~\"$store\", action=\"filter-target\"}[1m])) by (store, type, scope)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{scope}}-store-{{store}}-{{type}}", "metric": "pd_scheduler_event_count", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Filter target", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, 
"gridPos": { "h": 8, "w": 12, "x": 12, "y": 152 }, "id": 109, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_schedule_filter{store=~\"$store\", action=\"filter-source\"}[1m])) by (store, type, scope)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{scope}}-store-{{store}}-{{type}}", "metric": "pd_scheduler_event_count", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Filter source", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 160 }, "id": 108, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(delta(pd_scheduler_balance_direction{instance=\"$instance\"}[1m])) by (type, source, target)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{source}}-{{target}}-{{type}}", "metric": "pd_scheduler_event_count", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Balance Direction", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 160 }, "id": 111, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_schedule_store_limit{store=~\"$store\", type=\"available\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{store}}-available", "metric": "pd_scheduler_event_count", "refId": "A", "step": 4 }, { "expr": "pd_schedule_store_limit{store=~\"$store\", type=\"take\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{store}}-take", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Store Limit", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show":
true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "title": "Scheduler", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 123, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The rate of completing each kind of gRPC commands", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, "id": 1, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(grpc_server_handling_seconds_count{instance=\"$instance\"}[1m])) by (grpc_method)", "intervalFactor": 2, "legendFormat": "{{grpc_method}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Completed commands rate", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", 
"description": "The time consumed of completing each kind of gRPC commands in .99", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{instance=\"$instance\"}[5m])) by (grpc_method, le))", "intervalFactor": 2, "legendFormat": "{{grpc_method}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Completed commands duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "gRPC", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 124, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The rate of handling etcd transactions", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 162 }, "id": 5, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": 
true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(pd_txn_handle_txns_duration_seconds_count[5m])) by (instance, result)", "intervalFactor": 2, "legendFormat": "{{instance}} : {{result}}", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Handle transactions rate", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed of handling etcd transactions in .99", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 170 }, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(pd_txn_handle_txns_duration_seconds_bucket[5m])) by (instance, result, le))", "intervalFactor": 2, "legendFormat": "{{instance}} {{result}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "99% Handle transactions duration", 
"tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed of writing WAL into the persistent storage in .99", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 170 }, "id": 7, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(etcd_disk_wal_fsync_duration_seconds_bucket[5m])) by (instance, le))", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "99% WAL fsync duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "transparent": false, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The 
latency of the network in .99", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 178 }, "id": 34, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(etcd_network_peer_round_trip_time_seconds_bucket[5m])) by (instance, To, le))", "intervalFactor": 2, "legendFormat": "{{instance}} - {{To}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "99% Peer round trip time seconds", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "transparent": false, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "alignLevel": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": null }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0.1 ], "type": "lt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "delta(etcd_disk_wal_fsync_duration_seconds_count[1m])", "intervalFactor": 2, "legendFormat": "{{instance}} etcd disk wal fsync rate", "refId": "A", "step": 4 }, "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "message": "PD etcd disk fsync may be down.", "name": "etcd disk fsync", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false,
"datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The rate of writing WAL into the persistent storage", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 178 }, "id": 44, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "delta(etcd_disk_wal_fsync_duration_seconds_count[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} etcd disk wal fsync rate", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "lt", "value": 0.1 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "etcd disk wal fsync rate", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The current term of Raft", "fill": 1, "gridPos": { "h": 8, "w": 8, "x": 0, "y": 186 }, "id": 92, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false,
"steppedLine": false, "targets": [ { "expr": "pd_server_etcd_state{type=\"term\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{job}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft term", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The last committed index of Raft", "fill": 1, "gridPos": { "h": 8, "w": 8, "x": 8, "y": 186 }, "id": 93, "legend": { "alignAsTable": true, "alignLevel": null, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_server_etcd_state{type=\"committedIndex\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{job}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft committed index", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, 
"bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The last applied index of Raft", "fill": 1, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 186 }, "id": 94, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_server_etcd_state{type=\"appliedIndex\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{job}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft applied index", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "title": "etcd", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 125, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The count of TiDB requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 8 }, "id": 28, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false,
"renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(pd_client_request_handle_requests_duration_seconds_count[1m])) by (type)", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Handle requests count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed of handling TiDB requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 24, "x": 0, "y": 16 }, "id": 29, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sort": "current", "sortDesc": false, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.98, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket[30s])) by (type, le))", "hide": false, "intervalFactor": 2, "legendFormat": "{{type}} 98th percentile", "refId": "A", "step": 2 }, { "expr": "avg(rate(pd_client_request_handle_requests_duration_seconds_sum[30s])) by (type) / 
avg(rate(pd_client_request_handle_requests_duration_seconds_count[30s])) by (type)", "intervalFactor": 2, "legendFormat": "{{type}} average", "refId": "B", "step": 2 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Handle requests duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "TiDB", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, "id": 126, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The heartbeat latency of each TiKV instance in .99", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 74, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "round(histogram_quantile(0.99, sum(rate(pd_scheduler_region_heartbeat_latency_seconds_bucket{store=~\"$store\"}[5m])) by (address, store, le)), 1000)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Region 
heartbeat latency", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ms", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "ms", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The count of the corresponding schedule commands which PD sends to each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 64, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(pd_scheduler_region_heartbeat{store=~\"$store\", type=\"push\",instance=\"$instance\"}[5m])*60) by (address, store, status)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{address}}-{{status}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region schedule push", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The count of the heartbeats which each TiKV instance reports to PD", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, "id": 54, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_scheduler_region_heartbeat{store=~\"$store\", instance=\"$instance\", type=\"report\", status=\"ok\"}[1m])) by (address, store)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region heartbeat report", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The count of the heartbeats with the ok status", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, "id": 78, "legend": { "alignAsTable": true, "avg": true, 
"current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(pd_scheduler_region_heartbeat{store=~\"$store\", instance=\"$instance\", type=\"report\", status=\"bind\"}[1m])) by (address, store)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region heartbeat report active", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The count of the heartbeats with the error status", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }, "id": 77, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(delta(pd_scheduler_region_heartbeat{store=~\"$store\", instance=\"$instance\", type=\"report\", status=\"err\"}[1m])) by (address, store)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{address}}-store-{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region heartbeat report error", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Heartbeat", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, "id": 127, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 213 }, "id": 112, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_region_syncer_status{type=\"sync_index\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Syncer index", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, 
"yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 213 }, "id": 113, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_region_syncer_status{type=\"last_index\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "History last index", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "title": "Region storage", "type": "row" } ], "refresh": "30s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": false, "label": null, "multi": false, "name": "instance", "options": [], "query": "label_values(pd_cluster_status, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": null, "tags": [], "tagsQuery": null, "type": "query", "useTags": false }, { "allValue": ".*", "current": {}, 
"datasource": "${DS_TEST-CLUSTER}", "definition": "label_values(pd_scheduler_store_status, store)", "hide": 0, "includeAll": true, "label": "store", "multi": true, "name": "store", "options": [], "query": "label_values(pd_scheduler_store_status, store)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-PD", "uid": "Q6RuHYIWk", "version": 5 } ================================================ FILE: scripts/pdn.json ================================================ { "aliasColors": {}, "bars": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 0, "id": 60, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{instance=\"$instance\",type=\"hot_read_region_as_leader\"}", "intervalFactor": 2, "legendFormat": "{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Hot read region's leader distribution", "tooltip": { "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, 
"datasource": "${DS_TIDB-CLUSTER}", "fill": 0, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{instance=\"$instance\",type=\"hot_read_region_as_peer\"}", "intervalFactor": 2, "legendFormat": "{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Hot read region's peer distribution", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 62, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{instance=\"$instance\",type=\"total_read_bytes_as_leader\"}", "intervalFactor": 2, "legendFormat": "{{store}}", "metric": "pd_hotspot_status", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Hot read region's leader read bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "name": null, 
"show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 63, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "pd_hotspot_status{instance=\"$instance\",type=\"total_read_bytes_as_peer\"}", "intervalFactor": 2, "legendFormat": "{{store}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Hot read region's peer read bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, ================================================ FILE: scripts/performance_read.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, 
"hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": null, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 33, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.5 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB query durations by histogram buckets with different percents", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 1 }, "id": 3, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 3, "legendFormat": "99", "refId": "B", "step": 15 }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, 
sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80", "refId": "D" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.5 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Get Token Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Duration (us) for getting token, it should be small until concurrency limit is reached.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 1 }, "id": 31, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_server_get_token_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99", "refId": "A" } ], "thresholds": [ { "colorMode": 
"critical", "fill": true, "line": true, "op": "gt", "value": 2 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Get Token Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 500 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Connection Count alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB current connection counts", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 8 }, "id": 1, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": true, "steppedLine": true, "targets": [ { "expr": "tidb_server_connections", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 }, { "expr": "sum(tidb_server_connections)", "format": "time_series", "intervalFactor": 2, "legendFormat": "total", "refId": "B", "step": 10 } ], "thresholds": [ { 
"colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 500 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Connection Count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1000000000 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" }, { "evaluator": { "params": [ 3000000000 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Heap Memory Usage alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB heap memory size in use ", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 8 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "go_memstats_heap_inuse_bytes{job=~\"tidb.*\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"{{instance}}-{{job}}", "metric": "go_memstats_heap_inuse_bytes", "refId": "A", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1000000000 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Heap Memory Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "TiDB-Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 34, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.01 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Parse Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of parsing SQL to AST", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 2 }, "id": 4, "interval": "", "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": 
false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_parse_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "A", "step": 30 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.01 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Parse Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Parse", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 35, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.03 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Compile Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of building the query plan", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 3 }, "id": 5, "interval": "", "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, 
"links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_compile_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "A", "step": 30 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.03 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Compile Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Compile", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 36, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Bucketed histogram of transaction execution durations, including retry", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 4 }, "id": 30, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{sql_type}}", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95-{{sql_type}}", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{sql_type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 500 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Transaction Statement Num alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB statements numbers within one transaction. 
Internal means TiDB inner transaction", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 4 }, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_transaction_statement_num_bucket[30s])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{sql_type}}", "refId": "A" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_session_transaction_statement_num_bucket[30s])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{sql_type}}", "refId": "B" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 500 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Statement Num", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Transaction Retry Num alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": 
false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB transaction retry histogram bucket statistics", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 4 }, "id": 7, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1.0, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "100", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 3 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Retry Num", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Transaction", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 37, "panels": [ { "aliasColors": {}, "bars": false, 
"dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv command durations statistics by command type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 5 }, "id": 10, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket{type=~\"get|batch_get|seek|seek_reverse\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Cmd Duration 999", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv command durations statistics by command type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 5 }, "id": 11, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], 
"nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket{type=~\"get|batch_get|seek|seek_reverse\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Cmd Duration 99", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 500 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Lock Resolve OPS alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "lock resolve times", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 12 }, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_lock_resolver_actions_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tidb_tikvclient_lock_resolver_actions_total", "refId": "A", "step": 40 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 500 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Lock Resolve OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv command durations statistics by command type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 12 }, "id": 79, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_backoff_seconds_bucket[5m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% 
KV Backoff Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 500 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "KV Backoff OPS alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage backoff times", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 19 }, "id": 9, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": true, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_backoff_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 500 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Backoff OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": 
true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "KV", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 38, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration from a client calling GetTSAsync until it receives the TS result.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 6 }, "id": 12, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO Wait Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", 
"name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0.03 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "PD TSO RPC Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration from a client sending a TSO request until it receives the response.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 6 }, "id": 13, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", 
"refId": "C" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.03 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO RPC Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "PD Client", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 39, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The execution time of gRPC message", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }, "id": 29, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_grpc_msg_duration_seconds_bucket{type=~\"kv_get|kv_batch_get|coprocessor\"}[5m])) by (le, type))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% gRPC message duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", 
"xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "gRPC poll CPU alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of gRPC", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }, "id": 14, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{name=~\"grpc.*\"}[1m])) by (instance, name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ instance }} - {{name}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 3.6 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": 
"percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "gRPC", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 47, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "for": "5m", "frequency": "1m", "handler": 1, "name": "Storage ReadPool CPU alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of readpool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 8 }, "id": 57, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{name=~\"store_read.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 3.6 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage ReadPool CPU", "tooltip": { 
"msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Storage", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, "id": 49, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.05 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "for": "5m", "frequency": "1m", "handler": 1, "name": "Wait duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when coprocessor requests are waiting to be handled", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 53, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-100%", "refId": "D" }, { "expr": "histogram_quantile(0.99, 
sum(rate(tikv_coprocessor_request_wait_seconds_bucket[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.05 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when handling coprocessor requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 51, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_handle_seconds_bucket[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-100%", "refId": "E" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_handle_seconds_bucket[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 4 } ], 
"thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Handle duration", "tooltip": { "msResolution": false, "shared": true, "sort": 1, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "decimals": 1, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "for": "5m", "frequency": "1m", "handler": 1, "name": "Coprocessor CPU alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of coprocessor", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 55, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{name=~\"cop_.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, 
"line": true, "op": "gt", "value": 7.2 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Coprocessor", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, "id": 45, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing get operations", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, "id": 77, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_get_micro_seconds{db=\"kv\",type=\"get_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "max(tikv_engine_get_micro_seconds{db=\"kv\",type=\"get_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "max(tikv_engine_get_micro_seconds{db=\"kv\",type=\"get_percentile95\"})", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "max(tikv_engine_get_micro_seconds{db=\"kv\",type=\"get_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Get duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of get operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }, "id": 75, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_memtable_efficiency{db=\"kv\", type=\"memtable_hit\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "memtable", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=~\"block_cache_data_hit|block_cache_filter_hit\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "block_cache", "metric": "", "refId": "E", "step": 10 }, { "expr": 
"sum(rate(tikv_engine_get_served{db=\"kv\", type=\"get_hit_l0\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "l0", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_get_served{db=\"kv\", type=\"get_hit_l1\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "l1", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_get_served{db=\"kv\", type=\"get_hit_l2_and_up\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "l2_and_up", "refId": "F", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Get operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing seek operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }, "id": 73, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_seek_micro_seconds{db=\"kv\",type=\"seek_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", 
"refId": "A", "step": 10 }, { "expr": "max(tikv_engine_seek_micro_seconds{db=\"kv\",type=\"seek_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "max(tikv_engine_seek_micro_seconds{db=\"kv\",type=\"seek_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "max(tikv_engine_seek_micro_seconds{db=\"kv\",type=\"seek_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Seek duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of seek operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }, "id": 71, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_locate{db=\"kv\", type=\"number_db_seek\"}[1m]))", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "seek", "metric": "", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{db=\"kv\", type=\"number_db_seek_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "seek_found", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{db=\"kv\", type=\"number_db_next\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "next", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{db=\"kv\", type=\"number_db_next_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "next_found", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{db=\"kv\", type=\"number_db_prev\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "prev", "metric": "", "refId": "E", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{db=\"kv\", type=\"number_db_prev_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "prev_found", "metric": "", "refId": "F", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Seek operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The hit rate of block cache", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": 
"current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all", "metric": "", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_data_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_data_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_data_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "data", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_filter_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_filter_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_filter_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "filter", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_index_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_index_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{db=\"kv\", type=\"block_cache_index_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "index", "metric": "", "refId": "C", "step": 10 }, { "expr": 
"sum(rate(tikv_engine_bloom_efficiency{db=\"kv\", type=\"bloom_prefix_useful\"}[1m])) / sum(rate(tikv_engine_bloom_efficiency{db=\"kv\", type=\"bloom_prefix_checked\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "bloom prefix", "metric": "", "refId": "E", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache hit", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "RocksDB-KV", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, "id": 61, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows average latency for Reads and Writes IO Devices. Higher than typical latency for highly loaded storage indicates saturation (overload) and is frequent cause of performance problems. 
Higher than normal latency also can indicate internal storage problems.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 11 }, "id": 59, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "irate(node_disk_read_time_seconds_total[5m]) / irate(node_disk_reads_completed_total[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Read: {{ instance }} - {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Latency", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": "", "logBase": 2, "max": null, "min": null, "show": true }, { "format": "s", "label": "", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows amount of physical IOs (reads and writes) different devices are serving. 
Spikes in number of IOs served often corresponds to performance problems due to IO subsystem overload.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 18 }, "id": 63, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "irate(node_disk_reads_completed_total[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Read: {{ instance }} - {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Operations", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "iops", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows volume of reads and writes the storage is handling. This can be better measure of IO capacity usage for network attached and SSD storage as it is often bandwidth limited. 
Amount of data being written to the disk can be used to estimate Flash storage life time.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 25 }, "id": 65, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "irate(node_disk_read_bytes_total[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Read: {{ instance }} - {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Bandwidth", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows how much disk was loaded for reads or writes as average number of outstanding requests at different period of time. High disk load is a good measure of actual storage utilization. 
Different storage types handle load differently - some will show latency increases on low loads others can handle higher load with no problems.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 32 }, "id": 67, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "irate(node_disk_read_time_seconds_total[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Read: {{instance}} - {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Load", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Disk", "type": "row" } ], "refresh": false, "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-6h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "Test-Cluster-Performance-Read", "uid": "4aVOvxcWk", "version": 1 } 
================================================ FILE: scripts/performance_write.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": null, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 33, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.5 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB query durations by histogram buckets with different percents", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 1 }, "id": 3, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 3, "legendFormat": "99", "refId": "B", "step": 15 }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80", "refId": "D" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.5 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Get Token Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Duration (us) for getting token, it should be small until concurrency limit is reached.", 
"fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 1 }, "id": 31, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_server_get_token_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99", "refId": "A" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 2 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Get Token Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 500 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Connection Count alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB current connection counts", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 8 }, "id": 1, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": 
true, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": true, "steppedLine": true, "targets": [ { "expr": "tidb_server_connections", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 }, { "expr": "sum(tidb_server_connections)", "format": "time_series", "intervalFactor": 2, "legendFormat": "total", "refId": "B", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 500 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Connection Count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1000000000 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" }, { "evaluator": { "params": [ 3000000000 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Heap Memory Usage alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB heap memory size in use 
", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 8 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "go_memstats_heap_inuse_bytes{job=~\"tidb.*\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{job}}", "metric": "go_memstats_heap_inuse_bytes", "refId": "A", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1000000000 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Heap Memory Usage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "TiDB-Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 34, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.01 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Parse Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, 
"description": "The time cost of parsing SQL to AST", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 2 }, "id": 4, "interval": "", "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_parse_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "A", "step": 30 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.01 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Parse Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Parse", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 35, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.03 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Compile 
Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of building the query plan", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 3 }, "id": 5, "interval": "", "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_compile_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "A", "step": 30 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.03 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Compile Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Compile", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 36, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", 
"description": "Bucketed histogram of transaction execution durations, including retry", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 4 }, "id": 30, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{sql_type}}", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95-{{sql_type}}", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{sql_type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 500 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": 
"60s", "handler": 1, "name": "Transaction Statement Num alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB statements numbers within one transaction. Internal means TiDB inner transaction", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 4 }, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_transaction_statement_num_bucket[30s])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{sql_type}}", "refId": "A" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_session_transaction_statement_num_bucket[30s])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{sql_type}}", "refId": "B" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 500 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Statement Num", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, 
"reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Transaction Retry Num alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB transaction retry histogram bucket statistics", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 4 }, "id": 7, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1.0, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "100", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 3 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Retry Num", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 
1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Transaction", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 37, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv command durations statistics by command type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 5 }, "id": 10, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket{type=~\"commit\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Cmd Duration 999", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv command durations statistics by command type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, 
"y": 5 }, "id": 11, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket{type=~\"commit\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Cmd Duration 99", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 500 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Lock Resolve OPS alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "lock resolve times", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 12 }, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, 
"min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_lock_resolver_actions_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tidb_tikvclient_lock_resolver_actions_total", "refId": "A", "step": 40 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 500 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Lock Resolve OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv command durations statistics by command type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 12 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tidb_tikvclient_backoff_seconds_bucket[5m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% KV Backoff Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 500 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "KV Backoff OPS alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage backoff times", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 19 }, "id": 9, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": true, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_backoff_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, 
"op": "gt", "value": 500 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Backoff OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "KV", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 38, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration of a client calling GetTSAsync until received the TS result.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 6 }, "id": 12, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" 
} ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO Wait Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0.03 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "PD TSO RPC Duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration of a client sending TSO request until received the response.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 6 }, "id": 13, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.03 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO RPC Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "PD Client", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 39, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The execution time of gRPC message", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }, "id": 29, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_grpc_msg_duration_seconds_bucket{type=~\"kv_prewrite|kv_commit\"}[5m])) by (le, type))", "format": "time_series", "interval": "", 
"intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% gRPC message duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "gRPC poll CPU alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of gRPC", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }, "id": 14, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "topk(5, sum(rate(tikv_thread_cpu_seconds_total{name=~\"grpc.*\"}[1m])) by (instance, name))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ instance }} - {{ name }}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 3.6 } ],
"timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "gRPC", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 40, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.02 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Scheduler latch wait duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by latch wait", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 8 }, "id": 15, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "scan_lock", "value": "scan_lock" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[5m])) by (le,
type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99% - {{ type }}", "metric": "", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket[5m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95% - {{ type }}", "metric": "", "refId": "B", "step": 10 }, { "expr": "rate(tikv_scheduler_latch_wait_duration_seconds_sum[5m]) / rate(tikv_scheduler_latch_wait_duration_seconds_count[5m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg - {{ type }}", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.02, "yaxis": "left" } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler latch wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Scheduler worker CPU alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of scheduler worker", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 8 }, "id": 16, "legend": { "alignAsTable": true, 
"avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{name=~\"sched_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 3.6 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 15 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": 
"flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% scheduler command duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Scheduler", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, "id": 43, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.05 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Propose wait duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The wait time of each proposal", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 19, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, 
"linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ instance }}", "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.05 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% propose wait duration by instance", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1.7 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 20 }, "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "message": "TiKV raftstore thread CPU usage is high", "name": "TiKV raft store CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, 
"description": "The CPU utilization of raftstore thread", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 21, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{name=~\"raftstore_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1.7 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft store CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by processing asynchronous write requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "id": 65, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, 
"sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type=\"write\"}[5m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{type=\"write\"}[5m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{type=\"write\"}[5m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{type=\"write\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage async write duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "raftstore", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, "id": 44, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0.05 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": 
"query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Append log duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft appends log on each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 10 }, "id": 32, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ instance }}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.05 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% append log duration by instance", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing write operation", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, 
"y": 10 }, "id": 63, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_write_micro_seconds{db=\"raft\", type=\"write_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "max(tikv_engine_write_micro_seconds{db=\"raft\", type=\"write_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "max(tikv_engine_write_micro_seconds{db=\"raft\", type=\"write_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "max(tikv_engine_write_micro_seconds{db=\"raft\", type=\"write_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "RocksDB-Raft", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, "id": 45, "panels": [ { "alert": { "conditions": [ { 
"evaluator": { "params": [ 0.1 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "for": "5m", "frequency": "1m", "handler": 1, "name": "Apply wait duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time that apply log requests wait before being handled", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 11 }, "id": 49, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket[5m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ instance }}", "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.1 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% apply wait duration by instance", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ],
"yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0.05 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Apply log duration alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft applies log", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 11 }, "id": 22, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_raftstore_apply_log_duration_seconds_sum[1m])) / sum(rate(tikv_raftstore_apply_log_duration_seconds_count[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.05 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply log 
duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing write operation", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 18 }, "id": 51, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_write_micro_seconds{db=\"kv\", type=\"write_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "max(tikv_engine_write_micro_seconds{db=\"kv\", type=\"write_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "max(tikv_engine_write_micro_seconds{db=\"kv\", type=\"write_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "max(tikv_engine_write_micro_seconds{db=\"kv\", type=\"write_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], 
"timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1.8 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "for": "5m", "frequency": "1m", "handler": 1, "name": "Async apply CPU alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of async apply", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 18 }, "id": 47, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{name=~\"apply_[0-9]+\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1.8 } ], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "Async apply CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "RocksDB-KV", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, "id": 53, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows the average latency of reads and writes on IO devices. Higher than typical latency for highly loaded storage indicates saturation (overload) and is a frequent cause of performance problems. Higher than normal latency can also indicate internal storage problems.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 12 }, "id": 55, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "irate(node_disk_write_time_seconds_total[5m]) / irate(node_disk_writes_completed_total[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Write: {{ instance }} - {{ device }}", "metric": "", "refId": "A", "step": 300, "target": ""
} ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Latency", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": "", "logBase": 2, "max": null, "min": null, "show": true }, { "format": "s", "label": "", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows amount of physical IOs (reads and writes) different devices are serving. Spikes in number of IOs served often corresponds to performance problems due to IO subsystem overload.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 19 }, "id": 57, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "irate(node_disk_writes_completed_total[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Write: {{ instance }} - {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Operations", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": 
"time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "iops", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows volume of reads and writes the storage is handling. This can be better measure of IO capacity usage for network attached and SSD storage as it is often bandwidth limited. Amount of data being written to the disk can be used to estimate Flash storage life time.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 26 }, "id": 59, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "irate(node_disk_written_bytes_total[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Write: {{ instance }} - {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Bandwidth", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", 
"label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Shows how much disk was loaded for reads or writes as average number of outstanding requests at different period of time. High disk load is a good measure of actual storage utilization. Different storage types handle load differently - some will show latency increases on low loads others can handle higher load with no problems.", "editable": true, "error": false, "fill": 2, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 33 }, "id": 61, "legend": { "alignAsTable": true, "avg": true, "current": false, "hideEmpty": false, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 1, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "calculatedInterval": "2m", "datasourceErrors": {}, "errors": {}, "expr": "irate(node_disk_write_time_seconds_total[5m])", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "Write: {{instance}} - {{ device }}", "metric": "", "refId": "A", "step": 300, "target": "" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Load", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": 0, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Disk", "type": 
"row" } ], "refresh": "5s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "Test-Cluster-Performance-Write", "uid": "Fcw5wqcWk", "version": 1 } ================================================ FILE: scripts/reparo.json ================================================ { "__inputs": [ { "name": "DS_TIDB-CLUSTER", "label": "tidb-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "4.6.3" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "refresh": false, "rows": [ { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 1, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9, sum(rate(reparo_txn_duration_time_bucket[1m])) by (le,instance))", "format": "time_series", "intervalFactor": 
2, "legendFormat": "", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Txn Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 2, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.90, sum(rate(reparo_wait_dml_executed_bucket[1m])) by (le,instance))", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Wait DML Executed Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { 
"aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 3, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9, sum(rate(reparo_wait_ddl_executed_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Wait DDL Executed Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 4, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9, sum(rate(reparo_add_job_latency_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": [], "timeFrom": null, 
"timeShift": null, "title": "Add Job Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TIDB-CLUSTER}", "fill": 1, "id": 5, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9, sum(rate(reparo_resolve_causality_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Resolve Causality Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": 
"${DS_TIDB-CLUSTER}", "fill": 1, "id": 6, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 12, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(reparo_execute_total[1m])", "format": "time_series", "intervalFactor": 2, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "QPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" } ], "schemaVersion": 14, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "2018-05-17T01:32:53.954Z", "to": "2018-05-17T02:32:53.954Z" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "TiDB-Reparo", "version": 22 } ================================================ FILE: scripts/syncer.json ================================================ { "__inputs": [ { "name": "DS_HOTEL-BIZREPORT", "label": "hotel-bizreport", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "4.6.3" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": 
"prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "-- Grafana --", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "hideControls": false, "id": null, "links": [], "refresh": "5s", "rows": [ { "collapse": false, "height": "250px", "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 1, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": false, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(syncer_binlog_event_count[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} - {{type}}", "metric": "syncer_binlog_events_total", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "binlog events", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 9, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": false, "sort": "current", "sortDesc": true, "total": false, "values": true 
}, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.8, sum(rate(syncer_binlog_event_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} - {{type}}", "metric": "syncer_binlog_events_total", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "binlog event transform", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(syncer_txn_cost_in_second_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}}", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeShift": null, 
"title": "transaction latency", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1000 ], "type": "lt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "transaction tps alert", "noDataState": "no_data", "notifications": [ { "id": 1 } ] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 7, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(syncer_txn_cost_in_second_count[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}}", "refId": "A", "step": 20 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "lt", "value": 1000 } ], "timeFrom": null, "timeShift": null, "title": "transaction tps", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": 
true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "alert": { "conditions": [ { "evaluator": { "params": [ 2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": " syncer_binlog_file{node=\"master\"} - ON(instance, job) syncer_binlog_file{node=\"syncer\"} ", "intervalFactor": 10, "legendFormat": "{{job}}", "refId": "A", "step": 50 }, "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "syncer_binlog_file alert", "noDataState": "no_data", "notifications": [ { "id": 1 } ] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 6, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": " syncer_binlog_file{node=\"master\"} - ON(instance, job) syncer_binlog_file{node=\"syncer\"} ", "format": "time_series", "intervalFactor": 10, "legendFormat": "{{job}}", "refId": "A", "step": 100 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 2 } ], "timeFrom": null, "timeShift": null, "title": "binlog file gap", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": 
null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 2, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "sort": "current", "sortDesc": false, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "syncer_binlog_pos{node=\"syncer\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{node}}", "metric": "", "refId": "A", "step": 30 }, { "expr": "syncer_binlog_pos{node=\"master\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{node}}", "refId": "B", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "binlog pos", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "repeatIteration": null, "repeatRowId": null, "showTitle": false, "title": "Dashboard Row", "titleSize": "h6" }, { "collapse": false, "height": 250, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 4, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], 
"spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "syncer_binlog_file{node=\"master\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{node}}", "refId": "A", "step": 30 }, { "expr": "syncer_binlog_file{node=\"syncer\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{node}}", "refId": "B", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "syncer_binlog_file", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 3, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(syncer_binlog_skipped_events_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{job}} {{type}}", "metric": "syncer_binlog_skipped_events_total", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "binlog skipped events", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": 
"short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 10, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(syncer_add_jobs_total[1m])) by (queueNo)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", "metric": "syncer_binlog_skipped_events_total", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "execution jobs", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_HOTEL-BIZREPORT}", "fill": 1, "id": 11, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": false, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "span": 6, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(syncer_add_jobs_total[1m]) - rate(syncer_finished_jobs_total[1m])) by (queueNo)", "format": "time_series", "intervalFactor": 2, "legendFormat": "", 
#!/usr/bin/env python
# !coding:utf-8
"""Show how the regions of one TiDB table are distributed by size and by keys.

The script asks tidb-server for the record regions of a table, asks PD for
the statistics of every region, and prints a histogram of the table's regions
for both ``approximate_keys`` and ``approximate_size``.  With ``-d`` it also
saves a scatter plot per statistic (requires matplotlib).
"""

import argparse
import subprocess
import json
import os
from enum import Enum


class Resource(Enum):
    """Per-region statistic that the histogram is built on."""
    KEY = 1   # read from the region's "approximate_keys" field
    SIZE = 2  # read from the region's "approximate_size" field


# Lower boundaries of the default (no --group) histogram buckets.  The largest
# observed value + 1 is appended as the final upper bound.
DEFAULT_SIZE_STEPS = [0, 2, 20, 96]
DEFAULT_KEY_STEPS = [0, 20000, 200000, 960000]


def count(table_region_set, all_regions, resource, group, to_draw):
    """Print (and optionally plot) the distribution of one statistic.

    :param table_region_set: ids of the regions that belong to the table.
    :param all_regions: decoded PD ``/pd/api/v1/regions`` payload; only its
        ``"regions"`` list is read.
    :param resource: ``Resource`` member selecting keys or size.
    :param group: number of equal-width buckets; 0 selects the default buckets.
    :param to_draw: when true, also call ``draw`` for a scatter plot.
    """
    resource_key = get_resource_key(resource)
    table_regions = [
        # NOTE(review): some PD versions appear to omit zero-valued
        # statistics from the JSON; a missing field is counted as 0 instead
        # of aborting with KeyError -- confirm against the PD version in use.
        (region["id"], int(region.get(resource_key, 0)))
        for region in (all_regions.get("regions") or [])
        if region["id"] in table_region_set
    ]
    if not table_regions:
        # Without this guard table_regions[-1] below raises IndexError.
        print("No region of the table found in PD, skip {}".format(resource_key))
        return

    # The plot uses region-id order on the x axis.
    table_regions.sort(key=lambda region: region[0])
    if to_draw:
        try:
            draw(table_regions, resource)
        except ImportError:
            print("need to install matplotlib")
        except Exception as err:
            # Drawing stays best-effort, but the real cause is reported
            # instead of blaming a missing matplotlib for every failure.
            print("failed to draw {}: {}".format(resource_key, err))

    # output() walks the buckets once, so it needs the values ascending.
    table_regions.sort(key=lambda region: region[1])
    steps = generate_steps(resource, group=group, max_value=table_regions[-1][1] + 1)
    output(table_regions, steps, resource)


def main():
    """Fetch region information from TiDB and PD and print both histograms."""
    args = parse_args()
    region_info = get_json("http://{}:{}/tables/{}/{}/regions".format(
        args.host, args.port, args.database, args.table))
    # scripts/table-regions.py treats this same endpoint's payload as a list
    # for partitioned tables and a single object otherwise; accept both.
    if isinstance(region_info, list):
        partitions = region_info
    else:
        partitions = [region_info]
    table_region_set = set()
    for partition in partitions:
        for region in partition.get("record_regions") or []:
            table_region_set.add(region["region_id"])

    all_regions = get_json("http://{}:{}/pd/api/v1/regions".format(args.pd_host, args.pd_port))
    count(table_region_set, all_regions, Resource.KEY, args.group, args.draw)
    count(table_region_set, all_regions, Resource.SIZE, args.group, args.draw)


def generate_steps(resource, group, max_value):
    """Return the ascending bucket boundaries of the histogram.

    :param resource: ``Resource`` member; selects the default boundaries.
    :param group: number of equal-width buckets, or 0 for the defaults.
    :param max_value: exclusive upper bound of the last bucket.
    :return: list of ints; bucket ``i`` is ``[steps[i], steps[i + 1])``.
    """
    if group:
        return [i * max_value // group for i in range(group + 1)]
    if resource == Resource.SIZE:
        defaults = DEFAULT_SIZE_STEPS
    else:
        defaults = DEFAULT_KEY_STEPS
    # Drop default boundaries at or above max_value so the list stays
    # ascending, e.g. [0, 2, 20, 50] rather than [0, 2, 20, 96, 50].
    return [step for step in defaults if step < max_value] + [max_value]


def format_steps(steps):
    """Render boundaries as strings, abbreviating values >= 1000 as ``<n>k``."""
    result = []
    for step in steps:
        if step >= 1000:
            result.append("{}k".format(step // 1000))
        else:
            result.append("{}".format(step))
    return result


def get_resource_key(resource):
    """Return the region JSON field name that holds ``resource``."""
    if resource == Resource.KEY:
        return 'approximate_keys'
    return 'approximate_size'


def parse_args():
    """Parse the command line and return the argparse namespace."""
    parser = argparse.ArgumentParser(description="Show region size and keys distribution of a TiDB table.")
    parser.add_argument("--host", dest="host", help="tidb-server address, default: 127.0.0.1", default="127.0.0.1")
    parser.add_argument("-d", dest="draw", help="whether to draw pictures, default: False", default=False,
                        action='store_true')
    parser.add_argument("--port", dest="port", help="tidb-server status port, default: 10080", default="10080")
    parser.add_argument("--pd_host", dest="pd_host", help="pd-server address, default: 127.0.0.1",
                        default="127.0.0.1")
    parser.add_argument("--pd_port", dest="pd_port", help="pd-server status port, default: 2379", default="2379")
    parser.add_argument("--group", dest="group", help="the result group num, default: 0 (split by default mode)",
                        type=int, required=False, default=0)
    parser.add_argument("database", help="database name")
    parser.add_argument("table", help="table name")
    return parser.parse_args()


def draw(table_regions, resource):
    """Scatter-plot the statistic of each region, save it as PNG and show it.

    :param table_regions: list of ``(region_id, value)`` tuples; the x axis is
        the position of the region in this list.
    :param resource: ``Resource`` member; names the y axis and the file
        ``result_<field>.png`` written to the current directory.
    :raises ImportError: when matplotlib is not installed.
    """
    # Imported lazily so the textual report works without matplotlib.
    import matplotlib.pyplot as plt
    label = get_resource_key(resource)
    ax = plt.gca()
    ax.set_xlabel('region_order')
    ax.set_ylabel(label)
    y_list = [value for _, value in table_regions]
    x_list = list(range(len(y_list)))
    plt.scatter(x_list, y_list, color="r", alpha=0.5, s=5)
    plt.savefig(os.path.join("result_{}.png".format(label)))
    plt.show()


def get_json(url):
    """Fetch ``url`` with curl and return the decoded JSON document.

    :raises subprocess.CalledProcessError: when curl exits non-zero.
    :raises ValueError: when the response body is not valid JSON.
    """
    # NOTE(review): "-l" is curl's --list-only flag; "-L" (follow redirects)
    # may have been intended.  Left unchanged pending confirmation.
    web_content = subprocess.check_output(["curl", "-sl", url])
    # check_output returns bytes on Python 3; decode once and parse once.
    return json.loads(web_content.decode("utf-8"))


def output(table_regions, steps, resource):
    """Print how many regions fall into each ``[steps[i], steps[i + 1])`` range.

    :param table_regions: ``(region_id, value)`` tuples sorted by ascending
        value.
    :param steps: ascending bucket boundaries as built by ``generate_steps``.
    :param resource: ``Resource`` member; only used for the header line.
    """
    counts = [0] * max(len(steps) - 1, 0)
    if counts:
        bucket = 0
        last_bucket = len(counts) - 1
        for _, value in table_regions:
            # Skip every boundary the value has passed, then count the region
            # in the bucket it landed in.  Values at or above the final
            # boundary are kept in the last bucket.
            while bucket < last_bucket and value >= steps[bucket + 1]:
                bucket += 1
            counts[bucket] += 1

    output_steps = format_steps(steps)
    print("Region {}\t\t\tRegion num".format(get_resource_key(resource)).replace("approximate_", ""))
    for i, region_num in enumerate(counts):
        output_range = "{} ~ {}".format(output_steps[i], output_steps[i + 1]).ljust(16)
        print("{}\t{}".format(output_range, region_num))
    print("")


if __name__ == "__main__":
    main()
table_region_peers = {} # name -> region leader indices_region_leaders = {} # name -> region peers indices_region_peers = {} for region_info in region_infos: table_region_leaders = merge(table_region_leaders, parse_regions(region_info["record_regions"])) table_region_peers = merge_peers(table_region_peers, parse_region_peers(region_info["record_regions"])) if not args.hide_indices: for index_info in region_info["indices"]: index_name = index_info["name"] index_region_leaders = parse_regions(index_info["regions"]) indices_region_leaders[index_name] = merge(index_region_leaders, indices_region_leaders.get(index_name, {})) index_region_peers = parse_region_peers(index_info["regions"]) indices_region_peers[index_name] = merge_peers(index_region_peers, indices_region_peers.get(index_name, {})) # print record print("[RECORD - {}.{}] - Leaders Distribution:".format(args.database, args.table)) print_leaders(table_region_leaders) print("[RECORD - {}.{}] - Peers Distribution:".format(args.database, args.table)) print_peers(table_region_peers) # print indices if not args.hide_indices: print("") for index_name, index_region_info in indices_region_leaders.items(): print("[INDEX - {}] - Leaders Distribution:".format(index_name)) print_leaders(index_region_info) print("") for index_name, index_region_info in indices_region_peers.items(): print("[INDEX - {}] - Peers Distribution:".format(index_name)) print_peers(index_region_info) def parse_args(): parser = argparse.ArgumentParser(description="Show leader distribution of a TiDB table.") parser.add_argument("--host", dest="host", help="tidb-server address, default: 127.0.0.1", default="127.0.0.1") parser.add_argument("--port", dest="port", help="tidb-server status port, default: 10080", default="10080") parser.add_argument("database", help="database name") parser.add_argument("table", help="table name") parser.add_argument("--hide-indices", dest="hide_indices", help="whether collect distribution of indices regions", 
action='store_true', default=False) args = parser.parse_args() return args def merge(dist1, dist2): for k in dist2: dist1[k] = dist2[k] + dist1.get(k, 0) return dist1 def parse_regions(regions): info = {} for region in regions: if region["leader"]["store_id"] != None: store_id = region["leader"]["store_id"] info[store_id] = 1 + info.get(store_id, 0) return info class StoreRegionPeers: def __init__(self): # num of regions in normal role (Leader or Follower) self.num_normal = 0 # num of region in Learner role self.num_learners = 0 def add(self, peer): if peer.get("is_learner", False) == True: self.num_learners += 1 else: self.num_normal += 1 def merge(self, rhs): self.num_normal += rhs.num_normal self.num_learners += rhs.num_learners return self def num(self): return self.num_normal + self.num_learners def __str__(self): return str({"normal": self.num_normal, "learner": self.num_learners}) def __repr__(self): return str(self) def merge_peers(dist1, dist2): for k in dist2: if k in dist1: dist1[k] = dist2[k].merge(dist1[k]) else: dist1[k] = dist2[k] return dist1 def parse_region_peers(regions): info = {} for region in regions: for peer in region["peers"]: if peer["store_id"] != None: store_id = peer["store_id"] if store_id not in info: info[store_id] = StoreRegionPeers() info[store_id].add(peer) return info def print_leaders(info, indent = " "): total_leaders = 0 for store_id, num_leaders in info.items(): total_leaders += num_leaders print("{}total leader count: {}".format(indent, total_leaders)) for store_id, num_leaders in info.items(): print("{}store: {:6d}, num_leaders: {:6d}, percentage: {:.2f}%".format(indent, store_id, num_leaders,(num_leaders*100.0)/total_leaders)) def print_peers(info, indent = " "): total_peers = 0 for store_id, peers in info.items(): total_peers += peers.num() print("{}total peers count: {}".format(indent, total_peers)) for store_id, peers in info.items(): num_peers = peers.num() print("{}store: {:6d}, num_peers(num_learners): {:6d}({:6d}), 
percentage: {:.2f}%".format(indent, store_id, num_peers, peers.num_learners, (num_peers*100.0)/total_peers)) if __name__ == "__main__": main() ================================================ FILE: scripts/tidb.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 138, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB query durations by histogram buckets with different percents", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 1 }, "id": 80, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A" }, { "expr": 
"histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB query processing numbers per second", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 1 }, "id": 42, "legend": { "alignAsTable": false, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": false, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": false, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_server_query_total[1m])) by (result)", "format": "time_series", "instant": false, 
"intervalFactor": 2, "legendFormat": "query {{result}}", "refId": "A", "step": 60 }, { "expr": "sum(rate(tidb_server_query_total{result=\"OK\"}[1m] offset 1d))", "format": "time_series", "hide": true, "instant": false, "intervalFactor": 2, "legendFormat": "yesterday", "refId": "B", "step": 90 }, { "expr": "sum(tidb_server_connections) * sum(rate(tidb_server_handle_query_duration_seconds_count[1m])) / sum(rate(tidb_server_handle_query_duration_seconds_sum[1m]))", "format": "time_series", "hide": true, "instant": false, "intervalFactor": 2, "legendFormat": "ideal QPS", "refId": "C", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "TiDB statement statistics by statement type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 7 }, "id": 21, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_executor_statement_total[1m])) by (type)", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Statement OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 2, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB query total statistics including both successful and failed ones", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 7 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tidb_server_query_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} {{type}} {{result}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS By Instance", "tooltip": { "msResolution": true, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": 
"short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "TiDB failed query statistics by query type", "fill": 0, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 13 }, "id": 137, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_server_execute_error_total[1m])) by (type, instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": " {{type}}-{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Failed Query OPM", "tooltip": { "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 2, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB slow query statistics with slow query durations and coprocessor waiting/executing durations", "fill": 1, "gridPos": { "h": 6, "w": 24, "x": 0, "y": 19 }, "id": 112, "legend": { "avg": false, "current": false, "max": 
false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.90, sum(rate(tidb_server_slow_query_process_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all_proc", "refId": "A" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_server_slow_query_cop_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all_cop_proc", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_server_slow_query_wait_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all_cop_wait", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Slow query", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB durations for different query types with 99.9 percent buckets", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 25 }, "id": 136, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le,sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "999 Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB durations for different query types with 99 percent buckets", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 25 }, "id": 134, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le,sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99 Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, 
"max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB durations for different query types with 95 percent buckets", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 31 }, "id": 132, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le,sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "95 Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB durations for different query types with 80 percent buckets", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 31 }, "id": 130, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.80, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le,sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "80 Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Query Summary", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 139, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB durations with 80 percent buckets by instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }, "id": 23, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.80, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B", "step": 60 } ], 
"thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration 80 By Instance", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": "0.001", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB durations with 95 percent buckets by instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }, "id": 1, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ instance }}", "refId": "B", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration 95 By Instance", "tooltip": { "msResolution": true, "shared": false, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [ "max" ] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": "0.001", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": 
null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB durations with 99 percent buckets by instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 45 }, "id": 25, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration 99 By Instance", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": "0.001", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB durations with 99.9 percent buckets by instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 45 }, "id": 81, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, 
"total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_server_handle_query_duration_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration 999 By Instance", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": "0.001", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB failed query statistics with failing information ", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 52 }, "id": 94, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "increase(tidb_server_execute_error_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}} @ {{instance}}", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [],
"timeShift": null, "title": "Failed Query OPM Detail", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 2, "max": null, "min": "0.001", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The internal SQL is used by TiDB itself.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 52 }, "id": 68, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_session_restricted_sql_total[30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Internal SQL OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Query Detail", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 140, 
"panels": [ { "aliasColors": {}, "bars": false, "cacheTimeout": null, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB uptime since last restart", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 3 }, "id": 184, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pluginVersion": "6.1.6", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(time() - process_start_time_seconds{job=\"tidb\"})", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Uptime", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dtdurations", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB process rss memory usage. 
TiDB heap memory size in use ", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 3 }, "id": 3, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "process_resident_memory_bytes{job=\"tidb\"}", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "process-{{instance}}", "refId": "A" }, { "expr": "go_memstats_heap_alloc_bytes{job=\"tidb\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "heap-{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory Usage", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB cpu usage calculated with process cpu running seconds", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 10 }, "id": 168, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], 
"nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(process_cpu_seconds_total{job=\"tidb\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU Usage", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB current connection counts", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 10 }, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "tidb_server_connections", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 }, { "expr": "sum(tidb_server_connections)", "format": "time_series", "intervalFactor": 2, "legendFormat": "total", "refId": "B", "step": 40 } ], 
"thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Connection Count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB process opened file descriptors count", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 17 }, "id": 188, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "process_open_fds{job=\"tidb\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Open FD Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB process current 
goroutines count", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 17 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": " go_goroutines{job=~\"tidb.*\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Goroutine Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB process Go garbage collection time cost", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 24 }, "id": 183, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(go_gc_duration_seconds_sum{job=\"tidb\"}[1m])", "format": "time_series", "hide": false, 
"intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Go GC Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dtdurations", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Total threads TiDB process created currently", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 24 }, "id": 186, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": " go_threads{job=\"tidb\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Go Threads", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "description": "The Go garbage collection counts per second", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }, "id": 185, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": " rate(go_gc_duration_seconds_count{job=\"tidb\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Go GC Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The fraction of this program's available CPU time used by the GC since the program started.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }, "id": 187, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": 
false, "steppedLine": false, "targets": [ { "expr": "go_memstats_gc_cpu_fraction{job=\"tidb\"}", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Go GC CPU Usage", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB Server critical events total, including start/close/shutdown/hang etc", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }, "id": 49, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "increase(tidb_server_event_total[10m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-server {{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Events OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": 
"short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB instance monitor average keep alive times", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }, "id": 82, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_monitor_keep_alive_total[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Keep Alive OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB instance prepare statements count", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 45 }, "id": 165, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tidb_server_prepared_stmts", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 }, { "expr": "sum(tidb_server_prepared_stmts)", "format": "time_series", "intervalFactor": 2, "legendFormat": "total", "refId": "B", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Prepare Statement Count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB monitor time jump back count", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 45 }, "id": 166, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_monitor_time_jump_back_total[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Time Jump Back OPS", "tooltip": { "shared": 
true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB instance critical errors count including panic etc", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 52 }, "id": 54, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "increase(tidb_server_panic_total[1m])", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "panic-{{instance}}", "refId": "A" }, { "expr": "increase(tidb_server_critical_error_total[1m])", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "critical-{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write Binlog Error", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "description": "TiDB instance critical errors count including panic etc", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 52 }, "id": 191, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tidb_server_critical_error_total", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Skip Binlog Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Duration (us) for getting token, it should be small until concurrency limit is reached.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 59 }, "id": 111, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tidb_server_get_token_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Get Token Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB processing handshake error count", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 59 }, "id": 167, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_server_handshake_error_total[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Handshake Error OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, 
"alignLevel": null } } ], "repeat": null, "title": "Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 141, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB transaction processing counts by type and source. Internal means TiDB inner transcation calls", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 4 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_session_transaction_duration_seconds_count[1m])) by (type, sql_type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{sql_type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Bucketed histogram of transaction execution durations, including retry", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 4 }, "id": 72, "legend": { "alignAsTable": true, "avg": false, "current": 
false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{sql_type}}", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95-{{sql_type}}", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_session_transaction_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{sql_type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB statements numbers within one transaction. 
Internal means TiDB inner transaction", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 4 }, "id": 74, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_transaction_statement_num_bucket[30s])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{sql_type}}", "refId": "A" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_session_transaction_statement_num_bucket[30s])) by (le, sql_type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{sql_type}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Statement Num", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB transaction retry histogram bucket statistics", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 11 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null 
as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1.0, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "100", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Retry Num", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Error numbers of transaction retry", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 11 }, "id": 36, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_session_retry_error_total[30s])) by (type, sql_type)", 
"format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{sql_type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Session Retry Error OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB transaction latch wait time on key value storage", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 11 }, "id": 175, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_local_latch_wait_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_tikvclient_local_latch_wait_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_tikvclient_local_latch_wait_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Local 
Latch Wait Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB total kv transaction counts", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 18 }, "id": 4, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_count[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Transaction OPS", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The 
duration of the transaction commit/rollback on TiKV.", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 18 }, "id": 193, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{type}}", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95-{{type}}", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Transaction Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "regions transaction operates on count", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 18 }, "id": 44, "legend": { "alignAsTable": true, 
"avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.90, sum(rate(tidb_tikvclient_txn_regions_num_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Regions Num 90", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv write times per transaction execution", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 25 }, "id": 33, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_tikvclient_txn_write_kv_num_bucket[1m])) by (le, instance))", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "100 {{instance}}", "refId": "B", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Max Write KV Num", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv write size per transaction execution", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 25 }, "id": 34, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_tikvclient_txn_write_size_bytes_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Transaction Max Write Size Bytes", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 2, "max": null, 
"min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "safe point loading times", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 25 }, "id": 83, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_load_safepoint_total{type=\"ok\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load Safepoint OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "When the pessimistic transaction begins to work, it will send heartbeat requests to update its TTL. 
\nThis metric is the latency of the send heartbeat operation.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 32 }, "id": 194, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.80, sum(rate(tidb_tikvclient_txn_heart_beat_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{type}}", "refId": "B", "step": 40 }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_tikvclient_txn_heart_beat_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95-{{type}}", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_txn_heart_beat_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99-{{type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Send HeartBeat Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "This metric means the pessimistic transaction lives too long, which is abnormal.", "editable": true,
"error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 32 }, "id": 195, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "", "format": "time_series", "intervalFactor": 1, "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TTL Lifetime Reach Counter", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Transaction", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 142, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of parsing SQL to AST", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 82 }, "id": 156, "interval": "", "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_parse_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Parse Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of building the query plan", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 82 }, "id": 154, "interval": "", "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_compile_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, "title": "Compile Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of executing the SQL which does not include the time to get the results of the query.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 90 }, "id": 169, "interval": "", "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_execute_duration_seconds_bucket[1m])) by (le, sql_type))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{sql_type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Execution Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show":
true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "TiDB executors using more cpu and memory resources", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 90 }, "id": 76, "legend": { "alignAsTable": true, "avg": true, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_executor_expensive_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Expensive Executors OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "TiDB plan cache hit total", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 98 }, "id": 91, "legend": { "alignAsTable": true, "avg": true, 
"current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_server_plan_cache_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Queries Using Plan Cache OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 2, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Executor", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 143, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "durations of distsql execution by type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 122 }, "id": 12, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [ { "type": "dashboard" } ], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": 
false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_distsql_handle_query_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999-{{type}}", "refId": "D" }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_distsql_handle_query_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "99-{{type}}", "metric": "tidb_distsql_handle_query_duration_seconds_bucket", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_distsql_handle_query_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90-{{type}}", "refId": "B" }, { "expr": "histogram_quantile(0.50, sum(rate(tidb_distsql_handle_query_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "50-{{type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Distsql Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": "0.0005", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "distsql query handling durations per second", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 122 }, "id": 14, "legend": { "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], 
"nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_distsql_handle_query_duration_seconds_count[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", "metric": "tidb_distsql_query_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Distsql QPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "the numebr of distsql partial scan numbers", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 129 }, "id": 60, "legend": { "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": false, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_distsql_scan_keys_partial_num_count[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "", "metric": "tidb_distsql_query_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Distsql Partial QPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, 
"value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": 0, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "the numebr of distsql scan numbers", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 129 }, "id": 57, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_distsql_scan_keys_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "100", "refId": "A" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_distsql_scan_keys_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "B" }, { "expr": "histogram_quantile(0.50, sum(rate(tidb_distsql_scan_keys_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "50", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scan Keys Num", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { 
"align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "the numebr of distsql partial scan key numbers", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 129 }, "id": 58, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_distsql_scan_keys_partial_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "100", "refId": "A" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_distsql_scan_keys_partial_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_distsql_scan_keys_partial_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "50", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scan Keys Partial Num", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "distsql partial numbers per query", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 136 }, "id": 59, "legend": { "avg": false, 
"current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_distsql_partial_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "100", "refId": "A" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_distsql_partial_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "B" }, { "expr": "histogram_quantile(0.50, sum(rate(tidb_distsql_partial_num_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "50", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Partial Num", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage coprocessor processing durations", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 136 }, "id": 41, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, 
"renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_tikvclient_cop_duration_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor Seconds 999", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Distsql", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 144, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv backoff time durations by type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }, "id": 6, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": true, "min": false, "rightSide": false, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_tikvclient_backoff_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 40 }, { "expr": "histogram_quantile(0.99, 
sum(rate(tidb_tikvclient_backoff_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_tikvclient_backoff_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Backoff Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "kv region error times", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }, "id": 11, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_region_err_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tidb_server_session_execute_parse_duration_count", "refId": "A", "step": 40 }, { "expr": "sum(rate(tidb_tikvclient_region_err_total{type=\"server_is_busy\"}[1m]))", "format": "time_series", "hide": true, "intervalFactor": 2, 
"legendFormat": "sum", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TiClient Region Error OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage backoff times", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }, "id": 53, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": true, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_backoff_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Backoff OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, 
"dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "lock resolve times", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }, "id": 32, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_lock_resolver_actions_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tidb_tikvclient_lock_resolver_actions_total", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Lock Resolve OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "lock cleanup failed times and safe point update times", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 21 }, "id": 84, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], 
"nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_lock_cleanup_task_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "cleanup_secondary_failure_{{type}}", "metric": "tidb_tikvclient_lock_resolver_actions_total", "refId": "A", "step": 40 }, { "expr": "sum(rate(tidb_tikvclient_load_safepoint_total{type=\"fail\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "load_safepoint_failure", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Other Errors OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "KV Errors", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 145, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv request total by instance and command type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 8 }, "id": 172, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": 
false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_request_seconds_count[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Request OPS", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv requests durations by store", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 8 }, "id": 48, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_request_seconds_bucket{type!=\"GC\"}[1m])) by (le, store))", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Request Duration 99 by store", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": 
"time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv request durations by request type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 8 }, "id": 30, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_request_seconds_bucket{type!=\"GC\"}[1m])) by (le,type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Request Duration 99 by type", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "KV Request", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, "id": 147, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, 
"dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "pd command count by type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 10 }, "id": 20, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(pd_client_cmd_handle_cmds_duration_seconds_count{type!=\"tso\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD Client CMD OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "pd client command durations by type within 99.9 percent buckets", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 10 }, "id": 35, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type!~\"tso|tso_async_wait\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999-{{type}}", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type!~\"tso|tso_async_wait\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{type}}", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type!~\"tso|tso_async_wait\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90-{{type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD Client CMD Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "pd client command fail count by type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 10 }, "id": 43, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], 
"spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(pd_client_cmd_handle_failed_cmds_duration_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD Client CMD Fail OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration of a client calling GetTSAsync until received the TS result.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 17 }, "id": 79, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(pd_client_cmd_handle_cmds_duration_seconds_count{type=\"tso\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "cmd", "refId": "C" }, { "expr": "sum(rate(pd_client_request_handle_requests_duration_seconds_count{type=\"tso\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "request", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO OPS", "tooltip": { "msResolution": 
false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration of a client starting to wait for the TS until received the TS result.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 17 }, "id": 77, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO Wait Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, 
"mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration of a client sending TSO request until received the response.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 17 }, "id": 78, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO RPC Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": 
null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration of the waiting time for getting the start timestamp oracle", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 24 }, "id": 159, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_pdclient_ts_future_wait_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Start TSO Wait Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": 
false, "alignLevel": null } } ], "repeat": null, "title": "PD Client", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, "id": 148, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB loading schema time durations by instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 47 }, "id": 27, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_domain_load_schema_duration_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load Schema Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB loading schema times including both failed and successful ones", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 47 }, "id": 28, "legend": { 
"alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/.*failed/", "bars": true } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_domain_load_schema_total[1m])) by (instance,type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "metric": "tidb_domain_load_schema_duration_count", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load Schema OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "TiDB schema lease error counts", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }, "id": 29, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(increase(tidb_session_schema_lease_error_total[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tidb_server_", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schema Lease Error OPM", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB load privilege counts", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }, "id": 157, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/.*failed/", "bars": true } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_domain_load_privilege_total[1m])) by (instance,type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "metric": "tidb_domain_load_schema_duration_count", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Load Privilege OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": 
null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Schema Load", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, "id": 149, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB DDL duration statistics", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 12 }, "id": 9, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_ddl_handle_job_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "DDL Duration 95", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB 
batch add index durations by histogram buckets", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 12 }, "id": 63, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_ddl_batch_add_idx_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" }, { "expr": "sum(rate(tidb_ddl_add_index_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Batch Add Index Duration 100", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB ddl request in queue", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 12 }, "id": 62, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, 
"renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tidb_ddl_waiting_jobs", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "DDL Waiting Jobs Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB different ddl worker numbers", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 19 }, "id": 55, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "increase(tidb_ddl_worker_operation_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "DDL META OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 
1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB worker duration by type, action, results", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 19 }, "id": 56, "legend": { "alignAsTable": true, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(increase(tidb_ddl_worker_operation_duration_seconds_bucket[1m])) by (le, type, action, result))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{action}}-{{result}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "DDL Worker Duration 99", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB ddl schema syncer statistics, including init, start, watch, clear function call time cost", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 26 }, "id": 64, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": 
false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_ddl_deploy_syncer_duration_seconds_bucket[2m])) by (le, type, result))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{result}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Deploy Syncer Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB ddl owner time operations on etcd duration statistics ", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 26 }, "id": 65, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_ddl_owner_handle_syncer_duration_seconds_bucket[2m])) by (le, type, result))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{result}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Owner Handle Syncer Duration", 
"tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB schema syncer version update time duration", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 26 }, "id": 66, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_ddl_update_self_ver_duration_seconds_bucket[2m])) by (le, result))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{result}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Update Self Version Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "executed DDL jobs per minute", "fill": 1, "gridPos": { "h": 8, "w": 
12, "x": 0, "y": 33 }, "id": 190, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_ddl_handle_job_duration_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{ type }}", "refId": "A" }, { "expr": "sum(rate(tidb_ddl_handle_job_duration_seconds_count[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "total", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "DDL OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB DDL add index progress in percentage. 
The value is [0,100]", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 }, "id": 193, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "options": {}, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tidb_ddl_add_index_percentage_progress", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "DDL add index progress in percentage", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "DDL", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, "id": 150, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB auto analyze time durations within 95 percent histogram buckets", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 149 }, "id": 46, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"histogram_quantile(0.95, sum(rate(tidb_statistics_auto_analyze_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "auto analyze duration", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Auto Analyze Duration 95", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB auto analyze query per second", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 149 }, "id": 47, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_statistics_auto_analyze_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Auto Analyze QPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], 
"yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB statistics inaccurate rate", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 149 }, "id": 70, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_statistics_stats_inaccuracy_rate_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "A", "step": 30 }, { "expr": "histogram_quantile(0.90, sum(rate(tidb_statistics_stats_inaccuracy_rate_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "B" }, { "expr": "histogram_quantile(0.50, sum(rate(tidb_statistics_stats_inaccuracy_rate_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "50", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Stats Inaccuracy Rate", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB optimizer using pseudo estimation counts", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 156 }, "id": 71, 
"legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_statistics_pseudo_estimation_total[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Pseudo OPS", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Pseudo Estimation OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB dumping statistics back to kv storage times", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 156 }, "id": 92, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_statistics_dump_feedback_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Dump Feedback OPS", "tooltip": { "shared": true, "sort": 0, 
"value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB store quering feedback counts", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 156 }, "id": 170, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_statistics_store_query_feedback_total[1m])) by (type) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store Query Feedback QPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Counter of query feedback whose actual count is much different than calculated by current statistics", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 163 }, "id": 113, 
"legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_statistics_high_error_rate_feedback_total[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "Significant Feedback", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Significant Feedback", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB updating statistics using feed back counts", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 163 }, "id": 93, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_statistics_update_stats_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Update Stats OPS", "tooltip": { "shared": true, 
"sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB fast analyze statistics ", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 163 }, "id": 173, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_statistics_fast_analyze_status_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Fast Analyze Status 100", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Statistics", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 
}, "id": 161, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB new session durations for new etcd sessions", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 150 }, "id": 162, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_owner_new_session_duration_seconds_bucket[1m])) by (le, instance, result))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{result}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "New ETCD Session Duration 95", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB owner watcher counts", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 150 }, "id": 163, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, 
"renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_owner_watch_owner_total[1m])) by (type, result, instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{result}}-{{instance}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Owner Watcher OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 150 }, "id": 174, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_owner_watch_owner_total[1m])) by (type, result, instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{result}}-{{instance}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Owner Watcher OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": 
"short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Owner", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }, "id": 151, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB auto id requests per second including single table/global auto id processing and single table auto id rebase processing", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 151 }, "id": 50, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_autoid_operation_duration_seconds_count[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "AutoID QPS", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "AutoID QPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB auto id requests durations", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 151 }, "id": 51, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, 
"rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_autoid_operation_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99-{{type}}", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_autoid_operation_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80-{{type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "AutoID Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": "0.001", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB region cache operations count", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 158 }, "id": 164, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_region_cache_operations_total{result=\"err\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region Cache Error OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB meta operation durations including get/set schema and ddl jobs", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 158 }, "id": 52, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_meta_operation_duration_seconds_bucket[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Meta Operations Duration 99", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, 
"title": "Meta", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, "id": 152, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage garbage collection counts by type", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 152 }, "id": 85, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_tikvclient_gc_worker_actions_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Worker Action OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "kv storage garbage collection time durations", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 152 }, "id": 86, "legend": { "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, 
"points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_gc_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration 99", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "cacheTimeout": null, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage garbage collection config including gc_life_time and gc_run_interval", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 152 }, "id": 87, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pluginVersion": "6.1.6", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tidb_tikvclient_gc_config) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Config", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, 
"min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage garbage collection failing counts", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 159 }, "id": 88, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_tikvclient_gc_failure[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC Failure OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage unsafe destroy range failed counts", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 159 }, "id": 158, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": 
"flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_tikvclient_gc_unsafe_destroy_range_failures[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Delete Range Failure OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage region garbage collection clean too many locks count", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 159 }, "id": 90, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_tikvclient_gc_region_too_many_locks[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Locks Error OPM", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Too Many Locks Error OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { 
"format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage garbage collection results including failed and successful ones", "fill": 1, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 166 }, "id": 89, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_tikvclient_gc_action_result[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Action Result OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage delete range task execution status by type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 166 }, "id": 181, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tidb_tikvclient_range_task_stats) by (type, result)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}-{{result}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Delete Range Task Status", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage range worker processing one task duration", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 166 }, "id": 182, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_tikvclient_range_task_push_duration_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Push Task Duration 95", 
"tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "GC", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }, "id": 178, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage batch requests in queue", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 153 }, "id": 176, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tidb_tikvclient_pending_batch_requests) by (store)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{store}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Pending Request Count by TiKV", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], 
"yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage batch processing durations", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 153 }, "id": 179, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_tikvclient_batch_wait_duration_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Wait Duration 95", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ns", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "kv storage batch processing unvailable durations", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 153 }, "id": 180, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 2, 
"links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tidb_tikvclient_batch_client_unavailable_seconds_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Batch Client Unavailable Duration 95", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Batch Client", "type": "row" } ], "refresh": "30s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-TiDB", "uid": "000000011", "version": 5 } ================================================ FILE: scripts/tidb_summary.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": 
"Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 140, "panels": [ { "aliasColors": {}, "bars": false, "cacheTimeout": null, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB uptime since the last restart.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 1 }, "id": 184, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pluginVersion": "6.1.6", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(time() - process_start_time_seconds{job=\"tidb\"})", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Uptime", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dtdurations", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, 
"dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB current connection counts.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 1 }, "id": 8, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "tidb_server_connections", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 }, { "expr": "sum(tidb_server_connections)", "format": "time_series", "intervalFactor": 2, "legendFormat": "total", "refId": "B", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Connection Count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB CPU usage calculated with process CPU running seconds.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 8 }, "id": 168, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": 
true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(process_cpu_seconds_total{job=\"tidb\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU Usage", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB process rss memory usage.\nTiDB heap memory size in use.", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 8 }, "id": 3, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": false, "max": false, "min": false, "rightSide": false, "show": true, "sideWidth": null, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "process_resident_memory_bytes{job=\"tidb\"}", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "process-{{instance}}", "refId": "A" }, { "expr": 
"go_memstats_heap_alloc_bytes{job=\"tidb\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "heap-{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory Usage", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 138, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB query durations by histogram buckets with different percents.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 2 }, "id": 80, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_server_handle_query_duration_seconds_bucket{sql_type!=\"internal\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_server_handle_query_duration_seconds_bucket{sql_type!=\"internal\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95", "refId": "C" }, { "expr": 
"sum(rate(tidb_server_handle_query_duration_seconds_sum{sql_type!=\"internal\"}[30s])) / sum(rate(tidb_server_handle_query_duration_seconds_count{sql_type!=\"internal\"}[30s]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "TiDB failed query statistics by query type.", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 2 }, "id": 137, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sideWidth": 250, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tidb_server_execute_error_total[1m])) by (type, instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": " {{type}}-{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Failed Query OPS", "tooltip": { "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, 
"format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB query processing numbers per second.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 42, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": false, "max": false, "min": false, "rightSide": false, "show": true, "sideWidth": 250, "sort": null, "sortDesc": null, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_server_query_total[1m])) by (result)", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "query {{result}}", "refId": "A", "step": 60 }, { "expr": "sum(rate(tidb_server_query_total{result=\"OK\"}[1m] offset 1d))", "format": "time_series", "hide": true, "instant": false, "intervalFactor": 2, "legendFormat": "yesterday", "refId": "B", "step": 90 }, { "expr": "sum(tidb_server_connections) * sum(rate(tidb_server_handle_query_duration_seconds_count[1m])) / sum(rate(tidb_server_handle_query_duration_seconds_sum[1m]))", "format": "time_series", "hide": true, "instant": false, "intervalFactor": 2, "legendFormat": "ideal QPS", "refId": "C", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, 
"show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB query total statistics including both successful and failed ones.", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 2, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tidb_server_query_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} ", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS By Instance", "tooltip": { "msResolution": true, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, 
"description": "TiDB statement statistics by statement type.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 21, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": null, "sortDesc": null, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_executor_statement_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS by Statement", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "MySQL command statistics by command type", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 16 }, "id": 189, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sideWidth": 250, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": 
false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tidb_server_query_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": " {{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS by CMD", "tooltip": { "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Query Summary", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 142, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of parsing SQL to AST", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 3 }, "id": 156, "interval": "", "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": null, "sortDesc": null, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_parse_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "99", "refId": "A", "step": 30 }, { "expr": "histogram_quantile(0.95, 
sum(rate(tidb_session_parse_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Parse Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of building the query plan", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 3 }, "id": 154, "interval": "", "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": null, "sortDesc": null, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_compile_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "99", "refId": "A", "step": 30 }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_compile_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95", "refId": "B" } ], 
"thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compile Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "The time cost of executing the SQL which does not include the time to get the results of the query.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 10 }, "id": 169, "interval": "", "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": null, "sortDesc": null, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_execute_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "99", "refId": "A", "step": 30 }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_execute_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Execution Duration", "tooltip": {
"msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "TiDB plan cache hit total.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 10 }, "id": 91, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "sort": null, "sortDesc": null, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_server_plan_cache_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 30 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Queries Using Plan Cache OPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Query Detail", "type": "row" 
}, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 141, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB transaction processing counts by type.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 4 }, "id": 69, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_session_transaction_duration_seconds_count{sql_type=\"general\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Bucketed histogram of transaction execution durations, including retry.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 4 }, "id": 72, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, 
"linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_session_transaction_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tidb_session_transaction_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95", "refId": "B" }, { "expr": "histogram_quantile(0.80, sum(rate(tidb_session_transaction_duration_seconds_bucket{sql_type=\"general\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "80", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The max TiDB statements numbers within one transaction.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 11 }, "id": 74, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], 
"spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_session_transaction_statement_num_bucket{sql_type=\"general\"}[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Max Transaction Statement Num", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The max TiDB transaction retry count.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 11 }, "id": 67, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1.0, sum(rate(tidb_session_retry_num_bucket[30s])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Max Transaction Retry Num", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null,
"show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Transaction", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 145, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "KV request durations by store (TiKV). It contains requests that are executed automatically by the internal background.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 5 }, "id": 48, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_request_seconds_bucket{type!=\"GC\"}[1m])) by (le, store))", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{store}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Request Duration 99 by store", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, 
"alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "KV request durations by request type. It contains requests that are executed automatically by the internal background.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 5 }, "id": 30, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_tikvclient_request_seconds_bucket{type!=\"GC\"}[1m])) by (le,type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Request Duration 99 by type", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "KV request count by request type. 
It contains requests that are executed automatically by the internal background.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 5 }, "id": 172, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_request_seconds_count[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Request OPS", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB total kv transaction counts. 
It contains transactions that are executed automatically by the internal background.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 12 }, "id": 4, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_txn_cmd_duration_seconds_count[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Transaction OPS", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The max writes bytes of the transaction. 
It contains transactions that are executed automatically by the internal background.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 12 }, "id": 34, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_tikvclient_txn_write_size_bytes_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Max Transaction Write Size Bytes", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The max writes kv count of the transaction. 
It contains transactions that are executed automatically by the internal background.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 12 }, "id": 33, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "sort": "avg", "sortDesc": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_tikvclient_txn_write_kv_num_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Max Transaction Write KV Num", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The max writes regions of the transaction. 
It contains transactions that are executed automatically by the internal background.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 19 }, "id": 44, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tidb_tikvclient_txn_regions_num_bucket[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Max Transaction Regions Num", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration from when TiDB starts waiting for the TSO until it receives the TS result.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 19 }, "id": 77, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot",
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_cmd_handle_cmds_duration_seconds_bucket{type=\"wait\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO Wait Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The duration from when a client starts waiting for the TS until it receives the TS result.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 19 }, "id": 78, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr":
"histogram_quantile(0.999, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "999", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.90, sum(rate(pd_client_request_handle_requests_duration_seconds_bucket{type=\"tso\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "90", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD TSO RPC Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB auto-ID requests per second including single table/global auto-ID processing and single table auto-ID rebase processing.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 26 }, "id": 50, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tidb_autoid_operation_duration_seconds_count[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "AutoID QPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiDB auto-ID requests durations.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 26 }, "id": 51, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tidb_autoid_operation_duration_seconds_bucket[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99", "refId": "B" }, { "expr": "sum(rate(tidb_autoid_operation_duration_seconds_sum[1m])) / sum(rate(tidb_autoid_operation_duration_seconds_count[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "AutoID Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": 
null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Write Slow", "type": "row" } ], "refresh": "30s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-TiDB-Summary", "uid": "000000012", "version": 1 } ================================================ FILE: scripts/tiflash_proxy_details.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "panel", "id": "heatmap", "name": "Heatmap", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1577960059869, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 2742, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU usage of each TiKV instance", "editable": 
true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, "id": 1708, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tiflash_proxy_process_cpu_seconds_total{job=\"tiflash\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The memory usage per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, "id": 1709, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, 
"steppedLine": false, "targets": [ { "expr": "avg(tiflash_proxy_process_resident_memory_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The I/O utilization per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 1710, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IO utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] 
}, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "TiKV uptime since the last restart", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 4106, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(time() - tiflash_proxy_process_start_time_seconds)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Uptime", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dtdurations", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe number of leaders on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 
12, "x": 0, "y": 17 }, "id": 1715, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 }, { "expr": "delta(tiflash_proxy_tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[30s]) < -10", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Regions on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, "id": 1714, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, 
"total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_raftstore_region_count{instance=~\"$instance\", type=\"region\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Cluster", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 2743, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Critical error alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 2 }, "id": 2741, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_critical_error_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "A" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Critical error", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "Indicates occurrences of events that make the TiKV instance unavailable temporarily, such as Write Stall, Channel Full, Scheduler Busy, and Coprocessor Full", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 1584, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_too_busy_total{instance=~\"$instance\"}[1m])) 
by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "scheduler-{{instance}}", "metric": "", "refId": "A", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "channelfull-{{instance}}-{{type}}", "metric": "", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_coprocessor_request_error{instance=~\"$instance\", type='full'}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "coprocessor-{{instance}}", "metric": "", "refId": "C", "step": 4 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile99\", db=~\"$db\"}) by (instance, db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "stall-{{instance}}-{{db}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Server is busy", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "10s", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "10s", "handler": 1, "message": "TiKV server report failures", "name": "server report failures alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, 
"description": "The total number of reporting failure messages", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 18, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_server_report_failure_msg_total{instance=~\"$instance\"}[1m])) by (type,instance,store_id)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}} - to - {{store_id}}", "metric": "tiflash_proxy_tikv_server_raft_store_msg_total", "refId": "A", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Server report failures", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different raftstore errors on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 1718, "legend": { "alignAsTable": true, 
"avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_storage_engine_async_request_total{instance=~\"$instance\", status!~\"success|all\"}[1m])) by (instance, status)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{status}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raftstore error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of scheduler errors per type on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 16 }, "id": 1719, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, 
"renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_stage_total{instance=~\"$instance\", stage=~\"snapshot_err|prepare_write_err\"}[1m])) by (instance, stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{stage}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different coprocessor errors on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 1720, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_coprocessor_request_error{instance=~\"$instance\"}[1m])) by (instance, reason)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{reason}}", "metric": "", "refId": 
"A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of gRPC message errors per type on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 1721, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_grpc_msg_fail_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": 
null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of dropped leaders per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "id": 1722, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(tiflash_proxy_tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader drop", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of missing leaders per TiKV instance", "editable": true, 
"error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "id": 1723, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_raftstore_leader_missing{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader missing", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Errors", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 2744, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The size of each column family", "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 }, "id": 33, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": 
"current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_engine_size_bytes{instance=~\"$instance\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CF size", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The storage size per TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 }, "id": 1705, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_engine_size_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": 
[], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "sum(rate(tiflash_proxy_tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 }, "params": [ "A", "10s", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "10s", "handler": 1, "message": "TiKV channel full", "name": "TiKV channel full alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of channel full errors on each TiKV instance", "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 11 }, "id": 22, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tiflash_proxy_tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Channel full", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of leaders being written on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 11 }, "id": 75, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_region_written_keys_count{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_region_written_keys_bucket", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], 
"timeShift": null, "title": "Active written leaders", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1073741824 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "B", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "approximate region size alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The approximate Region size", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 19 }, "id": 1481, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "histogram_quantile(0.95, 
sum(rate(tiflash_proxy_tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_region_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_raftstore_region_size_count{instance=~\"$instance\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Approximate Region size", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 19 }, "id": 3638, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": false, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": false, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": 
"B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Approximate Region size Histogram", "tooltip": { "msResolution": false, "shared": false, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "histogram", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The average rate of writing bytes to Regions per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 27 }, "id": 58, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_region_written_bytes_sum[1m])) by (instance) / sum(rate(tiflash_proxy_tikv_region_written_bytes_count[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_regi", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region average written bytes", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, 
"values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateOranges", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "timeseries", "datasource": "${DS_TEST-CLUSTER}", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 27 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, "id": 3636, "legend": { "show": false }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_region_written_bytes_bucket[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_regi", "refId": "A", "step": 10 } ], "timeFrom": null, "timeShift": null, "title": "Region written bytes", "tooltip": { "show": true, "showHistogram": false }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": null, "format": "decbytes", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The average rate of written keys to Regions per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 35 }, "id": 57, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 
5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_region_written_keys_sum{instance=~\"$instance\"}[1m])) by (instance) / sum(rate(tiflash_proxy_tikv_region_written_keys_count{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_region_written_keys_bucket", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region average written keys", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateOranges", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "timeseries", "datasource": "${DS_TEST-CLUSTER}", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 35 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, "id": 3637, "legend": { "show": false }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_region_written_keys_bucket{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_region_written_keys_bucket", "refId": "A", "step": 10 } ], "timeFrom": null, "timeShift": null, "title": "Region written keys", "tooltip": { "show": true, "showHistogram": false }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, 
"xBucketSize": null, "yAxis": { "decimals": null, "format": "short", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The ratio of request batch output to input per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 43 }, "id": 3718, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_server_request_batch_ratio_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tiflash_proxy_tikv_server_request_batch_ratio_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "{{type}} avg", "refId": "B" }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_server_request_batch_ratio_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}} 99", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Request batch ratio", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": 
false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The size of requests into request batch per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 43 }, "id": 3720, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_server_request_batch_size_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tiflash_proxy_tikv_server_request_batch_size_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}} avg", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_server_request_batch_size_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}} 99", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Request batch input", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 2745, "panels": [ { "aliasColors": {}, "bars": 
false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different kinds of gRPC message", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "id": 95, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_grpc_msg_duration_seconds_count{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tiflash_proxy_tikv_grpc_msg_duration_seconds_bucket", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different kinds of gRPC message which is failed", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "id": 107, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": 
true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_grpc_msg_fail_total{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tiflash_proxy_tikv_grpc_msg_fail_total", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message failed", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The execution time of gRPC message", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, "id": 98, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_grpc_msg_duration_seconds_bucket{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (le, type))", "format": "time_series", "interval": "", "intervalFactor": 2, 
"legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% gRPC messge duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, "id": 2532, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_grpc_msg_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tiflash_proxy_tikv_grpc_msg_duration_seconds_count[1m])) by (type)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Average gRPC messge duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, "id": 2533, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_server_grpc_req_batch_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "99% request", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_server_grpc_resp_batch_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99% response", "refId": "B" }, { "expr": "sum(rate(tiflash_proxy_tikv_server_grpc_req_batch_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_server_grpc_req_batch_size_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg request", "refId": "C" }, { "expr": "sum(rate(tiflash_proxy_tikv_server_grpc_resp_batch_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_server_grpc_resp_batch_size_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg response", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC batch size", 
"tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, "id": 2534, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_server_raft_message_batch_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "99%", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_server_raft_message_batch_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_server_raft_message_batch_size_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "raft message batch size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, 
"logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "gRPC", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 2746, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 1.7 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "A", "step": 20 }, "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "60s", "handler": 1, "message": "TiKV raftstore thread CPU usage is high", "name": "TiKV raft store CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of raftstore thread", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 5 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.85 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft store CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1.8 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV async apply thread CPU usage is high", "name": "TiKV async apply CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of async apply", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 5 }, "id": 79, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"apply_[0-9]+\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Async apply CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV scheduler worker thread CPU usage is high", "name": "TiKV scheduler worker CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of scheduler worker", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 12 }, "id": 64, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"sched_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV gRPC poll thread CPU usage is high", "name": "TiKV gRPC poll CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of gRPC", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 12 }, "id": 105, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, 
"lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"grpc.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV unified read pool thread CPU usage is high", "name": "Unified read pool CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of the unified read pool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 19 }, "id": 4287, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, 
"sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"unified_read_po*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 7.2 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Unified read pool CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV Storage ReadPool thread CPU usage is high", "name": "TiKV Storage ReadPool CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of readpool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 19 }, "id": 1908, "legend": { "alignAsTable": true, 
"avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_norm.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_high.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_low.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 3.6 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage ReadPool CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": 
false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV Coprocessor thread CPU alert", "name": "TiKV Coprocessor CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of coprocessor", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 26 }, "id": 78, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_normal.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "refId": "A", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_high.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_low.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "refId": 
"C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 7.2 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of RocksDB", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 26 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"rocksdb.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "warning", "fill": true, "line": true, "op": "gt", "value": 1 }, { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 4 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB CPU", "tooltip": { "msResolution": false, "shared": 
true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of split check", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 33 }, "id": 68, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"split_check\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Split check CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null 
} }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 33 }, "id": 2531, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"gc_worker.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC worker CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of snapshot worker", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 40 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 
1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"snapshot_worker\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Snapshot worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Thread CPU", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 2747, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of requests that TiKV sends to PD", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, "id": 1069, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tiflash_proxy_tikv_pd_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD requests", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by requests that TiKV sends to PD", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, "id": 1070, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_pd_request_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tiflash_proxy_tikv_pd_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD request duration (average)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": 
null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of PD heartbeat messages", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, "id": 1215, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_pd_heartbeat_message_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD heartbeats", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of peers validated by the PD worker", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, 
"y": 14 }, "id": 1396, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_pd_validate_peer_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD validate peers", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "PD", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 2748, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft applies log", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }, "id": 31, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_apply_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_raftstore_apply_log_duration_seconds_count{instance=~\"$instance\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply log duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed for Raft to apply logs per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }, "id": 32, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": 
[], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": " {{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply log duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft appends log", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }, "id": 39, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99%", "metric": "", "refId": "A", "step": 4 
}, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_append_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_raftstore_append_log_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Append log duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft appends log on each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }, "id": 40, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by 
(le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} ", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Append log duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft commits log", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "id": 3690, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B" }, { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_commit_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / 
sum(rate(tiflash_proxy_tikv_raftstore_commit_log_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit log duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft commits log on each TiKV instance", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, "id": 3688, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit log duration per server", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": 
null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft IO", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 2749, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different ready types of Raft", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "id": 5, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_raft_ready_handled_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tiflash_proxy_tikv_raftstore_raft_ready_handled_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_raft_process_duration_secs_count{instance=~\"$instance\", type=\"ready\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "count", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ready handled", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed for peer processes to be ready in Raft", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, "id": 118, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_raft_process_duration_secs_bucket{instance=~\"$instance\", type='ready'}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Process ready duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by raftstore events (P99)", 
"editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "id": 123, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_event_duration_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "0.99 Duration of raft store events", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft process", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, "id": 2750, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Raft messages sent by each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 47 }, "id": 1615, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, 
"min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_raft_sent_message_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Sent messages per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Raft messages flushed by each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 47 }, "id": 1616, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tiflash_proxy_tikv_server_raft_message_flush_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_server_raft_message_flush_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Flush messages per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Raft messages received by each TiKV instance", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }, "id": 106, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_server_raft_message_recv_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Receive messages per server", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { 
"buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different types of Raft messages that are sent", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }, "id": 11, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_raft_sent_message_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Messages", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total 
number of vote messages that are sent in Raft", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 61 }, "id": 25, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_raft_sent_message_total{instance=~\"$instance\", type=\"vote\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Vote", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of dropped Raft messages per type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 61 }, "id": 1309, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": 
false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_raft_dropped_message_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft dropped messages", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft message", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, "id": 2751, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The proposal count of all Regions in a mio tick", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 48 }, "id": 108, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_apply_proposal_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft proposals per ready", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of proposals per type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 48 }, "id": 7, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_proposal_total{instance=~\"$instance\", type=~\"local_read|normal|read_index|batch\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tiflash_proxy_tikv_raftstore_proposal_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft read/write proposals", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": 
"ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of read proposals which are made by each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 55 }, "id": 119, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_proposal_total{instance=~\"$instance\", type=~\"local_read|read_index\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft read proposals per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of write proposals which 
are made by each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 55 }, "id": 120, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_proposal_total{instance=~\"$instance\", type=\"normal\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_raftstore_proposal_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft write proposals per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The wait time of each proposal", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 62 }, "id": 41, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "tiflash_proxy_tikv_raftstore_request_wait_time_duration_secs_bucket", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_request_wait_time_duration_secs_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_raftstore_request_wait_time_duration_secs_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Propose wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The wait time of each proposal in each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 62 }, "id": 42, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, 
"min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Propose wait duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 69 }, "id": 2535, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by 
(le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "tiflash_proxy_tikv_raftstore_request_wait_time_duration_secs_bucket", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_apply_wait_time_duration_secs_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_raftstore_apply_wait_time_duration_secs_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 69 }, "id": 2536, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tiflash_proxy_tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply wait duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The rate at which peers propose logs", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 76 }, "id": 1975, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": true, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(rate(tiflash_proxy_tikv_raftstore_propose_log_size_sum{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft log speed", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "short", 
"label": "bytes/s", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 76 }, "id": 4375, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_store_perf_context_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{type}}", "metric": "tiflash_proxy_tikv_raftstore_request_wait_time_duration_secs_bucket", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_apply_perf_context_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "apply-{{type}}", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Perf Context duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft propose", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, "id": 2752, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of admin proposals", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 11 }, "id": 76, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_proposal_total{instance=~\"$instance\", type=~\"conf_change|transfer_leader\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tiflash_proxy_tikv_raftstore_proposal_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Admin proposals", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of the processed apply command", 
"editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 11 }, "id": 77, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_admin_cmd_total{instance=~\"$instance\", status=\"success\", type!=\"compact\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tiflash_proxy_tikv_raftstore_admin_cmd_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Admin apply", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of raftstore split checks", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 18 }, "id": 70, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_raftstore_check_split_total{instance=~\"$instance\", type!=\"ignore\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tiflash_proxy_tikv_raftstore_check_split_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Check split", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The 99.99th percentile of the time consumed when running split checks", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 18 }, "id": 71, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, sum(rate(tiflash_proxy_tikv_raftstore_check_split_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 
2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_raftstore_check_split_duration_seconds_bucket", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99.99% Check split duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft admin", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, "id": 4200, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time used by each level in the unified read pool per second. 
Level 0 refers to small queries.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 }, "id": 4194, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_multilevel_level_elapsed{instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (level)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{level}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Time used by level", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The chance that level 0 (small) tasks are scheduled in the unified read pool.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 }, "id": 4196, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"tiflash_proxy_tikv_multilevel_level0_chance{instance=~\"$instance\", name=\"unified-read-pool\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Level 0 chance", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of concurrently running tasks in the unified read pool.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "id": 4198, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(avg_over_time(tiflash_proxy_tikv_unified_read_pool_running_tasks[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Running tasks", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Unified Read Pool", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }, "id": 2754, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total count of different kinds of commands received", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_storage_command_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage command total", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of engine 
asynchronous request errors", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_storage_engine_async_request_total{instance=~\"$instance\", status!~\"all|success\"}[1m])) by (status)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{status}}", "metric": "tiflash_proxy_tikv_raftstore_raft_process_duration_secs_bucket", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage async request error", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by processing asynchronous snapshot requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 }, "id": 15, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, 
"sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_storage_engine_async_request_duration_seconds_sum{instance=~\"$instance\", type=\"snapshot\"}[1m])) / sum(rate(tiflash_proxy_tikv_storage_engine_async_request_duration_seconds_count{instance=~\"$instance\", type=\"snapshot\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage async snapshot duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by processing asynchronous write 
requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 }, "id": 109, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"write\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"write\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_storage_engine_async_request_duration_seconds_sum{instance=~\"$instance\", type=\"write\"}[1m])) / sum(rate(tiflash_proxy_tikv_storage_engine_async_request_duration_seconds_count{instance=~\"$instance\", type=\"write\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage async write duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Storage", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }, "id": 2755, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of commands on each stage", "fill": 1, "gridPos": { "h": 10, "w": 12, "x": 0, "y": 15 }, "height": "400", "id": 167, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_too_busy_total{instance=~\"$instance\"}[1m])) by (stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "busy", "refId": "A", "step": 20 }, { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_stage_total{instance=~\"$instance\"}[1m])) by (stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{stage}}", "refId": "B", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler stage total", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": 
"${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total writing bytes of commands on each stage", "fill": 1, "gridPos": { "h": 10, "w": 12, "x": 12, "y": 15 }, "height": "400", "id": 3834, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_scheduler_writing_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler writing bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different priority commands", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }, "height": "", "id": 1, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 2, 
"nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_commands_pri_total{instance=~\"$instance\"}[1m])) by (priority)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{priority}}", "metric": "", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler priority commands", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 300 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "120s", "handler": 1, "message": "TiKV scheduler context total", "name": "scheduler pending commands alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of pending commands per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }, "height": "", "id": 193, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], 
"maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_scheduler_contex_total{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 40 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 300 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler pending commands", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Scheduler", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, "id": 2756, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of commands on each stage in commit command", "fill": 1, "gridPos": { "h": 10, "w": 24, "x": 0, "y": 16 }, "height": "400", "id": 168, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], 
"spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_too_busy_total{instance=~\"$instance\", type=\"$command\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "busy", "refId": "A", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_stage_total{instance=~\"$instance\", type=\"$command\"}[1m])) by (stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{stage}}", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler stage total", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 }, "id": 3, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tiflash_proxy_tikv_scheduler_command_duration_seconds_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_scheduler_command_duration_seconds_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_command_duration_seconds_sum{instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tiflash_proxy_tikv_scheduler_command_duration_seconds_count{instance=~\"$instance\", type=\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler command duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time which is caused by latch wait in commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 26 }, "id": 194, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": 
true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_scheduler_latch_wait_duration_seconds_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_scheduler_latch_wait_duration_seconds_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_latch_wait_duration_seconds_sum{instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tiflash_proxy_tikv_scheduler_latch_wait_duration_seconds_count{instance=~\"$instance\", type=\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler latch wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of keys read by a commit 
command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 34 }, "id": 195, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_scheduler_kv_command_key_read_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "kv_command_key", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_scheduler_kv_command_key_read_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_kv_command_key_read_sum{instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tiflash_proxy_tikv_scheduler_kv_command_key_read_count{instance=~\"$instance\", type=\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler keys read", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, 
"min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of keys written by a commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 34 }, "id": 373, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_scheduler_kv_command_key_write_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "kv_command_key", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_scheduler_kv_command_key_write_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_kv_command_key_write_sum{instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tiflash_proxy_tikv_scheduler_kv_command_key_write_count{instance=~\"$instance\", type=\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], 
"timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler keys written", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys scan details of each CF when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 42 }, "id": 560, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_kv_scan_details{instance=~\"$instance\", req=\"$command\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler scan details", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { 
"format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys scan details of lock CF when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 42 }, "id": 675, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_kv_scan_details{instance=~\"$instance\", req=\"$command\", cf=\"lock\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler scan details [lock]", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, 
"dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys scan details of write CF when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 50 }, "id": 829, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_kv_scan_details{instance=~\"$instance\", req=\"$command\", cf=\"write\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler scan details [write]", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys scan details of default CF when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 50 }, 
"id": 830, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_kv_scan_details{instance=~\"$instance\", req=\"$command\", cf=\"default\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler scan details [default]", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": "command", "title": "Scheduler - $command", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }, "id": 2759, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The rate of Raft snapshot messages sent", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 16 }, "id": 35, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": 
true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(tiflash_proxy_tikv_raftstore_raft_sent_message_total{instance=~\"$instance\", type=\"snapshot\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": " ", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Rate snapshot message", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when handling snapshots", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 16 }, "id": 36, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_server_send_snapshot_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "send", "refId": "A", "step": 60 }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_snapshot_duration_seconds_bucket{instance=~\"$instance\", type=\"apply\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "apply", "refId": "B", "step": 60 }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_proxy_tikv_raftstore_snapshot_duration_seconds_bucket{instance=~\"$instance\", type=\"generate\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "generate", "refId": "C", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% Handle snapshot duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of snapshots in different states", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 16 }, "id": 38, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": true, "targets": [ { "expr": 
"sum(tiflash_proxy_tikv_raftstore_snapshot_traffic_total{instance=~\"$instance\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Snapshot state count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The snapshot size (P99.99).9999", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 44, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, sum(rate(tiflash_proxy_tikv_snapshot_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "size", "metric": "tiflash_proxy_tikv_snapshot_size_bucket", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99.99% Snapshot size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", 
"xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of KV within a snapshot in .9999", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 43, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, sum(rate(tiflash_proxy_tikv_snapshot_kv_count_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "count", "metric": "tiflash_proxy_tikv_snapshot_kv_count_bucket", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99.99% Snapshot KV count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Snapshot", "type": "row" }, { 
"collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 16 }, "id": 2760, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of tasks handled by worker", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "id": 59, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_worker_handled_task_total{instance=~\"$instance\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Worker handled tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tCurrent pending and running tasks of worker", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, "id": 1395, "legend": { "alignAsTable": true, "avg": false, 
"current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_worker_pending_task_total{instance=~\"$instance\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Worker pending tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of tasks handled by future_pool", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }, "id": 1876, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { 
"expr": "sum(rate(tiflash_proxy_tikv_futurepool_handled_task_total{instance=~\"$instance\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "FuturePool handled tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "Current pending and running tasks of future_pool", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }, "id": 1877, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_futurepool_pending_task_total{instance=~\"$instance\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "FuturePool pending tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": 
"graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Task", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }, "id": 2761, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 24 }, "id": 2108, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_threads_state{instance=~\"$instance\"}) by (instance, state)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}-{{state}}", "refId": "A", "step": 4 }, { "expr": "sum(tiflash_proxy_tikv_threads_state{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-total", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Threads state", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 24 }, "id": 2258, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_threads_io_bytes_total{instance=~\"$instance\"}[30s])) by (name, io) > 1024", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{name}}-{{io}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Threads IO", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }, "id": 2660, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 
2, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_voluntary_context_switches{instance=~\"$instance\"}[30s])) by (instance, name) > 200", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}} - {{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Thread Voluntary Context Switches", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }, "id": 2661, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_thread_nonvoluntary_context_switches{instance=~\"$instance\"}[30s])) by (instance, name) > 50", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}} - {{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Thread Nonvoluntary Context 
Switches", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Threads", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, "id": 2762, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of get operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }, "id": 138, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_memtable_efficiency{instance=~\"$instance\", db=\"$db\", type=\"memtable_hit\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "memtable", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=~\"block_cache_data_hit|block_cache_filter_hit\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "block_cache", "metric": "", "refId": "E", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_get_served{instance=~\"$instance\", db=\"$db\", 
type=\"get_hit_l0\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "l0", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_get_served{instance=~\"$instance\", db=\"$db\", type=\"get_hit_l1\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "l1", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_get_served{instance=~\"$instance\", db=\"$db\", type=\"get_hit_l2_and_up\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "l2_and_up", "refId": "F", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Get operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing get operations", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }, "id": 82, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_get_micro_seconds{instance=~\"$instance\", 
db=\"$db\",type=\"get_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"get_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"get_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"get_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Get duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of seek operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 }, "id": 129, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_seek\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "seek", "metric": "", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_seek_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "seek_found", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_next\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "next", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_next_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "next_found", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_prev\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "prev", "metric": "", "refId": "E", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_prev_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "prev_found", "metric": "", "refId": "F", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Seek operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing seek operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 }, "id": 125, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"seek_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"seek_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"seek_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"seek_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Seek duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": 
true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of write operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 41 }, "id": 139, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_write_served{instance=~\"$instance\", db=\"$db\", type=~\"write_done_by_self|write_done_by_other\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "done", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_write_served{instance=~\"$instance\", db=\"$db\", type=\"write_timeout\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "timeout", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_write_served{instance=~\"$instance\", db=\"$db\", type=\"write_with_wal\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "with_wal", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, 
"show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing write operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 41 }, "id": 126, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "Write duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing write wal operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 41 }, "id": 130, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_write_wal_time_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_wal_time_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_wal_time_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", 
"step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_wal_time_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write WAL duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe count of WAL sync operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 49 }, "id": 137, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_wal_file_synced{instance=~\"$instance\", db=\"$db\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "sync", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "WAL sync operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, 
"mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing WAL sync operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 49 }, "id": 135, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_average\"})", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "WAL sync duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of compaction and flush operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 57 }, "id": 128, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_event_total{instance=~\"$instance\", db=\"$db\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tiflash_proxy_tikv_engine_event_total", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": 
null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing the compaction and flush operations", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 57 }, "id": 136, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_compaction_time{instance=~\"$instance\", db=\"$db\",type=\"compaction_time_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_compaction_time{instance=~\"$instance\", db=\"$db\",type=\"compaction_time_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_compaction_time{instance=~\"$instance\", db=\"$db\",type=\"compaction_time_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_compaction_time{instance=~\"$instance\", db=\"$db\",type=\"compaction_time_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, 
"title": "Compaction duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when reading SST files", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 65 }, "id": 140, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_sst_read_micros{instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_sst_read_micros{instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_sst_read_micros{instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": 
"avg(tiflash_proxy_tikv_engine_sst_read_micros{instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "SST read duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time which is caused by write stall", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 153 }, "id": 87, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_write_stall{instance=~\"$instance\", db=\"$db\", type=\"write_stall_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_stall{instance=~\"$instance\", db=\"$db\", type=\"write_stall_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", 
"step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_stall{instance=~\"$instance\", db=\"$db\", type=\"write_stall_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_stall{instance=~\"$instance\", db=\"$db\", type=\"write_stall_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write stall duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The memtable size of each column family", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 153 }, "id": 103, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tiflash_proxy_tikv_engine_memory_bytes{instance=~\"$instance\", db=\"$db\", type=\"mem-tables\"}) by (cf)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cf}}", 
"refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memtable size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The hit rate of memtable", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 73 }, "id": 88, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_memtable_efficiency{instance=~\"$instance\", db=\"$db\", type=\"memtable_hit\"}[1m])) / (sum(rate(tiflash_proxy_tikv_engine_memtable_efficiency{db=\"$db\", type=\"memtable_hit\"}[1m])) + sum(rate(tiflash_proxy_tikv_engine_memtable_efficiency{db=\"$db\", type=\"memtable_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "hit", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memtable hit", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, 
"show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The block cache size. Broken down by column family if shared block cache is disabled.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 81 }, "id": 102, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "topk(20, avg(tiflash_proxy_tikv_engine_block_cache_size_bytes{instance=~\"$instance\", db=\"$db\"}) by(cf, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{cf}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 
1, "description": "The hit rate of block cache", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 81 }, "id": 80, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_hit\"}[1m])) / (sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_hit\"}[1m])) + sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all", "metric": "", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_hit\"}[1m])) / (sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_hit\"}[1m])) + sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "data", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_hit\"}[1m])) / (sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_hit\"}[1m])) + 
sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "filter", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_hit\"}[1m])) / (sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_hit\"}[1m])) + sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "index", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_bloom_efficiency{instance=~\"$instance\", db=\"$db\", type=\"bloom_prefix_useful\"}[1m])) / sum(rate(tiflash_proxy_tikv_engine_bloom_efficiency{instance=~\"$instance\", db=\"$db\", type=\"bloom_prefix_checked\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "bloom prefix", "metric": "", "refId": "E", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache hit", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow of different kinds of block cache operations", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 89 }, "height": "", "id": 467, "legend": { "alignAsTable": true, "avg": 
false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"block_cache_byte_read\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "total_read", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"block_cache_byte_write\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "total_written", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_bytes_insert\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "data_insert", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_bytes_insert\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "filter_insert", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_bytes_evict\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "filter_evict", "metric": "", "refId": "E", "step": 10 }, { "expr": 
"sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_bytes_insert\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "index_insert", "metric": "", "refId": "F", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_bytes_evict\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "index_evict", "metric": "", "refId": "G", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different kinds of block cache operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 89 }, "id": 468, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", 
db=\"$db\", type=\"block_cache_add\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "total_add", "metric": "", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_add\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "data_add", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_add\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "filter_add", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_add\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "index_add", "metric": "", "refId": "E", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_add_failures\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "add_failures", "metric": "", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow of different kinds of operations on keys", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 97 }, "height": "", "id": 
132, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"keys_read\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "read", "refId": "B", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"keys_written\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "written", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_compaction_num_corrupt_keys{instance=~\"$instance\", db=\"$db\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "corrupt", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Keys flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of keys in each column 
family", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 97 }, "id": 131, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_engine_estimate_num_keys{instance=~\"$instance\", db=\"$db\"}) by (cf)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{cf}}", "metric": "tiflash_proxy_tikv_engine_estimate_num_keys", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total keys", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow rate of read operations per type", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 105 }, "height": "", "id": 85, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, 
"points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"bytes_read\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "get", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"iter_bytes_read\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "scan", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The bytes per read", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 105 }, "id": 133, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, 
"targets": [ { "expr": "max(tiflash_proxy_tikv_engine_bytes_per_read{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_bytes_per_read{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_bytes_per_read{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_bytes_per_read{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Bytes / Read", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow of different kinds of write operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 113 }, "height": "", "id": 86, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": 
"null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"wal_file_bytes\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "wal", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"bytes_written\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "write", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The bytes per write", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 113 }, "id": 134, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, 
"steppedLine": false, "targets": [ { "expr": "max(tiflash_proxy_tikv_engine_bytes_per_write{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_bytes_per_write{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_bytes_per_write{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tiflash_proxy_tikv_engine_bytes_per_write{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Bytes / Write", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow rate of compaction operations per type", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 121 }, "id": 90, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], 
"nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_compaction_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"bytes_read\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "read", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_compaction_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"bytes_written\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "written", "refId": "C", "step": 10 }, { "expr": "sum(rate(tiflash_proxy_tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"flush_write_bytes\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "flushed", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The pending bytes to be compacted", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 121 }, "id": 127, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, 
"lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_pending_compaction_bytes{instance=~\"$instance\", db=\"$db\"}[1m])) by (cf)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{cf}}", "metric": "tiflash_proxy_tikv_engine_pending_compaction_bytes", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction pending bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The read amplification per TiKV instance \t", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 129 }, "id": 518, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tiflash_proxy_tikv_engine_read_amp_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"read_amp_total_read_bytes\"}[1m])) by (instance) / sum(rate(tiflash_proxy_tikv_engine_read_amp_flow_bytes{db=\"$db\", type=\"read_amp_estimate_useful_bytes\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read amplication", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The compression ratio of each level", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 129 }, "id": 863, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tiflash_proxy_tikv_engine_compression_ratio{instance=~\"$instance\", db=\"$db\"}) by (level)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "level - {{level}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], 
"timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compression ratio", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of snapshot of each TiKV instance", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 137 }, "id": 516, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_proxy_tikv_engine_num_snapshots{instance=~\"$instance\", db=\"$db\"}", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Number of snapshots", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], 
"yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time that the oldest unreleased snapshot survivals", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 137 }, "id": 517, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_proxy_tikv_engine_oldest_snapshot_duration{instance=~\"$instance\", db=\"$db\"}", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tiflash_proxy_tikv_engine_oldest_snapshot_duration", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Oldest snapshots duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of SST files for different column families in each level", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 145 }, "id": 2002, "legend": { "alignAsTable": true, "avg": false, "current": true, 
"max": true, "min": true, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tiflash_proxy_tikv_engine_num_files_at_level{instance=~\"$instance\", db=\"$db\"}) by (cf, level)", "format": "time_series", "intervalFactor": 2, "legendFormat": "cf-{{cf}}, level-{{level}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Number files at each level", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when ingesting SST files", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 145 }, "id": 2003, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tiflash_proxy_tikv_snapshot_ingest_sst_duration_seconds_bucket{instance=~\"$instance\", db=\"$db\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A" }, { "expr": "sum(rate(tiflash_proxy_tikv_snapshot_ingest_sst_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tiflash_proxy_tikv_snapshot_ingest_sst_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "average", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ingest SST duration seconds", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Stall conditions changed of each column family", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 153 }, "id": 2381, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideZero": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_proxy_tikv_engine_stall_conditions_changed{instance=~\"$instance\", db=\"$db\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"{{instance}}-{{cf}}-{{type}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Stall conditions changed of each CF", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 161 }, "id": 2452, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tiflash_proxy_tikv_engine_write_stall_reason{instance=~\"$instance\", db=\"$db\"}[1m])) by (type)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write Stall Reason", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, 
"logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 65 }, "id": 2451, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_engine_compaction_reason{instance=~\"$instance\", db=\"$db\"}[1m])) by (cf, reason)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{cf}} - {{reason}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction reason", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": "db", "title": "RocksDB - $db", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }, "id": 4414, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Total number of encryption data keys in use", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, 
"y": 21 }, "id": 4453, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_proxy_tikv_encryption_data_key_storage_total", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Encryption data keys", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Number of files being encrypted", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, "id": 4492, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_proxy_tikv_encryption_file_num", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": 
"Encrypted files", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Flag to indicate if encryption is initialized", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }, "id": 4496, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_proxy_tikv_encryption_is_initialized", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Encryption initialized", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Total size of encryption meta files", "fill": 1, "gridPos": { "h": 8, "w": 
12, "x": 12, "y": 29 }, "id": 4497, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_proxy_tikv_encryption_meta_file_size_bytes", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{name}}-{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Encryption meta files size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Writing or reading file duration (second)", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 37 }, "id": 4498, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tiflash_proxy_tikv_encryption_write_read_file_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, type, operation))", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "max-{{type}}-{{operation}}", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_proxy_tikv_encryption_write_read_file_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, type, operation))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%-{{type}}-{{operation}}", "refId": "B" }, { "expr": "sum(rate(tiflash_proxy_tikv_encryption_write_read_file_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (le, type, operation) / sum(rate(tiflash_proxy_tikv_encryption_write_read_file_duration_seconds_count{instance=~\"$instance\"}[1m])) by (le, type, operation)", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg-{{type}}-{{operation}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read/write encryption meta duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Encryption", "type": "row" } ], "refresh": "1m", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "db", "multi": true, "name": "db", "options": [], "query": "label_values(tiflash_proxy_tikv_engine_block_cache_size_bytes, db)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "command", "multi": true, "name": "command", 
"options": [], "query": "label_values(tiflash_proxy_tikv_storage_command_total, type)", "refresh": 1, "regex": "prewrite|commit|rollback", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": ".*", "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "Instance", "multi": false, "name": "instance", "options": [], "query": "label_values(tiflash_proxy_tikv_engine_size_bytes, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-TiFlash-Proxy-Details", "uid": "kWxNAVnGz", "version": 1 } ================================================ FILE: scripts/tiflash_proxy_summary.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1582082299870, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 2742, "panels": [ { 
"aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe CPU usage of each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, "id": 1708, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_tiflash_proxy_thread_cpu_seconds_total{instance=~\"$instance\", job=\"tiflash\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe memory usage of each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, "id": 1709, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, 
"total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tiflash_proxy_process_resident_memory_bytes{instance=~\"$instance\", job=\"tiflash\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe I/O utilization per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 1710, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "IO utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe number of Regions on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 1714, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_raftstore_region_count{instance=~\"$instance\", type=\"region\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], 
"repeat": null, "title": "Cluster", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 2743, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "It contains some kinds of events such as write stall, channel full, scheduler busy, and coprocessor full, which will make the TiKV instance unavailable temporarily.", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 2 }, "id": 1584, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_scheduler_too_busy_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "scheduler-{{instance}}", "metric": "", "refId": "A", "step": 4 }, { "expr": "sum(rate(tiflash_proxy_tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "channelfull-{{instance}}-{{type}}", "metric": "", "refId": "B", "step": 4 }, { "expr": "avg(tiflash_proxy_tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile99\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "stall-{{instance}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Server is busy", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": 
"time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of missing leaders per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 2 }, "id": 1723, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_raftstore_leader_missing{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader missing", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Errors", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 
0, "y": 2 }, "id": 2744, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe total size of each column family", "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 }, "id": 33, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_proxy_tikv_engine_size_bytes{instance=~\"$instance\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CF size", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 2746, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 1.7 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "sum(rate(tikv_tiflash_proxy_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": 
"tikv_tiflash_proxy_thread_cpu_seconds_total", "refId": "A", "step": 20 }, "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "60s", "handler": 1, "message": "TiKV raftstore thread CPU usage is high", "name": "TiKV raft store CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of raftstore thread", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 4 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_tiflash_proxy_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_tiflash_proxy_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.85 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft store CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, 
"max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of RocksDB", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 4 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_tiflash_proxy_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"rocksdb.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_tiflash_proxy_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "warning", "fill": true, "line": true, "op": "gt", "value": 1 }, { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 4 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe CPU 
utilization of split check", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 11 }, "id": 68, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_tiflash_proxy_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"split_check\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_tiflash_proxy_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Split check CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of snapshot worker", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 11 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, 
"lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_tiflash_proxy_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"snapshot_worker\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_tiflash_proxy_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Snapshot worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 18 }, "id": 2531, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_tiflash_proxy_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"gc_worker.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], 
"thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC worker CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Thread CPU", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 2747, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe count of requests that TiKV sends to PD", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }, "id": 1069, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 350, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_pd_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD requests", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, 
"max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by requests that TiKV sends to PD", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }, "id": 1070, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 350, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_proxy_tikv_pd_request_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tiflash_proxy_tikv_pd_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD request duration (average)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "PD", "type": "row" } ], "refresh": "1m", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": ".*", "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "Instance", "multi": true, "name": "instance", "options": [], "query": 
"label_values(tiflash_proxy_tikv_engine_size_bytes, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-TiFlash-Proxy-Summary", "uid": "myoLjZQWz", "version": 18 } ================================================ FILE: scripts/tiflash_summary.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "panel", "id": "heatmap", "name": "Heatmap", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1595916828338, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 4, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The storage size per TiFlash instance.\n(Not including some disk usage of TiFlash-Proxy by now)", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 }, "id": 53, "legend": { "alignAsTable": true, "avg": false, "current": 
true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_system_current_metric_StoreSizeUsed{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The available capacity size per TiFlash instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 8, "y": 1 }, "id": 54, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_system_current_metric_StoreSizeAvailable{instance=~\"$instance\"}) by 
(instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Available size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The capacity size per TiFlash instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 1 }, "id": 55, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_system_current_metric_StoreSizeCapacity{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Capacity size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": 
null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "cacheTimeout": null, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiFlash uptime since last restart", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 21, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pluginVersion": "6.1.6", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_system_asynchronous_metric_Uptime{instance=~\"$instance\"}", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Uptime", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dtdurations", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The memory usage per TiFlash instance", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 10, "legend": { "alignAsTable": true, "avg": false, "current": 
true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_system_asynchronous_metric_jemalloc_retained{instance=~\"$instance\"})", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "retained", "refId": "A" }, { "expr": "sum(tiflash_system_asynchronous_metric_jemalloc_mapped{instance=~\"$instance\"})", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "mapped", "refId": "B" }, { "expr": "sum(tiflash_system_asynchronous_metric_jemalloc_resident{instance=~\"$instance\"})", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "resident", "refId": "C" }, { "expr": "sum(tiflash_system_asynchronous_metric_jemalloc_allocated{instance=~\"$instance\"})", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "allocated", "refId": "D" }, { "expr": "sum(tiflash_system_asynchronous_metric_jemalloc_active{instance=~\"$instance\"})", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "active", "refId": "E" }, { "expr": "sum(tiflash_system_asynchronous_metric_jemalloc_metadata_thp{instance=~\"$instance\"})", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "metadata_thp", "refId": "F" }, { "expr": "sum(tiflash_system_asynchronous_metric_jemalloc_metadata{instance=~\"$instance\"})", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "metadata", "refId": "G" }, { "expr": "tiflash_proxy_process_resident_memory_bytes{job=\"tiflash\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "H" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, 
"title": "Memory", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "TiFlash CPU usage calculated with process CPU running seconds.", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 17 }, "id": 51, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "fill": 0, "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tiflash_proxy_process_cpu_seconds_total{job=\"tiflash\"}[1m])", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU Usage", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": 
false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of fsync operations.\n(Only counting storage engine of TiFlash by now. Not including TiFlash-Proxy)", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 17 }, "id": 52, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_system_profile_event_FileFSync{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "FSync OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of open file descriptors action.\n(Only counting storage engine of TiFlash by now. 
Not including TiFlash-Proxy)", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "id": 22, "legend": { "alignAsTable": false, "avg": false, "current": false, "hideEmpty": true, "hideZero": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_system_profile_event_FileOpen{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Newly Open-{{instance}}", "refId": "A" }, { "expr": "sum(rate(tiflash_system_profile_event_FileOpenFailed{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 1, "legendFormat": "Newly Open Failed-{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "File Open OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of currently opened file descriptors.\n(Only counting storage engine of TiFlash by now. 
Not including TiFlash-Proxy)", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, "id": 50, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tiflash_proxy_process_open_fds{job=\"tiflash\"}", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" }, { "expr": "sum(tiflash_system_current_metric_OpenFileForWrite{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Write File-{{instance}}", "refId": "B" }, { "expr": "sum(tiflash_system_current_metric_OpenFileForRead{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Read File-{{instance}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Opened File Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 6, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 2 }, 
"id": 9, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_coprocessor_request_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Request QPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 2 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_coprocessor_executor_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": 
"Executor QPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 11, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tiflash_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "999", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tiflash_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "80", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, 
"title": "Request Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 12, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_coprocessor_request_error{instance=~\"$instance\"}[1m])) by (reason)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{reason}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Error QPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 13, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, 
"min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tiflash_coprocessor_request_handle_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "999", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_coprocessor_request_handle_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_coprocessor_request_handle_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tiflash_coprocessor_request_handle_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "80", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Request Handle", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 16 }, "id": 14, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": 
false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_coprocessor_response_bytes{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Response Bytes/Seconds", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Coprocessor", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 16, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 34 }, "id": 17, "legend": { "alignAsTable": false, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tiflash_schema_version{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "Schema Version", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Executed DDL apply jobs per minute", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 34 }, "id": 18, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(increase(tiflash_schema_apply_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schema Apply OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "opm", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": 
"Executed DDL jobs per minute", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 41 }, "id": 19, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(increase(tiflash_schema_internal_ddl_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}", "refId": "A" }, { "expr": "sum(increase(tiflash_schema_internal_ddl_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "total", "refId": "B" }, { "expr": "sum(increase(tiflash_schema_internal_ddl_count{instance=~\"$instance\"}[1m])) by (type,instance)", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "{{type}} - {{instance}}", "refId": "C" }, { "expr": "sum(increase(tiflash_schema_internal_ddl_count{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "total - {{instance}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schema Internal DDL OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "opm", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, 
"gridPos": { "h": 7, "w": 12, "x": 12, "y": 41 }, "id": 20, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tiflash_schema_apply_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "999", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_schema_apply_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_schema_apply_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tiflash_schema_apply_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "80", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Schema Apply Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "DDL", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 25, 
"panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The total count of different kinds of commands received", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 4 }, "id": 41, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_storage_command_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" }, { "expr": "sum(rate(tiflash_system_profile_event_DMWriteBlock{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "write block", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write Command OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "cacheTimeout": null, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 4 }, "id": 38, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], 
"nullPointMode": "null as zero", "percentage": false, "pluginVersion": "6.1.6", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tiflash_storage_write_amplification{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "total-{{instance}}", "refId": "A" }, { "expr": "sum((rate(tiflash_system_profile_event_PSMWriteBytes{instance=~\"$instance\"}[5m]) + rate(tiflash_system_profile_event_WriteBufferFromFileDescriptorWriteBytes{instance=~\"$instance\"}[5m]) + rate(tiflash_system_profile_event_WriteBufferAIOWriteBytes{instance=~\"$instance\"}[5m])) / (rate(tiflash_system_profile_event_DMWriteBytes{instance=~\"$instance\"}[5m]))) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "5min-{{instance}}", "refId": "B" }, { "expr": "sum((rate(tiflash_system_profile_event_PSMWriteBytes{instance=~\"$instance\"}[10m]) + rate(tiflash_system_profile_event_WriteBufferFromFileDescriptorWriteBytes{instance=~\"$instance\"}[10m]) + rate(tiflash_system_profile_event_WriteBufferAIOWriteBytes{instance=~\"$instance\"}[10m])) / (rate(tiflash_system_profile_event_DMWriteBytes{instance=~\"$instance\"}[10m]))) by (instance)", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "10min-{{instance}}", "refId": "C" }, { "expr": "sum((rate(tiflash_system_profile_event_PSMWriteBytes{instance=~\"$instance\"}[30m]) + rate(tiflash_system_profile_event_WriteBufferFromFileDescriptorWriteBytes{instance=~\"$instance\"}[30m]) + rate(tiflash_system_profile_event_WriteBufferAIOWriteBytes{instance=~\"$instance\"}[30m])) / (rate(tiflash_system_profile_event_DMWriteBytes{instance=~\"$instance\"}[30m]))) by (instance)", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "30min-{{instance}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "Write Amplification", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Total number of storage engine read tasks", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 11 }, "id": 40, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_storage_read_tasks_count{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "instant": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read Tasks OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "ops", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": 
"${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 11 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/^RS Filter/", "yaxis": 2 }, { "alias": "/^PK/", "yaxis": 2 }, { "alias": "/^No Filter/", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg((rate(tiflash_system_profile_event_DMFileFilterAftPKAndPackSet{instance=~\"$instance\"}[1m]) - rate(tiflash_system_profile_event_DMFileFilterAftRoughSet{instance=~\"$instance\"}[1m])) / (rate(tiflash_system_profile_event_DMFileFilterAftPKAndPackSet{instance=~\"$instance\"}[1m]))) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "1min-{{instance}}", "refId": "B" }, { "expr": "avg((rate(tiflash_system_profile_event_DMFileFilterAftPKAndPackSet{instance=~\"$instance\"}[5m]) - rate(tiflash_system_profile_event_DMFileFilterAftRoughSet{instance=~\"$instance\"}[5m])) / (rate(tiflash_system_profile_event_DMFileFilterAftPKAndPackSet{instance=~\"$instance\"}[5m]))) by (instance)", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "5min-{{instance}}", "refId": "C" }, { "expr": "sum(rate(tiflash_system_profile_event_DMFileFilterNoFilter{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "hide": true, "instant": false, "intervalFactor": 1, "legendFormat": "No Filter-{{instance}}", "refId": "A" }, { "expr": "sum(rate(tiflash_system_profile_event_DMFileFilterAftPKAndPackSet{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "hide": true, "instant": false, "intervalFactor": 1, "legendFormat": "PK Filter-{{instance}}", "refId": "D" }, { "expr": 
"sum(rate(tiflash_system_profile_event_DMFileFilterAftRoughSet{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "RS Filter-{{instance}}", "refId": "E" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Rough Set Filter Rate", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "decimals": null, "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Total number of storage's internal sub tasks", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 18 }, "id": 39, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_storage_subtask_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Internal Tasks OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": 
null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Duration of storage's internal sub tasks", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 18 }, "id": 42, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "99-delta_merge", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,type))", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "999-{{type}}", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,type))", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "99-{{type}}", "refId": "B" }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,type))", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "95-{{type}}", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tiflash_storage_subtask_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,type))", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "80-{{type}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Internal Tasks Duration", "tooltip": { "shared": true, 
"sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Total number of storage's internal page gc tasks", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 25 }, "id": 43, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tiflash_storage_page_gc_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Page GC Tasks OPM", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "opm", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": 
"${DS_TEST-CLUSTER}", "description": "Duration of storage's internal page gc tasks", "gridPos": { "h": 7, "w": 12, "x": 12, "y": 25 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, "id": 44, "legend": { "show": true }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(delta(tiflash_storage_page_gc_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le)", "format": "heatmap", "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "B" } ], "timeFrom": null, "timeShift": null, "title": "Page GC Tasks Duration", "tooltip": { "show": true, "showHistogram": false }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": 0, "format": "s", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of different kinds of read operations", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 32 }, "id": 46, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_system_profile_event_PSMWriteIOCalls{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "Page", "refId": "A" }, { "expr": "sum(rate(tiflash_system_profile_event_PSMWriteCalls{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "Page Calls", "refId": "B" }, { "expr": 
"sum(rate(tiflash_system_profile_event_PSMWritePages{instance=~\"$instance\"}[1m]))", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "PageFile", "refId": "C" }, { "expr": "sum(rate(tiflash_system_profile_event_WriteBufferFromFileDescriptorWrite{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "File Descriptor", "refId": "D" }, { "expr": "sum(rate(tiflash_system_profile_event_WriteBufferAIOWrite{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "AIO", "refId": "F" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Write OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of different kinds of read operations", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 32 }, "id": 47, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_system_profile_event_PSMReadIOCalls{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "Page", "refId": "A" 
}, { "expr": "sum(rate(tiflash_system_profile_event_PSMReadCalls{instance=~\"$instance\"}[1m]))", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "Page Calls", "refId": "B" }, { "expr": "sum(rate(tiflash_system_profile_event_PSMReadPages{instance=~\"$instance\"}[1m]))", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "PageFile", "refId": "C" }, { "expr": "sum(rate(tiflash_system_profile_event_ReadBufferFromFileDescriptorRead{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "File Descriptor", "refId": "D" }, { "expr": "sum(rate(tiflash_system_profile_event_ReadBufferAIORead{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "AIO", "refId": "F" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Read OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow of different kinds of write operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 39 }, "height": "", "id": 60, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, 
"renderer": "flot", "repeatedByRow": true, "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_system_profile_event_WriteBufferFromFileDescriptorWriteBytes{instance=~\"$instance\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "File Descriptor", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_system_profile_event_PSMWriteBytes{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Page", "refId": "B" }, { "expr": "sum(rate(tiflash_system_profile_event_WriteBufferAIOWriteBytes{instance=~\"$instance\"}[1m]))", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "AIO", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow of different kinds of read operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 39 }, "height": "", "id": 59, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeatedByRow": true, 
"seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_system_profile_event_ReadBufferFromFileDescriptorReadBytes{instance=~\"$instance\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "File Descriptor", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_system_profile_event_PSMReadBytes{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "Page", "refId": "B" }, { "expr": "sum(rate(tiflash_system_profile_event_ReadBufferAIOReadBytes{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "AIO", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Storage", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 64, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The throughput of write and delta's background management", "fill": 1, "gridPos": { "h": 9, "w": 24, "x": 0, "y": 79 }, "height": "", "id": 70, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, 
"pointradius": 5, "points": false, "renderer": "flot", "repeatedByRow": true, "seriesOverrides": [ { "alias": "/total/", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_storage_throughput_bytes{instance=~\"$instance\", type=\"write\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "throughput_write", "refId": "A", "step": 10 }, { "expr": "sum(rate(tiflash_storage_throughput_bytes{instance=~\"$instance\", type!=\"write\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "throughput_delta-management", "refId": "B" }, { "expr": "sum(tiflash_storage_throughput_bytes{instance=~\"$instance\", type=\"write\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "total_write", "refId": "C" }, { "expr": "sum(tiflash_storage_throughput_bytes{instance=~\"$instance\", type!=\"write\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "total_delta-management", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write & Delta Management Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The stall duration of write and delete range", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 88 }, "id": 62, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, 
"linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "99-delta_merge", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tiflash_storage_write_stall_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, type, instance))", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "99-{{type}}-{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write Stall Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "cacheTimeout": null, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The current processing number of segments' background management", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 88 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pluginVersion": "6.1.6", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tiflash_system_current_metric_DT_DeltaMerge{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "delta_merge-{{instance}}", 
"refId": "A" }, { "expr": "avg(tiflash_system_current_metric_DT_SegmentSplit{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 1, "legendFormat": "seg_split-{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Current Data Management Tasks", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Storage Write Stall", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 34, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 6 }, "id": 35, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_raft_read_index_count{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read Index OPS", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { 
"decimals": null, "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 6 }, "id": 36, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tiflash_raft_read_index_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "999", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_raft_read_index_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_raft_read_index_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tiflash_raft_read_index_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "80", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read Index Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, 
"yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 13 }, "id": 37, "legend": { "alignAsTable": false, "avg": false, "current": false, "max": false, "min": false, "rightSide": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999, sum(rate(tiflash_raft_wait_index_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "999", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tiflash_raft_wait_index_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99", "refId": "B" }, { "expr": "histogram_quantile(0.95, sum(rate(tiflash_raft_wait_index_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95", "refId": "C" }, { "expr": "histogram_quantile(0.80, sum(rate(tiflash_raft_wait_index_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "80", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Wait Index Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": 
[ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of currently applying snapshots.", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 13 }, "id": 75, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tiflash_system_current_metric_RaftNumSnapshotsPendingApply{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Pending-{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Applying snapshots Count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "Duration of pre-decode when applying region 
snapshot", "gridPos": { "h": 7, "w": 12, "x": 0, "y": 20 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, "id": 72, "legend": { "show": true }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(delta(tiflash_raft_command_duration_seconds_bucket{instance=~\"$instance\", type=\"snapshot_predecode\"}[1m])) by (le)", "format": "heatmap", "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "B" } ], "timeFrom": null, "timeShift": null, "title": "Snapshot Predecode Duration", "tooltip": { "show": true, "showHistogram": true }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": 0, "format": "s", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "Duration of flush when applying region snapshot", "gridPos": { "h": 7, "w": 12, "x": 12, "y": 20 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, "id": 73, "legend": { "show": true }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(delta(tiflash_raft_command_duration_seconds_bucket{instance=~\"$instance\", type=\"snapshot_flush\"}[1m])) by (le)", "format": "heatmap", "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "B" } ], "timeFrom": null, "timeShift": null, "title": "Snapshot Flush Duration", "tooltip": { "show": true, "showHistogram": true }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": 0, "format": "s", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { 
"aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys flow of different kinds of Raft operations", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 27 }, "height": "", "id": 71, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeatedByRow": true, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tiflash_raft_process_keys{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Keys flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateSpectral", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "Duration of ingesting SST", "gridPos": { "h": 7, "w": 12, "x": 12, "y": 27 }, "heatmap": {}, "hideZeroBuckets": true, "highlightCards": true, "id": 74, "legend": { "show": true }, "links": [], "reverseYBuckets": 
false, "targets": [ { "expr": "sum(delta(tiflash_raft_command_duration_seconds_bucket{instance=~\"$instance\", type=\"ingest_sst\"}[1m])) by (le)", "format": "heatmap", "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "B" } ], "timeFrom": null, "timeShift": null, "title": "Ingest SST Duration", "tooltip": { "show": true, "showHistogram": true }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": 0, "format": "s", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null } ], "repeat": null, "title": "Raft", "type": "row" } ], "refresh": "30s", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "label_values(tiflash_system_profile_event_Query, instance)", "hide": 0, "includeAll": true, "label": "Instance", "multi": true, "name": "instance", "options": [], "query": "label_values(tiflash_system_profile_event_Query, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "Test-Cluster-TiFlash-Summary", "uid": "SVbh2xUWk", "version": 2 } ================================================ FILE: scripts/tikv_details.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": 
"graph", "name": "Graph", "version": "" }, { "type": "panel", "id": "heatmap", "name": "Heatmap", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" }, { "type": "panel", "id": "singlestat", "name": "Singlestat", "version": "" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1577960059869, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 2742, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The storage size per TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 }, "id": 56, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_engine_size_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": 
null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The available capacity size of each TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 8, "y": 1 }, "id": 1706, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_store_size_bytes{instance=~\"$instance\", type=\"available\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Available size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The capacity size per TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 1 }, "id": 1707, "legend": { 
"alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_store_size_bytes{instance=~\"$instance\", type=\"capacity\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Capacity size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU usage of each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 1708, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The memory usage per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 1709, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(process_resident_memory_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", 
"label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The I/O utilization per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, "id": 1710, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IO utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total bytes of read and write in each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, "id": 
1711, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"kv\", type=\"wal_file_bytes\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}-write", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"kv\", type=~\"bytes_read|iter_bytes_read\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}-read", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "MBps", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The QPS per command in each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }, "id": 1713, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": 
"current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (instance,type)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of the gRPC message failures", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }, "id": 1712, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_fail_total{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (instance)", 
"format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}-grpc-msg-fail", "refId": "A", "step": 10 }, { "expr": "sum(delta(tikv_pd_heartbeat_message_total{instance=~\"$instance\", type=\"noop\"}[1m])) by (instance) < 1", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-pd-heartbeat", "refId": "B" }, { "expr": "sum(rate(tikv_critical_error_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Errps", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe number of leaders on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 }, "id": 1715, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}) by (instance)", 
"format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 }, { "expr": "delta(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[30s]) < -10", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Regions on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 }, "id": 1714, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"region\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": 
"cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "TiKV uptime since the last restart", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 41 }, "id": 4106, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(time() - process_start_time_seconds{job=\"tikv\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Uptime", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "dtdurations", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Cluster", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 2743, "panels": [ { "alert": 
{ "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "Critical error alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 2 }, "id": 2741, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_critical_error_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "A" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Critical error", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "Indicates occurrences of events that make the TiKV instance unavailable temporarily, 
such as Write Stall, Channel Full, Scheduler Busy, and Coprocessor Full", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 1584, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_too_busy_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "scheduler-{{instance}}", "metric": "", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "channelfull-{{instance}}-{{type}}", "metric": "", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_coprocessor_request_error{instance=~\"$instance\", type='full'}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "coprocessor-{{instance}}", "metric": "", "refId": "C", "step": 4 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile99\", db=~\"$db\"}) by (instance, db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "stall-{{instance}}-{{db}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Server is busy", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, 
"max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "10s", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "10s", "handler": 1, "message": "TiKV server report failures", "name": "server report failures alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of reporting failure messages", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 18, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_server_report_failure_msg_total{instance=~\"$instance\"}[1m])) by (type,instance,store_id)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}} - to - {{store_id}}", "metric": "tikv_server_raft_store_msg_total", "refId": "A", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Server report failures", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": 
"graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different raftstore errors on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 1718, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_storage_engine_async_request_total{instance=~\"$instance\", status!~\"success|all\"}[1m])) by (instance, status)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{status}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raftstore error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, 
"bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of scheduler errors per type on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 16 }, "id": 1719, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_stage_total{instance=~\"$instance\", stage=~\"snapshot_err|prepare_write_err\"}[1m])) by (instance, stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{stage}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different coprocessor errors on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 1720, "legend": { "alignAsTable": true, "avg": false, "current": 
true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_request_error{instance=~\"$instance\"}[1m])) by (instance, reason)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{reason}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of gRPC message errors per type on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 1721, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": 
false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_fail_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of dropped leaders per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "id": 1722, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader drop", "tooltip": { 
"msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of missing leaders per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "id": 1723, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_leader_missing{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader missing", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": 
null } } ], "repeat": null, "title": "Errors", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 2744, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The size of each column family", "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 3 }, "id": 33, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_engine_size_bytes{instance=~\"$instance\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CF size", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The storage size per TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 }, "id": 1705, "legend": { "alignAsTable": true, "avg": false, "current": true, 
"max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_engine_size_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 }, "params": [ "A", "10s", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "10s", "handler": 1, "message": "TiKV channel full", "name": "TiKV channel full alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of channel full errors on each TiKV instance", "editable": true, "error": false, "fill": 3, 
"grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 11 }, "id": 22, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Channel full", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of leaders being written on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 11 }, "id": 75, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, 
"links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_region_written_keys_count{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_region_written_keys_bucket", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Active written leaders", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1073741824 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "B", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "approximate region size alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The approximate Region size", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 19 }, "id": 1481, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as 
zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_raftstore_region_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_region_size_count{instance=~\"$instance\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Approximate Region size", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 19 }, "id": 3638, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": false, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": false, "linewidth": 1, "links": [], 
"nullPointMode": "null as zero", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Approximate Region size Histogram", "tooltip": { "msResolution": false, "shared": false, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "histogram", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The average rate of writing bytes to Regions per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 27 }, "id": 58, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_region_written_bytes_sum[1m])) by (instance) / sum(rate(tikv_region_written_bytes_count[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"{{instance}}", "metric": "tikv_regi", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region average written bytes", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateOranges", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "timeseries", "datasource": "${DS_TEST-CLUSTER}", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 27 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, "id": 3636, "legend": { "show": false }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(rate(tikv_region_written_bytes_bucket[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_regi", "refId": "A", "step": 10 } ], "timeFrom": null, "timeShift": null, "title": "Region written bytes", "tooltip": { "show": true, "showHistogram": false }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": null, "format": "decbytes", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The average rate of written keys to Regions per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 35 
}, "id": 57, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_region_written_keys_sum{instance=~\"$instance\"}[1m])) by (instance) / sum(rate(tikv_region_written_keys_count{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_region_written_keys_bucket", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region average written keys", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateOranges", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "timeseries", "datasource": "${DS_TEST-CLUSTER}", "gridPos": { "h": 8, "w": 12, "x": 12, "y": 35 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, "id": 3637, "legend": { "show": false }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(rate(tikv_region_written_keys_bucket{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": 
"tikv_region_written_keys_bucket", "refId": "A", "step": 10 } ], "timeFrom": null, "timeShift": null, "title": "Region written keys", "tooltip": { "show": true, "showHistogram": false }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": null, "format": "short", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The ratio of request batch output to input per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 43 }, "id": 3718, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_server_request_batch_ratio_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_server_request_batch_ratio_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "{{type}} avg", "refId": "B" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_server_request_batch_ratio_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}} 99", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Request batch ratio", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, 
"yaxes": [ { "format": "short", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The size of requests into request batch per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 43 }, "id": 3720, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_server_request_batch_size_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_server_request_batch_size_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}} avg", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_server_request_batch_size_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{type}} 99", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Request batch input", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], 
"repeat": null, "title": "Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 2745, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different kinds of gRPC message", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "id": 95, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_grpc_msg_duration_seconds_bucket", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different kinds of gRPC message which is failed", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "id": 107, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, 
"min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_fail_total{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_grpc_msg_fail_total", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message failed", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The execution time of gRPC message", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, "id": 98, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_grpc_msg_duration_seconds_bucket{instance=~\"$instance\", 
type!=\"kv_gc\"}[1m])) by (le, type))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% gRPC message duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, "id": 2532, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_grpc_msg_duration_seconds_count[1m])) by (type)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Average gRPC message duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null,
"logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 20 }, "id": 2533, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_server_grpc_req_batch_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "99% request", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_server_grpc_resp_batch_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99% response", "refId": "B" }, { "expr": "sum(rate(tikv_server_grpc_req_batch_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_server_grpc_req_batch_size_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg request", "refId": "C" }, { "expr": "sum(rate(tikv_server_grpc_resp_batch_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_server_grpc_resp_batch_size_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg response", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC batch size", "tooltip": { 
"shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 20 }, "id": 2534, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_server_raft_message_batch_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "99%", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_server_raft_message_batch_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_server_raft_message_batch_size_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "raft message batch size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { 
"format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "gRPC", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 2746, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 1.7 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 20 }, "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "60s", "handler": 1, "message": "TiKV raftstore thread CPU usage is high", "name": "TiKV raft store CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of raftstore thread", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 5 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": 
"tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.85 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft store CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1.8 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV async apply thread CPU usage is high", "name": "TiKV async apply CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of async apply", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 5 }, "id": 79, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"apply_[0-9]+\"}[1m])) by 
(instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Async apply CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV scheduler worker thread CPU usage is high", "name": "TiKV scheduler worker CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of scheduler worker", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 12 }, "id": 64, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, 
"steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"sched_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV gRPC poll thread CPU usage is high", "name": "TiKV gRPC poll CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of gRPC", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 12 }, "id": 105, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": 
[], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"grpc.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV unified read pool thread CPU usage is high", "name": "Unified read pool CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of the unified read pool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 19 }, "id": 4287, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, 
"pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"unified_read_po*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 7.2 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Unified read pool CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV Storage ReadPool thread CPU usage is high", "name": "TiKV Storage ReadPool CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of readpool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 19 }, "id": 1908, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, 
"total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_norm.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_high.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "metric": "tikv_thread_cpu_seconds_total", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_low.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "metric": "tikv_thread_cpu_seconds_total", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 3.6 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage ReadPool CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], 
"executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV Coprocessor thread CPU alert", "name": "TiKV Coprocessor CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of coprocessor", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 26 }, "id": 78, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_normal.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_high.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_low.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 7.2 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" 
}, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of RocksDB", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 26 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"rocksdb.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "warning", "fill": true, "line": true, "op": "gt", "value": 1 }, { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 4 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, 
"min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of split check", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 33 }, "id": 68, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"split_check\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Split check CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 33 }, "id": 2531, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": 
"current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"gc_worker.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC worker CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of snapshot worker", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 40 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"snapshot_worker\"}[1m])) by (instance)", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Snapshot worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Thread CPU", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 2747, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of requests that TiKV sends to PD", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, "id": 1069, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_pd_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD requests", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": 
true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by requests that TiKV sends to PD", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, "id": 1070, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_pd_request_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_pd_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD request duration (average)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of PD heartbeat 
messages", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, "id": 1215, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_pd_heartbeat_message_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD heartbeats", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of peers validated by the PD worker", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, "id": 1396, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tikv_pd_validate_peer_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD validate peers", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "PD", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 2748, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft applies log", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }, "id": 31, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_raftstore_apply_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_apply_log_duration_seconds_count{instance=~\"$instance\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply log duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed for Raft to apply logs per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }, "id": 32, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": " {{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply log duration 
per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft appends log", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }, "id": 39, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_raftstore_append_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_append_log_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], 
"timeShift": null, "title": "Append log duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft appends log on each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }, "id": 40, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} ", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Append log duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], 
"yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft commits log", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "id": 3690, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B" }, { "expr": "sum(rate(tikv_raftstore_commit_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_commit_log_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit log duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": 
false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when Raft commits log on each TiKV instance", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, "id": 3688, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit log duration per server", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft IO", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 2749, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different ready type of Raft", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 8 }, "id": 5, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, 
"total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_raft_ready_handled_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_raftstore_raft_ready_handled_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_raftstore_raft_process_duration_secs_count{instance=~\"$instance\", type=\"ready\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "count", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ready handled", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed for peer processes to be ready in Raft", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 8 }, "id": 118, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 
10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_raft_process_duration_secs_bucket{instance=~\"$instance\", type='ready'}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 1 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Process ready duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by raftstore events (P99)", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 16 }, "id": 123, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_event_duration_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", 
"fill": true, "line": true, "op": "gt", "value": 1 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "0.99 Duration of raft store events", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft process", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, "id": 2750, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Raft messages sent by each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 1615, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_raft_sent_message_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Sent messages per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", 
"name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Raft messages flushed by each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 1616, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_server_raft_message_flush_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_server_raft_message_flush_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Flush messages per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, 
"description": "The number of Raft messages received by each TiKV instance", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 106, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_server_raft_message_recv_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Receive messages per server", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different types of Raft messages that are sent", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 16 }, "id": 11, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": 
false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_raft_sent_message_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Messages", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of vote messages that are sent in Raft", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 25, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_raft_sent_message_total{instance=~\"$instance\", type=\"vote\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Vote", "tooltip": { "msResolution": false, "shared": true, 
"sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of dropped Raft messages per type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 1309, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_raft_dropped_message_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft dropped messages", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft message", "type": "row" }, { "collapsed": true, "gridPos": 
{ "h": 1, "w": 24, "x": 0, "y": 9 }, "id": 2751, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The proposal count of all Regions in a mio tick", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 10 }, "id": 108, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_proposal_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft proposals per ready", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of proposals per type", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 10 }, "id": 7, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, 
"sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_proposal_total{instance=~\"$instance\", type=~\"local_read|normal|read_index|batch\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_raftstore_proposal_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft read/write proposals", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of read proposals which are made by each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 17 }, "id": 119, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tikv_raftstore_proposal_total{instance=~\"$instance\", type=~\"local_read|read_index\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft read proposals per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of write proposals which are made by each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 17 }, "id": 120, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_proposal_total{instance=~\"$instance\", type=\"normal\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_raftstore_proposal_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft write proposals per server", "tooltip": { "msResolution": false, 
"shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The wait time of each proposal", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 24 }, "id": 41, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_raftstore_request_wait_time_duration_secs_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_request_wait_time_duration_secs_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "Propose wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The wait time of each proposal in each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 24 }, "id": 42, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Propose wait duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }, "id": 2535, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_raftstore_apply_wait_time_duration_secs_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_apply_wait_time_duration_secs_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, 
"logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }, "id": 2536, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply wait duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The rate at which peers propose logs", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 38 }, "id": 1975, "legend": { "alignAsTable": true, "avg": false, "current": false, "max": true, "min": true, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, 
"values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(rate(tikv_raftstore_propose_log_size_sum{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft log speed", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "short", "label": "bytes/s", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 38 }, "id": 4375, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_store_perf_context_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "store-{{type}}", "metric": "tikv_raftstore_request_wait_time_duration_secs_bucket", "refId": "A", "step": 4 }, { "expr": 
"histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_perf_context_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "apply-{{type}}", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Perf Context duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft propose", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 10 }, "id": 2752, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of admin proposals", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 11 }, "id": 76, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_proposal_total{instance=~\"$instance\", type=~\"conf_change|transfer_leader\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_raftstore_proposal_total", "refId": "A", "step": 4 } ], "thresholds": [], 
"timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Admin proposals", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of the processed apply command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 11 }, "id": 77, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_admin_cmd_total{instance=~\"$instance\", status=\"success\", type!=\"compact\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_raftstore_admin_cmd_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Admin apply", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, 
"logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of raftstore split checksss", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 18 }, "id": 70, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_check_split_total{instance=~\"$instance\", type!=\"ignore\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_raftstore_check_split_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Check split", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when running split check in .9999", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 18 }, "id": 71, "legend": { "alignAsTable": 
true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, sum(rate(tikv_raftstore_check_split_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_raftstore_check_split_duration_seconds_bucket", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99.99% Check split duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Raft admin", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 11 }, "id": 2753, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of rejections from the local read thread and the total number of requests", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 12 }, "id": 2292, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": 
[], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "/.*-total/i", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_raftstore_local_read_reject_total{instance=~\"$instance\"}[1m])) by (instance, reason)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-reject-by-{{reason}}", "refId": "A" }, { "expr": "sum(rate(tikv_raftstore_local_read_executed_requests{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-total", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Local reader requests", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Local reader", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }, "id": 4200, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time used by each level in the unified read pool per second. 
Level 0 refers to small queries.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 }, "id": 4194, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_multilevel_level_elapsed{instance=~\"$instance\", name=\"unified-read-pool\"}[1m])) by (level)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{level}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Time used by level", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The chance that level 0 (small) tasks are scheduled in the unified read pool.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 }, "id": 4196, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tikv_multilevel_level0_chance{instance=~\"$instance\", 
name=\"unified-read-pool\"}", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Level 0 chance", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of concurrently running tasks in the unified read pool.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "id": 4198, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(avg_over_time(tikv_unified_read_pool_running_tasks[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Running tasks", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { 
"align": false, "alignLevel": null } } ], "title": "Unified Read Pool", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 12 }, "id": 2754, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total count of different kinds of commands received", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 }, "id": 2, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_storage_command_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage command total", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of engine asynchronous request errors", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": 
{ "h": 8, "w": 12, "x": 12, "y": 13 }, "id": 8, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_storage_engine_async_request_total{instance=~\"$instance\", status!~\"all|success\"}[1m])) by (status)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{status}}", "metric": "tikv_raftstore_raft_process_duration_secs_bucket", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage async request error", "tooltip": { "msResolution": true, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by processing asynchronous snapshot requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "id": 15, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], 
"nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{instance=~\"$instance\", type=\"snapshot\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{instance=~\"$instance\", type=\"snapshot\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage async snapshot duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by processing asynchronous write requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, "id": 109, "legend": { "alignAsTable": true, "avg": false, "current": true, 
"hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"write\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"write\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{instance=~\"$instance\", type=\"write\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{instance=~\"$instance\", type=\"write\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage async write duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Storage", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 13 }, "id": 2755, "panels": [ { "aliasColors": {}, "bars": false, 
"dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total writing bytes of commands on each stage", "fill": 1, "gridPos": { "h": 10, "w": 12, "x": 12, "y": 14 }, "height": "400", "id": 3834, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_scheduler_writing_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler writing bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of commands on each stage", "fill": 1, "gridPos": { "h": 10, "w": 12, "x": 0, "y": 14 }, "height": "400", "id": 167, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, 
"nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_too_busy_total{instance=~\"$instance\"}[1m])) by (stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "busy", "refId": "A", "step": 20 }, { "expr": "sum(rate(tikv_scheduler_stage_total{instance=~\"$instance\"}[1m])) by (stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{stage}}", "refId": "B", "step": 20 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler stage total", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different priority commands", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 24 }, "height": "", "id": 1, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_commands_pri_total{instance=~\"$instance\"}[1m])) by (priority)", 
"format": "time_series", "intervalFactor": 2, "legendFormat": "{{priority}}", "metric": "", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler priority commands", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 300 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "120s", "handler": 1, "message": "TiKV scheduler context total", "name": "scheduler pending commands alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of pending commands per TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 24 }, "height": "", "id": 193, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_scheduler_contex_total{instance=~\"$instance\"}) by (instance)", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 40 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 300 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler pending commands", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Scheduler", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 14 }, "id": 2756, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of commands on each stage in commit command", "fill": 1, "gridPos": { "h": 10, "w": 24, "x": 0, "y": 15 }, "height": "400", "id": 168, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 1, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_too_busy_total{instance=~\"$instance\", type=\"$command\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "busy", "refId": "A", "step": 4 }, { "expr": 
"sum(rate(tikv_scheduler_stage_total{instance=~\"$instance\", type=\"$command\"}[1m])) by (stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{stage}}", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler stage total", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }, "id": 3, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_command_duration_seconds_bucket{instance=~\"$instance\", 
type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_scheduler_command_duration_seconds_sum{instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_command_duration_seconds_count{instance=~\"$instance\", type=\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler command duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time which is caused by latch wait in commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }, "id": 194, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_scheduler_latch_wait_duration_seconds_sum{instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_latch_wait_duration_seconds_count{instance=~\"$instance\", type=\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler latch wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of keys read by a commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 }, "id": 195, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], 
"nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_kv_command_key_read_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "kv_command_key", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_kv_command_key_read_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_scheduler_kv_command_key_read_sum{instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_kv_command_key_read_count{instance=~\"$instance\", type=\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler keys read", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of keys written by a commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 }, "id": 373, 
"legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_kv_command_key_write_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "kv_command_key", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_kv_command_key_write_bucket{instance=~\"$instance\", type=\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_scheduler_kv_command_key_write_sum{instance=~\"$instance\", type=\"$command\"}[1m])) / sum(rate(tikv_scheduler_kv_command_key_write_count{instance=~\"$instance\", type=\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler keys written", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { 
"aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys scan details of each CF when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 41 }, "id": 560, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_kv_scan_details{instance=~\"$instance\", req=\"$command\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler scan details", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys scan details of lock CF when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 41 }, "id": 
675, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_kv_scan_details{instance=~\"$instance\", req=\"$command\", cf=\"lock\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler scan details [lock]", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys scan details of write CF when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 49 }, "id": 829, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, 
"links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_kv_scan_details{instance=~\"$instance\", req=\"$command\", cf=\"write\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler scan details [write]", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The keys scan details of default CF when executing commit command", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 49 }, "id": 830, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "commit", "value": "commit" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, 
"targets": [ { "expr": "sum(rate(tikv_scheduler_kv_scan_details{instance=~\"$instance\", req=\"$command\", cf=\"default\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler scan details [default]", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": "command", "title": "Scheduler - $command", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 15 }, "id": 2758, "panels": [ { "aliasColors": {}, "bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of versions for each key", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 19 }, "id": 26, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": false, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": false, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "paceLength": 10, "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_storage_mvcc_versions_bucket{instance=~\"$instance\"}[5m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": " max", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "MVCC versions", "tooltip": { "msResolution": false, "shared": false, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "histogram", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "cards": { "cardPadding": null, "cardRound": null }, "color": { "cardColor": "#b4ff00", "colorScale": "sqrt", "colorScheme": "interpolateOranges", "exponent": 0.5, "mode": "spectrum" }, "dataFormat": "timeseries", "datasource": "${DS_TEST-CLUSTER}", "description": "The number of versions deleted by GC for each key", "gridPos": { "h": 7, "w": 12, "x": 12, "y": 19 }, "heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, "id": 559, "legend": { "show": false }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(rate(tikv_storage_mvcc_gc_delete_versions_bucket{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": " max", "metric": "", "refId": "A", "step": 4 } ], "timeFrom": null, "timeShift": null, "title": "MVCC delete versions", "tooltip": { "show": true, "showHistogram": false }, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": null, "format": "short", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "auto", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of GC tasks processed by gc_worker", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 26 }, "id": 121, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": 
false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_gcworker_gc_tasks_vec{instance=~\"$instance\"}[1m])) by (task)", "format": "time_series", "intervalFactor": 2, "legendFormat": "total-{{task}}", "metric": "tikv_storage_command_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_storage_gc_skipped_counter{instance=~\"$instance\"}[1m])) by (task)", "format": "time_series", "intervalFactor": 2, "legendFormat": "skipped-{{task}}", "metric": "tikv_storage_gc_skipped_counter", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_gcworker_gc_task_fail_vec{instance=~\"$instance\"}[1m])) by (task)", "format": "time_series", "intervalFactor": 2, "legendFormat": "failed-{{task}}", "refId": "C" }, { "expr": "sum(rate(tikv_gc_worker_too_busy{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "gcworker-too-busy", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC tasks", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing GC tasks", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 26 }, "id": 
2224, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tikv_gcworker_gc_task_duration_vec_bucket{instance=~\"$instance\"}[1m])) by (le, task))", "format": "time_series", "intervalFactor": 2, "legendFormat": "max-{{task}}", "metric": "tikv_storage_command_total", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_gcworker_gc_task_duration_vec_bucket{instance=~\"$instance\"}[1m])) by (le, task))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%-{{task}}", "metric": "tikv_storage_gc_skipped_counter", "refId": "B", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_gcworker_gc_task_duration_vec_bucket{instance=~\"$instance\"}[1m])) by (le, task))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%-{{task}}", "refId": "C" }, { "expr": "sum(rate(tikv_gcworker_gc_task_duration_vec_sum{instance=~\"$instance\"}[1m])) by (task) / sum(rate(tikv_gcworker_gc_task_duration_vec_count{instance=~\"$instance\"}[1m])) by (task)", "format": "time_series", "intervalFactor": 2, "legendFormat": "average-{{task}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC tasks duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, 
"min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of keys in write CF affected during GC", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 33 }, "id": 2225, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_gcworker_gc_keys{instance=~\"$instance\", cf=\"write\"}[1m])) by (tag)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "tikv_storage_command_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC keys (write CF)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "The count of TiDB GC worker actions", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 33 }, "id": 966, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, 
"sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tidb_tikvclient_gc_worker_actions_total[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TiDB GC worker actions", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The GC duration", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 40 }, "id": 969, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1.0, sum(rate(tidb_tikvclient_gc_seconds_bucket[1m])) by (instance, le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 40 } ], 
"thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TiDB GC seconds", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "keys / second", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 40 }, "id": 2589, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_storage_mvcc_gc_delete_versions_sum[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "keys/s", "refId": "E" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC speed", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, 
"bars": true, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 47 }, "id": 2819, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": false, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(max_over_time(tikv_gcworker_autogc_status{instance=~\"$instance\", state=\"working\"}[1m])) by (instance)", "format": "time_series", "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_storage_command_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TiKV AutoGC Working", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "Progress of ResolveLocks, the first phase of GC", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 47 }, "id": 2823, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], 
"nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "max(tidb_tikvclient_range_task_stats{type=~\"resolve-locks.*\"}) by (result)", "format": "time_series", "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": "{{result}}", "metric": "tikv_storage_command_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "ResolveLocks Progress", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "Progress of TiKV's GC", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 54 }, "id": 2821, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_gcworker_autogc_processed_regions{instance=~\"$instance\", type=\"scan\"}) by (instance) / sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"region\"}) by (instance)", "format": "time_series", "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": 
"{{instance}}", "metric": "tikv_storage_command_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TiKV Auto GC Progress", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "percentunit", "label": null, "logBase": 1, "max": "1.1", "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "SafePoint used for TiKV's Auto GC", "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 54 }, "id": 2822, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_gcworker_autogc_safe_point) by (instance) / (2^18)", "format": "time_series", "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_storage_command_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "TiKV Auto GC SafePoint", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "dateTimeAsIso", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": " \tThe lifetime of TiDB GC", "editable": true, "error": false, "format": "s", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 6, "x": 0, "y": 61 }, "id": 27, "interval": null, "links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "max(tidb_tikvclient_gc_config{type=\"tikv_gc_life_time\"})", "format": "time_series", "interval": "", "intervalFactor": 2, "refId": "A", "step": 60 } ], "thresholds": "", "title": "GC lifetime", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" }, { "cacheTimeout": null, "colorBackground": false, "colorValue": false, "colors": [ "rgba(245, 54, 54, 0.9)", "rgba(237, 129, 40, 0.89)", "rgba(50, 172, 45, 0.97)" ], "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "description": "The interval of TiDB GC", "editable": true, "error": false, "format": "s", "gauge": { "maxValue": 100, "minValue": 0, "show": false, "thresholdLabels": false, "thresholdMarkers": true }, "gridPos": { "h": 7, "w": 6, "x": 6, "y": 61 }, "id": 28, "interval": null, 
"links": [], "mappingType": 1, "mappingTypes": [ { "name": "value to text", "value": 1 }, { "name": "range to text", "value": 2 } ], "maxDataPoints": 100, "nullPointMode": "null", "nullText": null, "postfix": "", "postfixFontSize": "50%", "prefix": "", "prefixFontSize": "50%", "rangeMaps": [ { "from": "null", "text": "N/A", "to": "null" } ], "sparkline": { "fillColor": "rgba(31, 118, 189, 0.18)", "full": false, "lineColor": "rgb(31, 120, 193)", "show": false }, "tableColumn": "", "targets": [ { "expr": "max(tidb_tikvclient_gc_config{type=\"tikv_gc_run_interval\"})", "format": "time_series", "intervalFactor": 2, "refId": "A", "step": 60 } ], "thresholds": "", "title": "GC interval", "type": "singlestat", "valueFontSize": "80%", "valueMaps": [ { "op": "=", "text": "N/A", "value": "null" } ], "valueName": "current" } ], "repeat": null, "title": "GC", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 16 }, "id": 2759, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The rate of Raft snapshot messages sent", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 0, "y": 20 }, "id": 35, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(tikv_raftstore_raft_sent_message_total{instance=~\"$instance\", type=\"snapshot\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": " ", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": 
"Rate snapshot message", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when handling snapshots", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 8, "y": 20 }, "id": 36, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_server_send_snapshot_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "send", "refId": "A", "step": 60 }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_snapshot_duration_seconds_bucket{instance=~\"$instance\", type=\"apply\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "apply", "refId": "B", "step": 60 }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_snapshot_duration_seconds_bucket{instance=~\"$instance\", type=\"generate\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "generate", "refId": "C", "step": 60 } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "99% Handle snapshot duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of snapshots in different states", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 8, "x": 16, "y": 20 }, "id": 38, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": true, "targets": [ { "expr": "sum(tikv_raftstore_snapshot_traffic_total{instance=~\"$instance\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "", "refId": "A", "step": 60 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Snapshot state count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, 
"alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The snapshot size (P99.99).9999", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 27 }, "id": 44, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, sum(rate(tikv_snapshot_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "size", "metric": "tikv_snapshot_size_bucket", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99.99% Snapshot size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of KV within a snapshot in .9999", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 27 }, "id": 43, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, 
"sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, sum(rate(tikv_snapshot_kv_count_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "count", "metric": "tikv_snapshot_kv_count_bucket", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99.99% Snapshot KV count", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Snapshot", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 17 }, "id": 2760, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of tasks handled by worker", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "id": 59, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": 
[], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_worker_handled_task_total{instance=~\"$instance\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Worker handled tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tCurrent pending and running tasks of worker", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 21 }, "id": 1395, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_worker_pending_task_total{instance=~\"$instance\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Worker pending tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": 
"individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of tasks handled by future_pool", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }, "id": 1876, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_futurepool_handled_task_total{instance=~\"$instance\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "FuturePool handled tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": 
"${DS_TEST-CLUSTER}", "decimals": 1, "description": "Current pending and running tasks of future_pool", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }, "id": 1877, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_futurepool_pending_task_total{instance=~\"$instance\"}[1m])) by (name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "FuturePool pending tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Task", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 18 }, "id": 2757, "panels": [ { "cards": { "cardPadding": 0, "cardRound": 0 }, "color": { "cardColor": "#5195ce", "colorScale": "linear", "colorScheme": "interpolateBlues", "exponent": 0.5, "min": 0, "mode": "spectrum" }, "dataFormat": "tsbuckets", "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed to handle coprocessor read requests", "gridPos": { "h": 8, "w": 12, "x": 0, "y": 22 }, 
"heatmap": {}, "hideZeroBuckets": false, "highlightCards": true, "id": 3062, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "links": [], "reverseYBuckets": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le)", "format": "heatmap", "instant": false, "intervalFactor": 2, "legendFormat": "{{le}}", "refId": "A" } ], "title": "Request duration", "tooltip": { "show": true, "showHistogram": true }, "tooltipDecimals": 1, "type": "heatmap", "xAxis": { "show": true }, "xBucketNumber": null, "xBucketSize": null, "yAxis": { "decimals": 1, "format": "s", "logBase": 1, "max": null, "min": null, "show": true, "splitFactor": null }, "yBucketBound": "upper", "yBucketNumber": null, "yBucketSize": null }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 22 }, "id": 16, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-100%", "refId": "E" }, { "expr": "histogram_quantile(0.99, 
sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Request duration", "tooltip": { "msResolution": false, "shared": true, "sort": 1, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "id": 74, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (req)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{req}}", "metric": "tikv_coprocessor_request_error", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total Requests", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", 
"name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "ops", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "id": 3128, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_request_error{instance=~\"$instance\"}[1m])) by (reason)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{reason}}", "metric": "tikv_coprocessor_request_error", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total Request Errors", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, 
"grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 37 }, "id": 52, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_scan_keys_sum{instance=~\"$instance\"}[1m])) by (req)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total KV Cursor Operations", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 37 }, "id": 3129, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { 
"expr": "histogram_quantile(1, avg(rate(tikv_coprocessor_scan_keys_bucket{instance=~\"$instance\"}[1m])) by (le, req)) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "100%-{{req}}", "refId": "D" }, { "expr": "histogram_quantile(0.99, avg(rate(tikv_coprocessor_scan_keys_bucket{instance=~\"$instance\"}[1m])) by (le, req)) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%-{{req}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV Cursor Operations", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 44 }, "id": 2118, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "key_skipped", "yaxis": 2 } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_rocksdb_perf{instance=~\"$instance\" ,metric=\"internal_delete_skipped_count\"}[1m])) by (req)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": 
"delete_skipped-{{req}}", "metric": "scan_details", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total RocksDB Perf Statistics", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "decimals": null, "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 44 }, "id": 551, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_response_bytes{instance=~\"$instance\"}[1m]))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "size", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total Response Size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", 
"show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Coprocessor Overview", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 19 }, "id": 3197, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when handling coprocessor requests", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 113, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_handle_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-100%", "refId": "E" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_handle_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Handle duration", "tooltip": { "msResolution": false, "shared": true, "sort": 1, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": "", "logBase": 1, "max": null, "min": "0", "show": true }, { "decimals": 1, "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed to handle coprocessor requests per TiKV instance (P95)", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 117, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_handle_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{req}}", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "95% Handle duration by store", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed while coprocessor requests wait to be handled", "editable": true, "error": false, "fill": 1, 
"grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "id": 111, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(1, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-100%", "refId": "D" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed while coprocessor requests wait to be handled in each TiKV instance", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "id": 116, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": 
true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{req}}", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "95% Wait duration by store", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "decimals": 1, "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 37 }, "id": 3195, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_dag_request_count{instance=~\"$instance\"}[1m])) by (vec_type)", "format": 
"time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{vec_type}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total DAG Requests", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of DAG executors", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 37 }, "id": 3264, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_executor_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total DAG Executors", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, 
"format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 44 }, "id": 552, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_scan_details{instance=~\"$instance\", req=\"select\"}[1m])) by (tag)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "scan_details", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total Ops Details (Table Scan)", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 44 }, "id": 
3263, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_scan_details{instance=~\"$instance\", req=\"index\"}[1m])) by (tag)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{tag}}", "metric": "scan_details", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total Ops Details (Index Scan)", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 53 }, "id": 122, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": null, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, 
"targets": [ { "expr": "sum(rate(tikv_coprocessor_scan_details{instance=~\"$instance\", req=\"select\"}[1m])) by (tag,cf)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{cf}}-{{tag}}", "metric": "scan_details", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total Ops Details by CF (Table Scan)", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 53 }, "id": 554, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "repeat": "cf", "repeatDirection": "h", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_scan_details{instance=~\"$instance\", req=\"index\"}[1m])) by (tag,cf)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{cf}}-{{tag}}", "metric": "scan_details", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total Ops Details by CF (Index Scan)", "tooltip": { 
"msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 1, "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Coprocessor Detail", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 20 }, "id": 2761, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 24 }, "id": 2108, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_threads_state{instance=~\"$instance\"}) by (instance, state)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}-{{state}}", "refId": "A", "step": 4 }, { "expr": "sum(tikv_threads_state{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-total", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Threads state", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { 
"format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 24 }, "id": 2258, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_threads_io_bytes_total{instance=~\"$instance\"}[30s])) by (name, io) > 1024", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{name}}-{{io}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Threads IO", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 31 }, "id": 2660, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], 
"nullPointMode": "null", "percentage": false, "pointradius": 2, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_voluntary_context_switches{instance=~\"$instance\"}[30s])) by (instance, name) > 200", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}} - {{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Thread Voluntary Context Switches", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 31 }, "id": 2661, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": true, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_nonvoluntary_context_switches{instance=~\"$instance\"}[30s])) by (instance, name) > 50", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{instance}} - {{name}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": 
"Thread Nonvoluntary Context Switches", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Threads", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, "id": 2762, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of get operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }, "id": 138, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_memtable_efficiency{instance=~\"$instance\", db=\"$db\", type=\"memtable_hit\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "memtable", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=~\"block_cache_data_hit|block_cache_filter_hit\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "block_cache", "metric": "", "refId": "E", "step": 10 }, { "expr": "sum(rate(tikv_engine_get_served{instance=~\"$instance\", db=\"$db\", type=\"get_hit_l0\"}[1m]))", 
"format": "time_series", "intervalFactor": 2, "legendFormat": "l0", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_get_served{instance=~\"$instance\", db=\"$db\", type=\"get_hit_l1\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "l1", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_get_served{instance=~\"$instance\", db=\"$db\", type=\"get_hit_l2_and_up\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "l2_and_up", "refId": "F", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Get operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing get operations", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }, "id": 82, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"get_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"get_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"get_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"get_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Get duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of seek operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 }, "id": 129, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_seek\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "seek", "metric": "", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_seek_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "seek_found", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_next\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "next", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_next_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "next_found", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_prev\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "prev", "metric": "", "refId": "E", "step": 10 }, { "expr": "sum(rate(tikv_engine_locate{instance=~\"$instance\", db=\"$db\", type=\"number_db_prev_found\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "prev_found", "metric": "", "refId": "F", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Seek operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, 
"description": "The time consumed when executing seek operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 }, "id": 125, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"seek_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"seek_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"seek_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"seek_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Seek duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, 
"alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of write operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 41 }, "id": 139, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_write_served{instance=~\"$instance\", db=\"$db\", type=~\"write_done_by_self|write_done_by_other\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "done", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_write_served{instance=~\"$instance\", db=\"$db\", type=\"write_timeout\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "timeout", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_write_served{instance=~\"$instance\", db=\"$db\", type=\"write_with_wal\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "with_wal", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { 
"aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing write operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 41 }, "id": 126, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, 
{ "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing write wal operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 41 }, "id": 130, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_write_wal_time_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_wal_time_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_wal_time_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_wal_time_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"write_wal_micros_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write WAL duration", "tooltip": { "shared": true, "sort": 0, 
"value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe count of WAL sync operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 49 }, "id": 137, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_wal_file_synced{instance=~\"$instance\", db=\"$db\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "sync", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "WAL sync operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", 
"decimals": 1, "description": "The time consumed when executing WAL sync operation", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 49 }, "id": 135, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"$db\",type=\"wal_file_sync_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "WAL sync duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": 
"short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of compaction and flush operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 57 }, "id": 128, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_event_total{instance=~\"$instance\", db=\"$db\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_engine_event_total", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when executing the compaction and flush operations", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 57 }, "id": 136, "legend": { "alignAsTable": 
true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_compaction_time{instance=~\"$instance\", db=\"$db\",type=\"compaction_time_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_compaction_time{instance=~\"$instance\", db=\"$db\",type=\"compaction_time_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_compaction_time{instance=~\"$instance\", db=\"$db\",type=\"compaction_time_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_compaction_time{instance=~\"$instance\", db=\"$db\",type=\"compaction_time_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": 
"${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed when reading SST files", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 65 }, "id": 140, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_sst_read_micros{instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_sst_read_micros{instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_sst_read_micros{instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_sst_read_micros{instance=~\"$instance\", db=\"$db\", type=\"sst_read_micros_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "SST read duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { 
"format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time which is caused by write stall", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 153 }, "id": 87, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_write_stall{instance=~\"$instance\", db=\"$db\", type=\"write_stall_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", db=\"$db\", type=\"write_stall_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", db=\"$db\", type=\"write_stall_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", db=\"$db\", type=\"write_stall_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write stall duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": 
"graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The memtable size of each column family", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 153 }, "id": 103, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_memory_bytes{instance=~\"$instance\", db=\"$db\", type=\"mem-tables\"}) by (cf)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cf}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memtable size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": 
"The hit rate of memtable", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 73 }, "id": 88, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "connected", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_memtable_efficiency{instance=~\"$instance\", db=\"$db\", type=\"memtable_hit\"}[1m])) / (sum(rate(tikv_engine_memtable_efficiency{db=\"$db\", type=\"memtable_hit\"}[1m])) + sum(rate(tikv_engine_memtable_efficiency{db=\"$db\", type=\"memtable_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "hit", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memtable hit", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The block cache size. 
Broken down by column family if shared block cache is disabled.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 81 }, "id": 102, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "topk(20, avg(tikv_engine_block_cache_size_bytes{instance=~\"$instance\", db=\"$db\"}) by(cf, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{cf}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The hit rate of block cache", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 81 }, "id": 80, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "connected", "percentage": false, 
"pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "all", "metric": "", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "data", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_miss\"}[1m])))", "format": "time_series", "intervalFactor": 2, "legendFormat": "filter", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_hit\"}[1m])) / (sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_hit\"}[1m])) + sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_miss\"}[1m])))", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "index", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_bloom_efficiency{instance=~\"$instance\", db=\"$db\", type=\"bloom_prefix_useful\"}[1m])) / sum(rate(tikv_engine_bloom_efficiency{instance=~\"$instance\", db=\"$db\", type=\"bloom_prefix_checked\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "bloom prefix", "metric": "", "refId": "E", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache hit", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow of different kinds of block cache operations", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 89 }, "height": "", "id": 467, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"block_cache_byte_read\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, 
"legendFormat": "total_read", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"block_cache_byte_write\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "total_written", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_bytes_insert\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "data_insert", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_bytes_insert\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "filter_insert", "metric": "", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_bytes_evict\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "filter_evict", "metric": "", "refId": "E", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_bytes_insert\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "index_insert", "metric": "", "refId": "F", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_bytes_evict\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "index_evict", "metric": "", "refId": "G", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, 
"show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different kinds of block cache operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 89 }, "id": 468, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_add\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "total_add", "metric": "", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_data_add\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "data_add", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_filter_add\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "filter_add", "metric": "", "refId": "D", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_index_add\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "index_add", 
"metric": "", "refId": "E", "step": 10 }, { "expr": "sum(rate(tikv_engine_cache_efficiency{instance=~\"$instance\", db=\"$db\", type=\"block_cache_add_failures\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "add_failures", "metric": "", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow of different kinds of operations on keys", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 97 }, "height": "", "id": 132, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"keys_read\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "read", "refId": "B", "step": 10 }, { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"keys_written\"}[1m]))", "format": "time_series", 
"hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "written", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_compaction_num_corrupt_keys{instance=~\"$instance\", db=\"$db\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "corrupt", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Keys flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of keys in each column family", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 97 }, "id": 131, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_engine_estimate_num_keys{instance=~\"$instance\", db=\"$db\"}) by (cf)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{cf}}", "metric": "tikv_engine_estimate_num_keys", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Total 
keys", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow rate of read operations per type", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 105 }, "height": "", "id": 85, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"bytes_read\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "get", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"iter_bytes_read\"}[1m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "scan", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": 
"Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The bytes per read", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 105 }, "id": 133, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_bytes_per_read{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_bytes_per_read{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_bytes_per_read{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_bytes_per_read{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_read_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Bytes / Read", "tooltip": { "shared": true, "sort": 0, 
"value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow of different kinds of write operations", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 113 }, "height": "", "id": 86, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"wal_file_bytes\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "wal", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"bytes_written\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "write", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", 
"show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The bytes per write", "fill": 0, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 113 }, "id": 134, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "max(tikv_engine_bytes_per_write{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_bytes_per_write{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_bytes_per_write{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_bytes_per_write{instance=~\"$instance\", db=\"$db\",type=\"bytes_per_write_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Bytes / Write", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { 
"buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The flow rate of compaction operations per type", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 121 }, "id": 90, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_compaction_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"bytes_read\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "read", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_compaction_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"bytes_written\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "written", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"flush_write_bytes\"}[1m]))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "flushed", "refId": "B", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, 
"type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The pending bytes to be compacted", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 121 }, "id": 127, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_pending_compaction_bytes{instance=~\"$instance\", db=\"$db\"}[1m])) by (cf)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{cf}}", "metric": "tikv_engine_pending_compaction_bytes", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction pending bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": 
false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The read amplification per TiKV instance", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 129 }, "id": 518, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_read_amp_flow_bytes{instance=~\"$instance\", db=\"$db\", type=\"read_amp_total_read_bytes\"}[1m])) by (instance) / sum(rate(tikv_engine_read_amp_flow_bytes{db=\"$db\", type=\"read_amp_estimate_useful_bytes\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read amplification", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The compression ratio of each level", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 129 }, "id": 863, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide":
true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_compression_ratio{instance=~\"$instance\", db=\"$db\"}) by (level)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "level - {{level}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compression ratio", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of snapshot of each TiKV instance", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 137 }, "id": 516, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"tikv_engine_num_snapshots{instance=~\"$instance\", db=\"$db\"}", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Number of snapshots", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time that the oldest unreleased snapshot survivals", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 137 }, "id": 517, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tikv_engine_oldest_snapshot_duration{instance=~\"$instance\", db=\"$db\"}", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_engine_oldest_snapshot_duration", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Oldest snapshots duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { 
"buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of SST files for different column families in each level", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 145 }, "id": 2002, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_num_files_at_level{instance=~\"$instance\", db=\"$db\"}) by (cf, level)", "format": "time_series", "intervalFactor": 2, "legendFormat": "cf-{{cf}}, level-{{level}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Number files at each level", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The time consumed when ingesting 
SST files", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 145 }, "id": 2003, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_snapshot_ingest_sst_duration_seconds_bucket{instance=~\"$instance\", db=\"$db\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A" }, { "expr": "sum(rate(tikv_snapshot_ingest_sst_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_snapshot_ingest_sst_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "average", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Ingest SST duration seconds", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Stall conditions changed of each column family", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 153 }, "id": 2381, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideZero": true, "max": true, "min": true, 
"rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tikv_engine_stall_conditions_changed{instance=~\"$instance\", db=\"$db\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{cf}}-{{type}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Stall conditions changed of each CF", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 161 }, "id": 2452, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(increase(tikv_engine_write_stall_reason{instance=~\"$instance\", db=\"$db\"}[1m])) by (type)", 
"format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Write Stall Reason", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 65 }, "id": 2451, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_compaction_reason{instance=~\"$instance\", db=\"$db\"}[1m])) by (cf, reason)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{cf}} - {{reason}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Compaction reason", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": 
"0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": "db", "title": "RocksDB - $db", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 22 }, "id": 3301, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 27 }, "id": 3414, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_blob_key_size{instance=~\"$instance\", db=\"$db\", type=\"blob_key_size_average\"})", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "avg", "refId": "A" }, { "expr": "avg(tikv_engine_blob_key_size{instance=~\"$instance\", db=\"$db\", type=\"blob_key_size_percentile95\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": "C" }, { "expr": "avg(tikv_engine_blob_key_size{instance=~\"$instance\", db=\"$db\", type=\"blob_key_size_percentile99\"})", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "99%", "refId": "D" }, { "expr": "max(tikv_engine_blob_key_size{instance=~\"$instance\", db=\"$db\", type=\"blob_key_size_max\"})", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "max", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob key size", "tooltip": { "shared": 
true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 27 }, "id": 3446, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_blob_value_size{instance=~\"$instance\", db=\"$db\", type=\"blob_value_size_average\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "avg", "refId": "A" }, { "expr": "avg(tikv_engine_blob_value_size{instance=~\"$instance\", db=\"$db\", type=\"blob_value_size_percentile95\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": "B" }, { "expr": "avg(tikv_engine_blob_value_size{instance=~\"$instance\", db=\"$db\", type=\"blob_value_size_percentile99\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "99%", "refId": "C" }, { "expr": "avg(tikv_engine_blob_value_size{instance=~\"$instance\", db=\"$db\", type=\"blob_value_size_max\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "max", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob value size", "tooltip": 
{ "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 27 }, "id": 3412, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_blob_seek_micros_seconds{instance=~\"$instance\", db=\"$db\", type=~\".*_average\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "A" }, { "expr": "avg(tikv_engine_blob_seek_micros_seconds{instance=~\"$instance\", db=\"$db\", type=~\".*_percentile95\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B" }, { "expr": "avg(tikv_engine_blob_seek_micros_seconds{instance=~\"$instance\", db=\"$db\", type=~\".*_percentile99\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "C" }, { "expr": "avg(tikv_engine_blob_seek_micros_seconds{instance=~\"$instance\", db=\"$db\", type=~\".*_max\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "D" } ], "thresholds": [], "timeFrom": null, 
"timeRegions": [], "timeShift": null, "title": "Blob seek duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 33 }, "id": 3338, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_blob_locate{instance=~\"$instance\", db=\"$db\", type=\"number_blob_seek\"}[2m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "seek", "refId": "A" }, { "expr": "sum(rate(tikv_engine_blob_locate{instance=~\"$instance\", db=\"$db\", type=\"number_blob_prev\"}[2m]))", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "prev", "refId": "B" }, { "expr": "sum(rate(tikv_engine_blob_locate{instance=~\"$instance\", db=\"$db\", type=\"number_blob_next\"}[2m]))", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "next", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob seek operations", "tooltip": { "shared": true, "sort": 0, "value_type": 
"individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 39 }, "id": 3655, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_blob_get_micros_seconds{instance=~\"$instance\", db=\"$db\", type=~\".*_average\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "A" }, { "expr": "avg(tikv_engine_blob_get_micros_seconds{instance=~\"$instance\", db=\"$db\", type=~\".*_percentile95\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B" }, { "expr": "avg(tikv_engine_blob_get_micros_seconds{instance=~\"$instance\", db=\"$db\", type=~\".*_percentile99\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "C" }, { "expr": "avg(tikv_engine_blob_get_micros_seconds{instance=~\"$instance\", db=\"$db\", type=~\".*_max\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob get duration", "tooltip": { 
"shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 39 }, "id": 3746, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_blob_locate{instance=~\"$instance\", db=\"$db\", type=\"number_blob_get\"}[2m]))", "format": "time_series", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "get", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob get operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, 
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 45 }, "id": 3643, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_blob_flow_bytes{instance=~\"$instance\", db=\"$db\", type=~\"bytes.*\"}[2m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob bytes flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 45 }, "id": 3645, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tikv_engine_blob_flow_bytes{instance=~\"$instance\", db=\"$db\", type=~\"keys.*\"}[30s])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob keys flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 51 }, "id": 3657, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_blob_file_read_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_read_micros_average\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "avg", "refId": "A" }, { "expr": "avg(tikv_engine_blob_file_read_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_read_micros_percentile99\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "99%", "refId": "B" }, { "expr": "avg(tikv_engine_blob_file_read_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_read_micros_percentile95\"})", "format": 
"time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": "C" }, { "expr": "avg(tikv_engine_blob_file_read_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_read_micros_max\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "max", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob file read duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 51 }, "id": 3408, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_blob_file_write_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_write_micros_average\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "avg", "refId": "A" }, { "expr": "avg(tikv_engine_blob_file_write_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_write_micros_percentile99\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "99%", "refId": "B" }, { "expr": 
"avg(tikv_engine_blob_file_write_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_write_micros_percentile95\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": "C" }, { "expr": "avg(tikv_engine_blob_file_write_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_write_micros_max\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "max", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob file write duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 57 }, "id": 3651, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_blob_file_synced{instance=~\"$instance\", db=\"$db\"}[2m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "sync", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob file sync operations", "tooltip": { "shared": true, "sort": 0, "value_type": 
"individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 57 }, "id": 3653, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_blob_file_sync_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_sync_micros_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "A" }, { "expr": "avg(tikv_engine_blob_file_sync_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_sync_micros_percentile95\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": "B" }, { "expr": "avg(tikv_engine_blob_file_sync_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_sync_micros_percentile99\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "99%", "refId": "C" }, { "expr": "avg(tikv_engine_blob_file_sync_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_file_sync_micros_max\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "max", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], 
"timeShift": null, "title": "Blob file sync duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 63 }, "id": 3555, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_titandb_num_live_blob_file{instance=~\"$instance\", db=\"$db\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "live blob file num", "refId": "A" }, { "expr": "avg(tikv_engine_titandb_num_obsolete_blob_file{instance=~\"$instance\", db=\"$db\"})", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "obsolete blob file num", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob file count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": 
null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 63 }, "id": 3557, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_titandb_live_blob_file_size{instance=~\"$instance\", db=\"$db\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "live blob file size", "refId": "A" }, { "expr": "avg(tikv_engine_titandb_obsolete_blob_file_size{instance=~\"$instance\", db=\"$db\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "obsolete blob file size", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob file size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 69 }, "id": 3344, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": 
false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_blob_gc_file_count{instance=~\"$instance\", db=\"$db\"}[2m])) by (type)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob GC file", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 69 }, "id": 3410, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_blob_gc_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_gc_micros_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "A" }, { "expr": 
"avg(tikv_engine_blob_gc_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_gc_micros_percentile95\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": "B" }, { "expr": "avg(tikv_engine_blob_gc_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_gc_micros_percentile99\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "99%", "refId": "C" }, { "expr": "avg(tikv_engine_blob_gc_micros_seconds{instance=~\"$instance\", db=\"$db\", type=\"blob_gc_micros_max\"})", "format": "time_series", "intervalFactor": 1, "legendFormat": "max", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob GC duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 75 }, "id": 3340, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_blob_gc_flow_bytes{instance=~\"$instance\", db=\"$db\", type=~\"bytes.*\"}[30s])) by (type)", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob GC bytes flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "decbytes", "label": "", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 75 }, "id": 3649, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_blob_gc_flow_bytes{instance=~\"$instance\", db=\"$db\", type=~\"keys.*\"}[30s])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Blob GC keys flow", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "decbytes", "label": "", "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { 
"aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 81 }, "id": 3523, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "paceLength": 10, "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_titandb_live_blob_size{instance=~\"$instance\", db=\"$db\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "live blob size", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Live blob size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": "db", "title": "Titan - $db", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 23 }, "id": 2820, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 29 }, "id": 2991, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": 
false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"waiter_manager.*\"}[1m])) by (instance, name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{name}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"deadlock_detect.*\"}[1m])) by (instance, name)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{name}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Thread CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 29 }, "id": 2877, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_lock_manager_task_counter{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Handled tasks", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 37 }, "id": 2993, "interval": "", "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_lock_manager_waiter_lifetime_duration_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_lock_manager_waiter_lifetime_duration_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "avg", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_lock_manager_waiter_lifetime_duration_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": 
"time_series", "intervalFactor": 1, "legendFormat": "99%", "refId": "B" }, { "expr": "histogram_quantile(0.9999, sum(rate(tikv_lock_manager_waiter_lifetime_duration_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99.99%", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Waiter lifetime duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 37 }, "id": 4018, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(max_over_time(tikv_lock_manager_wait_table_status{instance=~\"$instance\"}[15s])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Wait table", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", 
"xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 45 }, "id": 2995, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_lock_manager_detect_duration_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_lock_manager_detect_duration_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "A" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_lock_manager_detect_duration_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Deadlock detect duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": 
false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 45 }, "id": 2934, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_lock_manager_error_counter{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Detect error", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": true, "cacheTimeout": null, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 0, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 53 }, "id": 4019, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, 
"total": false, "values": true }, "lines": false, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pluginVersion": "6.1.6", "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(max_over_time(tikv_lock_manager_detector_leader_heartbeat{instance=~\"$instance\"}[15s])) by (instance)", "format": "time_series", "hide": false, "instant": false, "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Deadlock detector leader", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": 0, "format": "none", "label": "", "logBase": 1, "max": "2", "min": "0", "show": false }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Lock manager", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 24 }, "id": 2763, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 505 }, "id": 2696, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tikv_allocator_stats{instance=~\"$instance\"}", "format": "time_series", "hide": 
false, "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeShift": null, "title": "Allocator Stats", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] } ], "repeat": null, "title": "Memory", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 25 }, "id": 3922, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 1 }, "id": 3924, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"backup_worker.*\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "backup-worker", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"backup_endpoint\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "backup-endpoint", "metric": "tikv_thread_cpu_seconds_total", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": 
"Backup CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 1 }, "id": 3926, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_backup_range_size_bytes_bucket{instance=~\"$instance\"}[1m])) by (le, cf))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cf}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Range Size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": 
false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 8 }, "id": 3927, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_backup_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_backup_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_backup_request_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_backup_request_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Backup Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", 
"editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 8 }, "id": 3928, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_backup_range_size_bytes_sum{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "backup-flow", "metric": "", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_backup_range_size_bytes_sum[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Backup Flow", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 15 }, "id": 3929, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, 
"renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(irate(node_disk_read_bytes_total{device=~\"sda\"}[5m])) + sum(irate(node_disk_written_bytes_total{device=~\"sda\"}[5m]))", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "Total ", "refId": "A" }, { "expr": "sum(irate(node_disk_read_bytes_total{device=~\"sda\"}[5m]))", "format": "time_series", "hide": true, "intervalFactor": 1, "legendFormat": "Read", "refId": "B" }, { "expr": "sum(irate(node_disk_written_bytes_total{device=~\"sda\"}[5m]))", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Write - total", "refId": "C" }, { "expr": "sum(irate(node_disk_written_bytes_total{device=~\"sda\"}[5m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "Write - {{instance}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 15 }, "id": 3930, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": 
"flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_backup_range_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}} - 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_backup_range_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}} - 95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_backup_range_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_backup_range_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}} - avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Backup Range Duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 22 }, "id": 3931, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": 
[], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tikv_backup_error_counter[1m])", "format": "time_series", "hide": false, "intervalFactor": 1, "legendFormat": "{{error}} {{instance}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Backup Errors", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "title": "Backup", "type": "row" } ], "refresh": "1m", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "db", "multi": true, "name": "db", "options": [], "query": "label_values(tikv_engine_block_cache_size_bytes, db)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "command", "multi": true, "name": "command", "options": [], "query": "label_values(tikv_storage_command_total, type)", "refresh": 1, "regex": "prewrite|commit|rollback", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": ".*", "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "Instance", "multi": false, "name": "instance", "options": [], "query": "label_values(tikv_engine_size_bytes, instance)", "refresh": 1, "regex": "", 
"skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-TiKV-Details", "uid": "RDVQiEzZz", "version": 21 } ================================================ FILE: scripts/tikv_raw.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "5.4.3" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "5.0.0" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "5.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 0, "id": null, "iteration": 1560225374091, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 2, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 1 }, "id": 4, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": "raw_batch_get", "value": "raw_batch_get" } }, 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_command_duration_seconds_bucket{instance=~\"$instance\", type=~\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99%", "refId": "A", "target": "select metric", "type": "timeserie" }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_command_duration_seconds_bucket{instance=~\"$instance\", type=~\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": "B" }, { "expr": "sum(rate(tikv_scheduler_command_duration_seconds_sum{instance=~\"$instance\", type=~\"$command\"}[1m])) / sum(rate(tikv_scheduler_command_duration_seconds_count{instance=~\"$instance\", type=~\"$command\"}[1m])) ", "format": "time_series", "intervalFactor": 1, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Command Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ms", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 1 }, "id": 6, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "command": { "selected": false, "text": 
"raw_batch_get", "value": "raw_batch_get" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{instance=~\"$instance\", type=~\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "99%", "refId": "A", "target": "select metric", "type": "timeserie" }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_processing_read_duration_seconds_bucket{instance=~\"$instance\", type=~\"$command\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": "B" }, { "expr": "sum(rate(tikv_scheduler_processing_read_duration_seconds_sum{instance=~\"$instance\", type=~\"$command\"}[1m])) / sum(rate(tikv_scheduler_processing_read_duration_seconds_count{instance=~\"$instance\", type=~\"$command\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Read Processing Duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ms", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": "command", "title": "Read - $command", "type": "row" } ], "refresh": false, "schemaVersion": 16, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "label_values(tikv_storage_command_total, type)", "hide": 0, "includeAll": true, "label": "command", "multi": true, "name": "command", "options": [], "query": "label_values(tikv_storage_command_total, type)", 
"refresh": 1, "regex": "raw_get|raw_scan|raw_batch_get|raw_batch_scan", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "label_values(tikv_engine_size_bytes, instance)", "hide": 0, "includeAll": true, "label": "instance", "multi": true, "name": "instance", "options": [], "query": "label_values(tikv_engine_size_bytes, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 0, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-5m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "", "title": "Test-Cluster-TiKV-Raw", "uid": "K0D2tEZZz", "version": 1 } ================================================ FILE: scripts/tikv_summary.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", "name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1566459338986, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 2742, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The storage size per TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 0, "y": 1 }, "id": 56, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_engine_size_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The available capacity size of each TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 8, "y": 1 }, "id": 1706, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_store_size_bytes{instance=~\"$instance\", type=\"available\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Available size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The capacity size per TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 8, "x": 16, "y": 1 }, "id": 1707, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_store_size_bytes{instance=~\"$instance\", type=\"capacity\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Capacity size", "tooltip": { 
"msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe CPU usage of each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 1708, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe memory usage of each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 1709, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(process_resident_memory_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe I/O utilization per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, "id": 1710, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", 
"percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IO utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total bytes of read and write in each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, "id": 1711, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"kv\", type=\"wal_file_bytes\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}-write", "refId": "A", "step": 10 }, { "expr": "sum(rate(tikv_engine_flow_bytes{instance=~\"$instance\", db=\"kv\", 
type=~\"bytes_read|iter_bytes_read\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}-read", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "MBps", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "Bps", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The QPS of different kinds of commands in each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 25 }, "id": 1713, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (instance,type)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { 
"buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of the gRPC message failures", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 25 }, "id": 1712, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_fail_total{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}-grpc-msg-fail", "refId": "A", "step": 10 }, { "expr": "sum(delta(tikv_pd_heartbeat_message_total{instance=~\"$instance\", type=\"noop\"}[1m])) by (instance) < 1", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-pd-heartbeat", "refId": "B" }, { "expr": "sum(rate(tikv_critical_error_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Errps", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", 
"xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of leaders per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 33 }, "id": 1715, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 }, { "expr": "delta(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[30s]) < -10", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, 
"min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe number of Regions on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 33 }, "id": 1714, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"region\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Region", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": "", "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Cluster", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 2743, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "5m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": 
"Critical error alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 2 }, "id": 2741, "legend": { "avg": false, "current": false, "max": false, "min": false, "show": true, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_critical_error_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "A" } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Critical error", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "It contains some kinds of events such as write stall, channel full, scheduler busy, and coprocessor full, which will make the TiKV instance unavailable temporarily.", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 9 }, "id": 1584, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, 
"sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_too_busy_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "scheduler-{{instance}}", "metric": "", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "channelfull-{{instance}}-{{type}}", "metric": "", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_coprocessor_request_error{instance=~\"$instance\", type='full'}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "coprocessor-{{instance}}", "metric": "", "refId": "C", "step": 4 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile99\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "stall-{{instance}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Server is busy", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "10s", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], 
"executionErrorState": "alerting", "frequency": "10s", "handler": 1, "message": "TiKV server report failures", "name": "server report failures alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of reporting failure messages", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 9 }, "id": 18, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_server_report_failure_msg_total{instance=~\"$instance\"}[1m])) by (type,instance,store_id)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}} - to - {{store_id}}", "metric": "tikv_server_raft_store_msg_total", "refId": "A", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Server report failures", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of Raftstore errors per type on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 16 }, "id": 1718, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_storage_engine_async_request_total{instance=~\"$instance\", status!~\"success|all\"}[1m])) by (instance, status)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{status}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raftstore error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different scheduler errors on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 16 }, "id": 1719, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, 
"min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_stage_total{instance=~\"$instance\", stage=~\"snapshot_err|prepare_write_err\"}[1m])) by (instance, stage)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{stage}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different coprocessor errors on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 23 }, "id": 1720, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": 
false, "targets": [ { "expr": "sum(rate(tikv_coprocessor_request_error{instance=~\"$instance\"}[1m])) by (instance, reason)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{reason}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor error", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The number of different gRPC message errors on each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 23 }, "id": 1721, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_fail_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message error", "tooltip": { "msResolution": false, 
"shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of dropped leader in each TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 30 }, "id": 1722, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(delta(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[1m])) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader drop", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, 
"alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of missing leaders per TiKV instance", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 30 }, "id": 1723, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_leader_missing{instance=~\"$instance\"}) by (instance)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader missing", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Errors", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 2744, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": " \tThe total size of each column family", "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { 
"h": 8, "w": 12, "x": 0, "y": 3 }, "id": 33, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": "sum(tikv_engine_size_bytes{instance=~\"$instance\"}) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CF size", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The storage size per TiKV instance", "editable": true, "error": false, "fill": 5, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 3 }, "id": 1705, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 0, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": true, "steppedLine": false, "targets": [ { "expr": 
"sum(tikv_engine_size_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Store size", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "decbytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 0 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 }, "params": [ "A", "10s", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "10s", "handler": 1, "message": "TiKV channel full", "name": "TiKV channel full alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of channel full errors on each TiKV instance", "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 11 }, "id": 22, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, 
"points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Channel full", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1073741824 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "B", "1m", "now" ] }, "reducer": { "params": [], "type": "avg" }, "type": "query" } ], "executionErrorState": "alerting", "frequency": "60s", "handler": 1, "name": "approximate region size alert", "noDataState": "no_data", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The approximate Region size", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 11 }, "id": 1481, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": 
"flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_raftstore_region_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_region_size_count{instance=~\"$instance\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Approximate Region size", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Server", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 2745, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different kinds of gRPC message", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "id": 95, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sort": "max", "sortDesc": true, "total": false, "values": 
true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_grpc_msg_duration_seconds_bucket", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message count", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of different kinds of gRPC message which is failed", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "id": 107, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_fail_total{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{type}}", "metric": "tikv_grpc_msg_fail_total", "refId": "A", "step": 10 } ], "thresholds": [], 
"timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC message failed", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The gRPC message duration per message type (P99)", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, "id": 98, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_grpc_msg_duration_seconds_bucket{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (le, type))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% gRPC message duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }
], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 12 }, "id": 2532, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_grpc_msg_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Average gRPC message duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "gRPC", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 2746, "panels": [ { "alert": { "conditions": [ { "evaluator": { "params": [ 1.7 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "datasourceId": 1, "model": { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "intervalFactor": 2,
"legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 20 }, "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "60s", "handler": 1, "message": "TiKV raftstore thread CPU usage is high", "name": "TiKV raft store CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of raftstore thread", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 5 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.85 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft store CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, 
"max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1.8 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV async apply thread CPU usage is high", "name": "TiKV async apply CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of async apply", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 5 }, "id": 79, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"apply_[0-9]+\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Async apply CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, 
"max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV scheduler worker thread CPU usage is high", "name": "TiKV scheduler worker CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of scheduler worker", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 12 }, "id": 64, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"sched_.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { 
"buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV gRPC poll thread CPU usage is high", "name": "TiKV gRPC poll CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of gRPC", "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 12 }, "id": 105, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"grpc.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", 
"name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV Coprocessor thread CPU alert", "name": "TiKV Coprocessor CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of coprocessor", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 19 }, "id": 78, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_normal.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_high.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "refId": "B", "step": 4 }, { 
"expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_low.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV Storage ReadPool thread CPU usage is high", "name": "TiKV Storage ReadPool CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of readpool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 19 }, "id": 1908, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, 
"renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_norm.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_high.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "metric": "tikv_thread_cpu_seconds_total", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_low.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "metric": "tikv_thread_cpu_seconds_total", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage ReadPool CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of split check", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 26 }, "id": 68, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide":
true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"split_check\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Split check CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of RocksDB", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 26 }, "id": 69, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", 
name=~\"rocksdb.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "warning", "fill": true, "line": true, "op": "gt", "value": 1 }, { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 4 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 33 }, "id": 2531, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"gc_worker.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "GC worker CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, 
"mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of snapshot worker", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 33 }, "id": 67, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"snapshot_worker\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Snapshot worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Thread CPU", "type": "row" }, 
{ "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 2747, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The count of requests that TiKV sends to PD", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 6 }, "id": 1069, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 350, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_pd_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD requests", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The time consumed by requests that TiKV sends to PD", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 6 }, "id": 1070, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 350, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode":
"null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_pd_request_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (type) / sum(rate(tikv_pd_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD request duration (average)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of PD heartbeat messages", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 14 }, "id": 1215, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 350, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_pd_heartbeat_message_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD heartbeats", "tooltip": { "shared": 
true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "opm", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The total number of peers validated by the PD worker", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 14 }, "id": 1396, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 350, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_pd_validate_peer_total{instance=~\"$instance\"}[1m])) by (type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{ type }}", "metric": "", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "PD validate peers", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "PD", "type": "row" } ], "refresh": "1m", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, 
"datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "db", "multi": true, "name": "db", "options": [], "query": "label_values(tikv_engine_block_cache_size_bytes, db)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "command", "multi": true, "name": "command", "options": [], "query": "label_values(tikv_storage_command_total, type)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": ".*", "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "Instance", "multi": false, "name": "instance", "options": [], "query": "label_values(tikv_engine_size_bytes, instance)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-5m", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-TiKV-Summary", "uid": "X7VQmEzZk", "version": 5 } ================================================ FILE: scripts/tikv_trouble_shooting.json ================================================ { "__inputs": [ { "name": "DS_TEST-CLUSTER", "label": "test-cluster", "description": "", "type": "datasource", "pluginId": "prometheus", "pluginName": "Prometheus" } ], "__requires": [ { "type": "grafana", "id": "grafana", "name": "Grafana", "version": "6.1.6" }, { "type": "panel", "id": "graph", "name": "Graph", "version": "" }, { "type": "datasource", "id": "prometheus", 
"name": "Prometheus", "version": "1.0.0" } ], "annotations": { "list": [ { "builtIn": 1, "datasource": "${DS_TEST-CLUSTER}", "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "type": "dashboard" } ] }, "editable": true, "gnetId": null, "graphTooltip": 1, "id": null, "iteration": 1568258597359, "links": [], "panels": [ { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 2796, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "总 CPU 使用率", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 1 }, "id": 1708, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "客户端发来的常见请求的 QPS。如果同一类请求在多个机器上分布显著不平均,那么需要考虑热点的问题", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 1 }, "id": 1713, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{instance=~\"$instance\", type=~\"coprocessor|kv_get|kv_batch_get|kv_prewrite|kv_commit\"}[1m])) by (instance, type)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV Storage ReadPool thread CPU usage is high", "name": "TiKV Storage ReadPool CPU alert", "noDataState": "ok", 
"notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "点查会走 Storage ReadPool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 9 }, "id": 1908, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_norm.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_high.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "metric": "tikv_thread_cpu_seconds_total", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_low.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "metric": "tikv_thread_cpu_seconds_total", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage ReadPool CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { 
"format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV Coprocessor thread CPU usage is high", "name": "TiKV Coprocessor CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "非点查的 SQL 走 Coprocessor Pool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 9 }, "id": 78, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_normal.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_high.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", 
name=~\"cop_low.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV gRPC poll thread CPU usage is high", "name": "TiKV gRPC poll CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "gRPC CPU 使用率,如果打满,需要调 TiKV 的 grpc-concurrency 参数", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 17 }, "id": 105, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", 
name=~\"grpc.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "I/O util 反映磁盘的繁忙程度。接近 100% 意味着磁盘很忙,虽然未必达到了带宽瓶颈,但是延迟可能会受较大影响。", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 17 }, "id": 1710, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IO utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] 
}, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Hot Read", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 1 }, "id": 2797, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 2 }, "id": 2763, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, 
"error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 2 }, "id": 2764, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_grpc_msg_duration_seconds_count{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (instance,type)", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "QPS", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, "id": 2765, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": 
[], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IO utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV gRPC poll thread CPU usage is high", "name": "TiKV gRPC poll CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }, "id": 2783, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"grpc.*\"}[1m])) by (instance)", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1.7 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0", "frequency": "1m", "handler": 1, "message": "TiKV raft store thread CPU is high", "name": "TiKV raft store CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "Raftstore 线程 CPU 使用率,这个线程池用来写 Raft log。", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 18 }, "id": 61, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"raftstore_.*\"}[1m])) by (instance)", 
"format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.85 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft store CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 1.8 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV async apply thread CPU is high", "name": "TiKV async apply CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "Apply 线程 CPU,这个线程池用来将写入应用到 kv engine 中。", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 18 }, "id": 79, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"apply_[0-9]+\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Async apply CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV Scheduler Worker thread CPU is high", "name": "TiKV Scheduler Worker CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Scheduler worker 线程池的 CPU 使用率。事务会在 Scheduler 中排队,这个线程池接近打满意味着 MVCC 的旧版本太多,事务处理过慢。", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 26 }, "id": 63, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], 
"spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"sched_worker.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler Worker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Hot Write", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 2 }, "id": 2798, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 3 }, "id": 34, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [ { "alias": "total", "lines": false } ], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}) by (instance)", "format": "time_series", "hide": 
false, "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 }, { "expr": "delta(tikv_raftstore_region_count{instance=~\"$instance\", type=\"leader\"}[30s]) < -10", "format": "time_series", "hide": true, "intervalFactor": 2, "legendFormat": "", "refId": "B" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Leader", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Append log 时间。过长意味着 Raftstore 线程太忙,有可能心跳处理不过来导致上面的 Leader 被切走。", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 3 }, "id": 2786, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999999, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99.9999%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 1, "legendFormat": "95%", "refId": 
"B" }, { "expr": "sum(rate(tikv_raftstore_append_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_append_log_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Append log duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "Propose 如果在 Raftstore 中等得太久,则说明 Raftstore 线程很忙,或者被卡住了。", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 10 }, "id": 2787, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999999, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} ", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Propose 99.9999% wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": 
"cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 10 }, "id": 2789, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"write_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"write_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"write_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"write_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft RocksDB write duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, 
"type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": null, "description": "Raftstore 线程需要处理一些周期性任务。如果处理时间过长,则相当于 Raftstore 线程被卡住了。", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 24, "x": 0, "y": 17 }, "id": 2829, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999999, sum(rate(tikv_raftstore_event_duration_bucket{instance=~\"$instance\"}[1m])) by (le, instance, type))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{type}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 100, "yaxis": "left" } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft event 99.9999% handle duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "decimals": null, "format": "s", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": 
null } } ], "repeat": null, "title": "Leader Drop", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 3 }, "id": 2799, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "TiKV 的 Raftstore 会在处理不过来消息时发生 channel full,此时后续的消息会被丢弃掉,可能会导致 TiDB 重试或者掉 Leader。", "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 4 }, "id": 22, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Channel full", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "RocksDB write stall 说明写入请求被 compaction 或者其他流量控制措施卡住了。", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 4 }, "id": 87, "legend": { 
"alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", db=\"raft\", type=\"write_stall_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", db=\"raft\", type=\"write_stall_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", db=\"raft\", type=\"write_stall_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", db=\"raft\", type=\"write_stall_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft RocksDB write stall duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, 
"fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 12 }, "id": 39, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": " 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_raftstore_append_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_append_log_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Append log duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, 
"y": 12 }, "id": 40, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} ", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Append log duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ] }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 67 }, "id": 2819, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A" }, { "expr": 
"histogram_quantile(0.95, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B" }, { "expr": "sum(rate(tikv_raftstore_commit_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_commit_log_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit log duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 67 }, "id": 2821, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit log duration per server", "tooltip": { "shared": true, "sort": 0, 
"value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Channel Full", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 4 }, "id": 2800, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 5 }, "id": 2777, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_scheduler_too_busy_total{instance=~\"$instance\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "scheduler-{{instance}}", "metric": "", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "channelfull-{{instance}}-{{type}}", "metric": "", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_coprocessor_request_error{instance=~\"$instance\", type='full'}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "coprocessor-{{instance}}", "metric": "", "refId": "C", "step": 4 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", 
type=\"write_stall_percentile99\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "stall-{{instance}}", "refId": "D" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Server is busy", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 5 }, "height": "", "id": 193, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_scheduler_contex_total{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "", "refId": "A", "step": 40 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Scheduler pending commands", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true 
}, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 3, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 13 }, "id": 2779, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_channel_full_total{instance=~\"$instance\"}[1m])) by (instance, type)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{type}}", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Channel full", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 13 }, "id": 2785, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sortDesc": true, "total": false, "values": 
true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_max\"}) by (db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{db}} max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile99\"}) by (db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{db}} 99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile95\"}) by (db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{db}} 95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_average\"}) by (db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{db}} avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB write stall duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "The number of concurrently running tasks in the unified read pool.", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 21 }, "id": 2838, "legend": { "alignAsTable": true, "avg": false, 
"current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(tikv_unified_read_pool_running_tasks{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Unified read pool running tasks", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Server Is Busy", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 5 }, "id": 2801, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 6 }, "id": 2768, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, 
sum(rate(tikv_grpc_msg_duration_seconds_bucket{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (le, type))", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% gRPC message duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV gRPC poll thread CPU is high", "name": "TiKV gRPC poll CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 6 }, "id": 2782, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"grpc.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", 
"refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "gRPC poll CPU", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "0m", "frequency": "1m", "handler": 1, "message": "TiKV unified read pool thread CPU usage is high", "name": "Unified read pool CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "description": "The CPU utilization of the unified read pool", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 13 }, "id": 2840, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": null, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"unified_read_po*\"}[1m])) by (instance)", 
"format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 7.2 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Unified read pool CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 3.6 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV Storage ReadPool thread CPU is high", "name": "TiKV Storage ReadPool CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 13 }, "id": 2761, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", 
name=~\"store_read_norm.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_high.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "metric": "tikv_thread_cpu_seconds_total", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"store_read_low.*\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "metric": "tikv_thread_cpu_seconds_total", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage ReadPool CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "alert": { "conditions": [ { "evaluator": { "params": [ 7.2 ], "type": "gt" }, "operator": { "type": "and" }, "query": { "params": [ "A", "1m", "now" ] }, "reducer": { "params": [], "type": "max" }, "type": "query" } ], "executionErrorState": "alerting", "for": "", "frequency": "1m", "handler": 1, "message": "TiKV Coprocessor thread CPU is high", "name": "TiKV Coprocessor CPU alert", "noDataState": "ok", "notifications": [] }, "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, 
"fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 20 }, "id": 2762, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_normal.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - normal", "refId": "A", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_high.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - high", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"cop_low.*\"}[1m])) by (instance)", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{instance}} - low", "refId": "C", "step": 4 } ], "thresholds": [ { "colorMode": "critical", "fill": true, "line": true, "op": "gt", "value": 0.9 } ], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": 
"${DS_TEST-CLUSTER}", "decimals": 5, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 20 }, "id": 111, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99.99%", "refId": "D" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-95%", "refId": "B", "step": 4 }, { "expr": " sum(rate(tikv_coprocessor_request_wait_seconds_sum{instance=~\"$instance\"}[1m])) by (req) / sum(rate(tikv_coprocessor_request_wait_seconds_count{instance=~\"$instance\"}[1m])) by (req)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, 
"show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 5, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 27 }, "id": 16, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99.99%", "refId": "E" }, { "expr": "histogram_quantile(0.99, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-95%", "refId": "B", "step": 4 }, { "expr": " sum(rate(tikv_coprocessor_request_duration_seconds_sum{instance=~\"$instance\"}[1m])) by (req) / sum(rate(tikv_coprocessor_request_duration_seconds_count{instance=~\"$instance\"}[1m])) by (req)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor request duration", "tooltip": { 
"msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 5, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 27 }, "id": 116, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.95, sum(rate(tikv_coprocessor_request_wait_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance,req))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}-{{req}}", "refId": "B", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "95% Coprocessor wait duration by store", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, 
"datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 34 }, "id": 52, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.9999, avg(rate(tikv_coprocessor_scan_keys_bucket{instance=~\"$instance\"}[1m])) by (le, req)) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-99.99%", "refId": "D" }, { "expr": "histogram_quantile(0.99, avg(rate(tikv_coprocessor_scan_keys_bucket{instance=~\"$instance\"}[1m])) by (le, req)) ", "format": "time_series", "interval": "", "intervalFactor": 2, "legendFormat": "{{req}}-99%", "metric": "tikv_coprocessor_scan_keys_bucket", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, avg(rate(tikv_coprocessor_scan_keys_bucket{instance=~\"$instance\"}[1m])) by (le, req)) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-95%", "metric": "tikv_coprocessor_scan_keys_bucket", "refId": "B", "step": 10 }, { "expr": "histogram_quantile(0.90, avg(rate(tikv_coprocessor_scan_keys_bucket{instance=~\"$instance\"}[1m])) by (le, req)) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{req}}-90%", "metric": "tikv_coprocessor_scan_keys_bucket", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Coprocessor scan keys", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", 
"label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 34 }, "id": 82, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"get_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"get_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"get_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_get_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"get_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB get duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": 
null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 5, "description": "读请求执行前,都需要拿一个 snapshot", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 41 }, "id": 2828, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sort": null, "sortDesc": null, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.999999, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"snapshot\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99.9999%", "refId": "B", "step": 4 }, { "expr": "sum(rate(tikv_storage_engine_async_request_duration_seconds_sum{instance=~\"$instance\", type=\"snapshot\"}[1m])) / sum(rate(tikv_storage_engine_async_request_duration_seconds_count{instance=~\"$instance\", type=\"snapshot\"}[1m]))", "format": "time_series", "intervalFactor": 1, "legendFormat": "average", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Get snapshot duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { 
"align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 41 }, "id": 125, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"seek_max\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"seek_percentile99\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"seek_percentile95\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_seek_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"seek_average\"})", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB seek duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } 
}, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "Disk seconds Read Latency.\n- Critical:\n - Recommended performance value is < 10ms as avg value of the Avg Disk sec/Read,Write.\n - Critical value of the Avg Disk sec/Read,Write is > 50ms, should not exceed this value.", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 48 }, "id": 2821, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(rate(node_disk_read_time_seconds_total[5m])/ rate(node_disk_reads_completed_total[5m])) > 0", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}: [{{ device }}]", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Read Latency (ms)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 52 }, "id": 2822, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], 
"nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "irate(node_disk_read_bytes_total[5m]) + irate(node_disk_written_bytes_total[5m]) > 0", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}: [{{ device }}]", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Read Too Slow", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 6 }, "id": 2802, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 7 }, "id": 2781, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": false, "hideZero": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_grpc_msg_duration_seconds_bucket{instance=~\"$instance\", type!=\"kv_gc\"}[1m])) by (le, type, instance))", "format": "time_series", "interval": "", "intervalFactor": 2, 
"legendFormat": "{{instance}} {{type}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "99% gRPC messge duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 7 }, "id": 109, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"write\"}[1m])) by (le, instance))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_storage_engine_async_request_duration_seconds_bucket{instance=~\"$instance\", type=\"write\"}[1m])) by (le, instance))", "format": "time_series", "hide": false, "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "refId": "B", "step": 4 }, { "expr": "rate(tikv_storage_engine_async_request_duration_seconds_sum{instance=~\"$instance\", type=\"write\"}[1m]) / 
rate(tikv_storage_engine_async_request_duration_seconds_count{instance=~\"$instance\", type=\"write\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Storage async write duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 14 }, "id": 2753, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket{instance=~\"$instance\", type=\"prewrite\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "metric": "", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket{instance=~\"$instance\", type=\"prewrite\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": 
"{{instance}} 95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "rate(tikv_scheduler_latch_wait_duration_seconds_sum{instance=~\"$instance\", type=\"prewrite\"}[1m]) / rate(tikv_scheduler_latch_wait_duration_seconds_count{instance=~\"$instance\", type=\"prewrite\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Prewrite latch wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 14 }, "id": 2774, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "hideZero": false, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 2, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket{instance=~\"$instance\", type=\"commit\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "metric": "", "refId": "A", "step": 10 }, { "expr": "histogram_quantile(0.95, 
sum(rate(tikv_scheduler_latch_wait_duration_seconds_bucket{instance=~\"$instance\", type=\"commit\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "metric": "", "refId": "B", "step": 10 }, { "expr": "rate(tikv_scheduler_latch_wait_duration_seconds_sum{instance=~\"$instance\", type=\"commit\"}[1m]) / rate(tikv_scheduler_latch_wait_duration_seconds_count{instance=~\"$instance\", type=\"commit\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "metric": "", "refId": "C", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit latch wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 21 }, "id": 2788, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", 
"metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "refId": "B", "step": 4 }, { "expr": "rate(tikv_raftstore_append_log_duration_seconds_sum{instance=~\"$instance\"}[1m]) / rate(tikv_raftstore_append_log_duration_seconds_count{instance=~\"$instance\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Append log duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 21 }, "id": 2790, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_append_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "refId": 
"A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Append log duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 28 }, "id": 2830, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "refId": "A" }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "refId": "B" }, { "expr": "sum(rate(tikv_raftstore_commit_log_duration_seconds_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_commit_log_duration_seconds_count{instance=~\"$instance\"}[1m]))", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "refId": "C" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit log duration", "tooltip": { 
"shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 28 }, "id": 2831, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_commit_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Commit log duration per server", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 35 }, "id": 31, "legend": 
{ "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "refId": "B", "step": 4 }, { "expr": "rate(tikv_raftstore_apply_log_duration_seconds_sum{instance=~\"$instance\"}[1m]) / rate(tikv_raftstore_apply_log_duration_seconds_count{instance=~\"$instance\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply log duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 35 }, "id": 32, "legend": { 
"alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_log_duration_seconds_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply log duration per server", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 42 }, "id": 2794, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": 
"time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_request_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "refId": "B", "step": 4 }, { "expr": "rate(tikv_raftstore_request_wait_time_duration_secs_sum{instance=~\"$instance\"}[1m]) / rate(tikv_raftstore_request_wait_time_duration_secs_count{instance=~\"$instance\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Propose wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 1, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 42 }, "id": 2795, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, 
instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "metric": "", "refId": "A", "step": 4 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_apply_wait_time_duration_secs_bucket{instance=~\"$instance\"}[1m])) by (le, instance))", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "refId": "B", "step": 4 }, { "expr": "rate(tikv_raftstore_apply_wait_time_duration_secs_sum{instance=~\"$instance\"}[1m]) / rate(tikv_raftstore_apply_wait_time_duration_secs_count{instance=~\"$instance\"}[1m]) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "refId": "C", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Apply wait duration", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "cumulative" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 49 }, "id": 126, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"write_max\"}) by (instance)", "format": "time_series", 
"intervalFactor": 2, "legendFormat": "{{instance}} max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"write_percentile99\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"write_percentile95\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"kv\",type=\"write_average\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "KV RocksDB write duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 49 }, "id": 2776, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": 
"avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"write_max\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} max", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"write_percentile99\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"write_percentile95\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"write_average\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft RocksDB write duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 2, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 56 }, "id": 137, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", 
"seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(tikv_engine_wal_file_synced{instance=~\"$instance\", db=\"raft\"}[1m])", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} sync", "metric": "", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft RocksDB WAL sync operations", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "ops", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 0, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 56 }, "id": 135, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "maxPerRow": 2, "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"wal_file_sync_max\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} max", "refId": "A", "step": 10 }, { "expr": "tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"wal_file_sync_percentile99\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "refId": "B", "step": 10 }, { "expr": 
"tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"wal_file_sync_percentile95\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "refId": "C", "step": 10 }, { "expr": "tikv_engine_wal_file_sync_micro_seconds{instance=~\"$instance\", db=\"raft\",type=\"wal_file_sync_average\"}", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Raft RocksDB WAL sync duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 63 }, "id": 2793, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_max\"}) by (db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{db}} max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile99\"}) by (db)", "format": "time_series", "intervalFactor": 2, 
"legendFormat": "{{db}} 99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile95\"}) by (db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{db}} 95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_average\"}) by (db)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{db}} avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB write stall duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 70 }, "id": 2791, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "max", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_io_time_seconds_total[1m]) > 0", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} - {{device}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": 
null, "title": "IO utilization", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 74 }, "id": 2818, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "rate(node_disk_reads_completed_total[5m]) + rate(node_disk_writes_completed_total[5m])", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}} IOPs", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "IOPs", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "description": "DISK seconds Read/ Write Latency.\n- Critical:\n - Recommended performance value is < 10ms as 
avg value of the Avg Disk sec/Read,Write.\n - Critical value of the Avg Disk sec/Read,Write is > 50ms, should not exceed this value.", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 78 }, "id": 2820, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "(rate(node_disk_write_time_seconds_total[5m])/ rate(node_disk_writes_completed_total[5m])) > 0", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}: [{{ device }}]", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Write Latency (ms)", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "s", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 2, "description": "", "fill": 1, "gridPos": { "h": 4, "w": 24, "x": 0, "y": 82 }, "id": 2819, "legend": { "alignAsTable": true, "avg": true, "current": true, "max": true, "min": true, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 2, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr":
"irate(node_disk_read_bytes_total[5m]) + irate(node_disk_written_bytes_total[5m]) > 0", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}}: [{{ device }}]", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Disk Throughput", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "none", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Write Too Slow", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 2806, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 8 }, "id": 2810, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_num_files_at_level{instance=~\"$instance\", db=\"$db\", level=\"0\"}) by (instance, cf)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}} {{cf}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Level0 SST file number", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets":
null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 8 }, "id": 2811, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_num_immutable_mem_table{instance=~\"$instance\", db=\"$db\"}) by (instance, cf)", "format": "time_series", "intervalFactor": 1, "legendFormat": "{{instance}} {{cf}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Immutable mem-table number", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": "0", "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 15 }, "id": 2808, "legend": { "alignAsTable": true, "avg": false, "current": 
false, "max": false, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": false }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_pending_compaction_bytes{instance=~\"$instance\", db=\"$db\"}) by (instance, cf)", "format": "time_series", "instant": false, "intervalFactor": 1, "legendFormat": "{{instance}} {{cf}}", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Pending compaction bytes", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 15 }, "id": 2812, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "scopedVars": { "db": { "selected": false, "text": "kv", "value": "kv" } }, "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_max\", 
db=\"$db\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} max", "metric": "", "refId": "A", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile99\", db=\"$db\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_percentile95\", db=\"$db\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} 95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "avg(tikv_engine_write_stall{instance=~\"$instance\", type=\"write_stall_average\", db=\"$db\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}} avg", "metric": "", "refId": "D", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "RocksDB write stall duration", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "µs", "label": null, "logBase": 10, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": "db", "title": "Write Stall - $db", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 8 }, "id": 2803, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "fill": 1, "gridPos": { "h": 8, "w": 12, "x": 0, "y": 10 }, "id": 102, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 300, "total": false, "values": true }, "lines": true, "linewidth": 
1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(tikv_engine_block_cache_size_bytes{instance=~\"$instance\", db=\"kv\"}) by(cf)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{cf}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Block cache size", "tooltip": { "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 8, "w": 12, "x": 12, "y": 10 }, "id": 2770, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "sort": "current", "sortDesc": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "avg(process_resident_memory_bytes{instance=~\"$instance\"}) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "refId": "A", "step": 10 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Memory", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": 
"individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "OOM", "type": "row" }, { "collapsed": true, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 9 }, "id": 2804, "panels": [ { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "decimals": 1, "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 11 }, "id": 1481, "legend": { "alignAsTable": true, "avg": false, "current": true, "hideEmpty": true, "max": true, "min": false, "rightSide": true, "show": true, "sideWidth": 250, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null as zero", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "histogram_quantile(0.99, sum(rate(tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99%", "metric": "", "refId": "B", "step": 10 }, { "expr": "histogram_quantile(0.95, sum(rate(tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "95%", "metric": "", "refId": "C", "step": 10 }, { "expr": "sum(rate(tikv_raftstore_region_size_sum{instance=~\"$instance\"}[1m])) / sum(rate(tikv_raftstore_region_size_count{instance=~\"$instance\"}[1m])) ", "format": "time_series", "intervalFactor": 2, "legendFormat": "avg", "metric": "", "refId": "D", "step": 10 }, { "expr": "histogram_quantile(0.999999, 
sum(rate(tikv_raftstore_region_size_bucket{instance=~\"$instance\"}[1m])) by (le))", "format": "time_series", "intervalFactor": 2, "legendFormat": "99.9999%", "refId": "A" } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Approximate Region size", "tooltip": { "msResolution": false, "shared": true, "sort": 2, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "bytes", "label": null, "logBase": 1, "max": null, "min": "0", "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": false } ], "yaxis": { "align": false, "alignLevel": null } }, { "aliasColors": {}, "bars": false, "dashLength": 10, "dashes": false, "datasource": "${DS_TEST-CLUSTER}", "editable": true, "error": false, "fill": 0, "grid": {}, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 11 }, "id": 2792, "legend": { "alignAsTable": true, "avg": false, "current": true, "max": true, "min": false, "rightSide": true, "show": true, "total": false, "values": true }, "lines": true, "linewidth": 1, "links": [], "nullPointMode": "null", "percentage": false, "pointradius": 5, "points": false, "renderer": "flot", "seriesOverrides": [], "spaceLength": 10, "stack": false, "steppedLine": false, "targets": [ { "expr": "sum(rate(tikv_thread_cpu_seconds_total{instance=~\"$instance\", name=~\"split_check\"}[1m])) by (instance)", "format": "time_series", "intervalFactor": 2, "legendFormat": "{{instance}}", "metric": "tikv_thread_cpu_seconds_total", "refId": "A", "step": 4 } ], "thresholds": [], "timeFrom": null, "timeRegions": [], "timeShift": null, "title": "Split checker CPU", "tooltip": { "msResolution": false, "shared": true, "sort": 0, "value_type": "individual" }, "type": "graph", "xaxis": { "buckets": null, "mode": "time", "name": null, "show": true, "values": [] }, "yaxes": [ { "format": "percentunit", "label": null, "logBase": 1, 
"max": null, "min": null, "show": true }, { "format": "short", "label": null, "logBase": 1, "max": null, "min": null, "show": true } ], "yaxis": { "align": false, "alignLevel": null } } ], "repeat": null, "title": "Huge Region", "type": "row" } ], "refresh": "1m", "schemaVersion": 18, "style": "dark", "tags": [], "templating": { "list": [ { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "db", "multi": true, "name": "db", "options": [], "query": "label_values(tikv_engine_block_cache_size_bytes, db)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": null, "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "command", "multi": true, "name": "command", "options": [], "query": "label_values(tikv_storage_command_total, type)", "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false }, { "allValue": ".*", "current": {}, "datasource": "${DS_TEST-CLUSTER}", "definition": "", "hide": 0, "includeAll": true, "label": "Instance", "multi": true, "name": "instance", "options": [], "query": "label_values(tikv_engine_size_bytes, instance)", "refresh": 2, "regex": "", "skipUrlSync": false, "sort": 1, "tagValuesQuery": "", "tags": [], "tagsQuery": "", "type": "query", "useTags": false } ] }, "time": { "from": "now-3h", "to": "now" }, "timepicker": { "refresh_intervals": [ "5s", "10s", "30s", "1m", "5m", "15m", "30m", "1h", "2h", "1d" ], "time_options": [ "5m", "15m", "1h", "6h", "12h", "24h", "2d", "7d", "30d" ] }, "timezone": "browser", "title": "Test-Cluster-TiKV-Trouble-Shooting", "uid": "Lg4wiEkZz", "version": 4 } ================================================ FILE: start.yml ================================================ --- # Copyright 2016 
PingCAP, Inc. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # See the License for the specific language governing permissions and # limitations under the License. # The playbook of TiDB

# Each service play below follows the same shape: start the service through
# whichever supervisor `process_supervision` selects ('supervise' runs the
# start script under {{ deploy_dir }}/scripts, 'systemd' starts the unit),
# then block until the service port accepts connections and, where a check
# is defined, until its HTTP endpoint answers with status 200.

# Static configuration checks, run on the control machine for every tag.
- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

# Dynamic checks against every inventory host, run for every tag.
- name: gather all facts, and check dest
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

# node_exporter on every monitored host.
- hosts: monitored_servers
  tags:
    - node_exporter
  tasks:
    - name: start node_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_node_exporter.sh
      when: process_supervision == 'supervise'

    - name: start node_exporter by systemd
      become: true
      systemd:
        name: "node_exporter-{{ node_exporter_port }}.service"
        state: started
        # Start the unit now, but do not enable it for boot-time start.
        enabled: false
      when: process_supervision == 'systemd'

    - name: wait until the node_exporter port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ node_exporter_port }}"
        state: started
        msg: "the node_exporter port {{ node_exporter_port }} is not up"

    # Polls up to 12 times, 5 seconds apart.
    - name: wait until the node_exporter metrics page is available
      uri:
        url: "http://{{ ansible_host }}:{{ node_exporter_port }}/metrics"
      register: node_exporter_http_result
      until: node_exporter_http_result.status == 200
      retries: 12
      delay: 5

# blackbox_exporter on every monitored host.
- hosts: monitored_servers
  tags:
    - blackbox_exporter
  tasks:
    - name: start blackbox_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_blackbox_exporter.sh
      when: process_supervision == 'supervise'

    - name: start blackbox_exporter by systemd
      become: true
      systemd:
        name: "blackbox_exporter-{{ blackbox_exporter_port }}.service"
        state: started
        enabled: false
      when: process_supervision == 'systemd'

    - name: wait until the blackbox_exporter port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ blackbox_exporter_port }}"
        state: started
        msg: "the blackbox_exporter port {{ blackbox_exporter_port }} is not up"

    - name: wait until the blackbox_exporter metrics page is available
      uri:
        url: "http://{{ ansible_host }}:{{ blackbox_exporter_port }}/metrics"
      register: blackbox_exporter_http_result
      until: blackbox_exporter_http_result.status == 200
      retries: 12
      delay: 5

# alertmanager; only a port check follows the start.
- hosts: alertmanager_servers
  tags:
    - alertmanager
  tasks:
    - name: start alertmanager by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_alertmanager.sh
      when: process_supervision == 'supervise'

    # NOTE(review): every other systemd task in this part of the playbook sets
    # `enabled`; this one does not. Behaviour is kept as-is -- confirm whether
    # the omission is intentional.
    - name: start alertmanager by systemd
      systemd:
        name: "alertmanager-{{ alertmanager_port }}.service"
        state: started
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the alertmanager port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ alertmanager_port }}"
        state: started
        msg: "the alertmanager port {{ alertmanager_port }} is not up"

# pushgateway on the monitoring hosts.
- hosts: monitoring_servers
  tags:
    - pushgateway
  tasks:
    - name: start pushgateway by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - pushgateway

    - name: start pushgateway by systemd
      systemd:
        name: "{{ item }}"
        state: started
        enabled: false
      when: process_supervision == 'systemd'
      become: true
      with_items:
        - "pushgateway-{{ pushgateway_port }}.service"

    - name: wait until the pushgateway port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pushgateway_port }}"
        state: started
        msg: "the pushgateway port {{ pushgateway_port }} is not up"

    - name: wait until the pushgateway metrics page is available
      uri:
        url: "http://{{ ansible_host }}:{{ pushgateway_port }}/metrics"
      register: pushgateway_http_result
      until: pushgateway_http_result.status == 200
      retries: 12
      delay: 5

# prometheus on the monitoring hosts.
- hosts: monitoring_servers
  tags:
    - prometheus
  tasks:
    - name: start prometheus by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - prometheus

    - name: start prometheus by systemd
      systemd:
        name: "{{ item }}"
        state: started
        enabled: false
      when: process_supervision == 'systemd'
      become: true
      with_items:
        - "prometheus-{{ prometheus_port }}.service"

    - name: wait until the prometheus port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ prometheus_port }}"
        state: started
        msg: "the prometheus port {{ prometheus_port }} is not up"

    - name: wait until the prometheus metrics page is available
      uri:
        url: "http://{{ ansible_host }}:{{ prometheus_port }}/metrics"
      register: prometheus_http_result
      until: prometheus_http_result.status == 200
      retries: 12
      delay: 5

# kafka_exporter; every task is skipped unless enable_binlog is set
# (it defaults to false when undefined).
- hosts: kafka_exporter_servers
  tags:
    - kafka_exporter
  tasks:
    - name: start kafka_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./start_kafka_exporter.sh
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: start kafka_exporter by systemd
      become: true
      systemd:
        name: "kafka_exporter-{{ kafka_exporter_port }}.service"
        state: started
        enabled: false
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the kafka_exporter port is up
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ kafka_exporter_port }}"
        state: started
        msg: "the kafka_exporter port {{ kafka_exporter_port }} is not up"
      when: enable_binlog|default(false)

- hosts: pd_servers tags: - pd tasks: - name: start PD by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: process_supervision == 'supervise' with_items: - pd - name: start PD by systemd systemd: name=pd-{{ pd_client_port }}.service state=started enabled=no become: true when: process_supervision == 'systemd' - name: wait until the PD port is up wait_for: host: "{{ ansible_host }}" port: "{{ pd_client_port }}" state: started msg: "the PD port {{ pd_client_port }} is not up" - name: wait until the PD health page is available uri: url: "http://{{ ansible_host }}:{{ pd_client_port }}/health" return_content: yes register:
pd_http_result until: pd_http_result.status == 200 and 'true' in pd_http_result.content retries: 12 delay: 5 when: not enable_tls|default(false) - name: wait until the PD health page is available when enable_tls uri: url: "https://{{ ansible_host }}:{{ pd_client_port }}/health" validate_certs: no client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem" client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem" return_content: yes register: pd_https_result until: pd_https_result.status == 200 and 'true' in pd_https_result.content retries: 12 delay: 5 when: enable_tls|default(false) - hosts: tikv_servers tags: - tikv tasks: - name: Check if tikv_port already in use wait_for: host: "{{ ansible_host }}" port: "{{ tikv_port }}" state: stopped timeout: 3 msg: "{{ tikv_port }} already in use" - name: Check if tikv_status_port already in use wait_for: host: "{{ ansible_host }}" port: "{{ tikv_status_port }}" state: stopped timeout: 3 msg: "{{ tikv_status_port }} already in use" - name: start TiKV by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: process_supervision == 'supervise' with_items: - tikv - name: start TiKV by systemd systemd: name=tikv-{{ tikv_port }}.service state=started enabled=no become: true when: process_supervision == 'systemd' - name: wait until the TiKV port is up wait_for: host: "{{ ansible_host }}" port: "{{ tikv_port }}" state: started msg: "the TiKV port {{ tikv_port }} is not up" - name: wait until the TiKV status page is available uri: url: "http://{{ ansible_host }}:{{ tikv_status_port }}/status" return_content: yes register: tikv_http_result until: tikv_http_result.status == 200 retries: 12 delay: 5 when: not enable_tls|default(false) - name: wait until the TiKV status page is available when enable_tls uri: url: "https://{{ ansible_host }}:{{ tikv_status_port }}/status" validate_certs: no client_cert: "{{ tikv_cert_dir }}/tikv-server-{{ ansible_host }}.pem" client_key: "{{ tikv_cert_dir 
}}/tikv-server-{{ ansible_host }}-key.pem" return_content: yes register: tikv_https_result until: tikv_https_result.status == 200 retries: 10 delay: 5 when: enable_tls|default(false) - command: cat {{ deploy_dir }}/status/tikv.pid register: new_tikv_pid ignore_errors: yes changed_when: false - name: display new tikv pid debug: msg: "tikv binary or docker pid: {{ new_tikv_pid.stdout }}" - hosts: pd_servers[0] tasks: - name: wait for region replication complete uri: url: "http://{{ ansible_host }}:{{ pd_client_port }}/pd/api/v1/cluster/status" return content: yes body_format: json register: cluster_status_http until: cluster_status_http.json is defined and cluster_status_http.json.is_initialized is defined and cluster_status_http.json.is_initialized == true retries: 20 delay: 10 when: - not enable_tls|default(false) - wait_replication|default(false) - hosts: pd_servers[0] tasks: - name: wait for region replication complete with tls enabled uri: url: "https://{{ ansible_host }}:{{ pd_client_port }}/pd/api/v1/cluster/status" validate_certs: no client_cert: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}.pem" client_key: "{{ pd_cert_dir }}/pd-server-{{ ansible_host }}-key.pem" return content: yes body_format: json register: cluster_status_https until: cluster_status_https.json is defined and cluster_status_https.json.is_initialized is defined and cluster_status_https.json.is_initialized == true retries: 20 delay: 10 when: - enable_tls|default(false) - wait_replication|default(false) - hosts: pump_servers tags: - pump tasks: - name: start pump by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: - enable_binlog|default(false) - process_supervision == 'supervise' with_items: - pump - name: start pump by systemd systemd: name=pump-{{ pump_port }}.service state=started enabled=no become: true when: - enable_binlog|default(false) - process_supervision == 'systemd' - name: wait until the pump port is up wait_for: host: "{{ ansible_host }}" port: "{{ 
pump_port }}" state: started msg: "the pump port {{ pump_port }} is not up" when: enable_binlog|default(false) - hosts: tidb_servers tags: - tidb tasks: - name: Check if tidb_port already in use wait_for: host: "{{ ansible_host }}" port: "{{ tidb_port }}" state: stopped timeout: 3 msg: "{{ tidb_port }} already in use" - name: Check if tidb_status_port already in use wait_for: host: "{{ ansible_host }}" port: "{{ tidb_status_port }}" state: stopped timeout: 3 msg: "{{ tidb_status_port }} already in use" - name: start TiDB by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: process_supervision == 'supervise' with_items: - tidb - name: start TiDB by systemd systemd: name=tidb-{{ tidb_port }}.service state=started enabled=no become: true when: process_supervision == 'systemd' - name: wait until the TiDB port is up wait_for: host: "{{ ansible_host }}" port: "{{ tidb_port }}" state: started msg: "the TiDB port {{ tidb_port }} is not up" - name: wait until the TiDB status page is available uri: url: "http://{{ ansible_host }}:{{ tidb_status_port }}/status" return_content: yes register: tidb_http_result until: tidb_http_result.status == 200 and 'TiDB' in tidb_http_result.content retries: 12 delay: 5 when: not enable_tls|default(false) - name: wait until the TiDB status page is available when enable_tls uri: url: "https://{{ ansible_host }}:{{ tidb_status_port }}/status" validate_certs: no client_cert: "{{ tidb_cert_dir }}/tidb-server-{{ ansible_host }}.pem" client_key: "{{ tidb_cert_dir }}/tidb-server-{{ ansible_host }}-key.pem" return_content: yes register: tidb_https_result until: tidb_https_result.status == 200 and 'TiDB' in tidb_https_result.content retries: 10 delay: 5 when: enable_tls|default(false) - hosts: tiflash_servers tags: - tiflash tasks: - name: start TiFlash by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: process_supervision == 'supervise' with_items: - tiflash - name: start TiFlash by systemd 
systemd: name=tiflash-{{ tcp_port }}.service state=started enabled=no become: true when: process_supervision == 'systemd' - name: wait until the TiFlash port is up wait_for: host: "{{ ansible_host }}" port: "{{ http_port }}" state: started msg: "the TiFlash port {{ http_port }} is not up" - name: wait until the TiFlash status page is available uri: url: "http://{{ ansible_host }}:{{ http_port }}/?query=select%20version()" return_content: yes register: tiflash_http_result until: tiflash_http_result.status == 200 retries: 12 delay: 5 when: not enable_tls|default(false) - hosts: grafana_servers tags: - grafana roles: - { role: grafana, grafana_exec_vars_only: true } tasks: - name: start grafana by supervise shell: cd {{ deploy_dir }}/scripts && ./start_{{ item }}.sh when: process_supervision == 'supervise' with_items: - grafana - name: start grafana by systemd systemd: name=grafana-{{ grafana_port }}.service state=started enabled=no when: process_supervision == 'systemd' become: true - name: wait until the grafana port is up wait_for: host: "{{ ansible_host }}" port: "{{ grafana_port }}" state: started msg: "the grafana port {{ grafana_port }} is not up" - name: wait until the grafana login page is available uri: url: "http://{{ ansible_host }}:{{ grafana_port }}/login" register: grafana_http_result until: grafana_http_result.status == 200 retries: 12 delay: 5 - set_fact: grafana_host: "{{ ansible_host }}" - include_tasks: "common_tasks/create_grafana_api_keys.yml" - name: import grafana data source shell: > chdir={{ grafana_data_dir }} warn=no curl -q -X POST -d @data_source.json --header 'Content-Type: application/json' "http://{{ grafana_admin_user }}:{{ grafana_admin_password | urlencode | regex_replace('/','%2F') }}@127.0.0.1:{{ grafana_port }}/api/datasources" - name: import grafana dashboards - prepare config delegate_to: localhost template: src=grafana.dest.json.j2 dest={{ playbook_dir }}/scripts/dests-{{ inventory_hostname }}.json vars: - ansible_become: 
false - ansible_connection: local - grafana_dest_config: name: "{{ cluster_name | title }}" url: "http://{{ grafana_host }}:{{ grafana_port }}/" user: "{{ grafana_admin_user }}" password: "{{ grafana_admin_password }}" apikey: "{{ lookup('file', grafana_api_keys_dir + '/grafana_apikey.key') }}" datasource: "{{ cluster_name }}" titles: br: "{{ cluster_name | title }}-Backup-Restore" node: "{{ cluster_name | title }}-Node_exporter" pd: "{{ cluster_name | title }}-PD" tidb: "{{ cluster_name | title }}-TiDB" tidb_summary: "{{ cluster_name | title }}-TiDB-Summary" tikv_summary: "{{ cluster_name | title }}-TiKV-Summary" tikv_details: "{{ cluster_name | title }}-TiKV-Details" tikv_trouble_shot: "{{ cluster_name | title }}-TiKV-Trouble-Shooting" tiflash_summary: "{{ cluster_name | title }}-TiFlash-Summary" tiflash_proxy_summary: "{{ cluster_name | title }}-TiFlash-Proxy-Summary" tiflash_proxy_details: "{{ cluster_name | title }}-TiFlash-Proxy-Details" binlog: "{{ cluster_name | title }}-Binlog" overview: "{{ cluster_name | title }}-Overview" disk_performance: "{{ cluster_name | title }}-Disk-Performance" blackbox_exporter: "{{ cluster_name | title }}-Blackbox_exporter" kafka_overview: "{{ cluster_name | title }}-Kafka-Overview" lightning: "{{ cluster_name | title }}-Lightning" performance_read: "{{ cluster_name | title }}-Performance-Read" performance_write: "{{ cluster_name | title }}-Performance-Write" - name: import grafana dashboards - run import script delegate_to: localhost shell: "python grafana-config-copy.py dests-{{ inventory_hostname }}.json" args: chdir: "{{ playbook_dir }}/scripts" vars: - ansible_become: false - ansible_connection: local - hosts: localhost tags: - always roles: - { role: dashboard_topo } ================================================ FILE: start_drainer.yml ================================================ --- # Copyright 2018 PingCAP, Inc. 
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

# The playbook of TiDB

# Static configuration checks, run on the control machine.
- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags: [always]
  roles:
    - role: check_config_static

# Checks that need facts from the managed hosts.
- name: gather all facts, and check dest
  hosts: all
  any_errors_fatal: true
  tags: [always]
  roles:
    - role: check_config_dynamic

# Start drainer. Every task is gated on enable_binlog; the start method is
# chosen by process_supervision.
- hosts: drainer_servers
  tags: [drainer]
  tasks:
    - name: start drainer by supervise
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'
      shell: "cd {{ deploy_dir }}/scripts && ./start_drainer.sh"

    - name: start drainer by systemd
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'
      systemd:
        name: "drainer-{{ drainer_port }}.service"
        state: started
        enabled: false

    - name: wait until the drainer port is up
      when: enable_binlog|default(false)
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ drainer_port }}"
        state: started
        msg: "the drainer port {{ drainer_port }} is not up"

================================================
FILE: start_spark.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# The playbook of TiDB

# Start the Spark master first, then the slaves.
- hosts: spark_master
  tags:
    - spark_master
  tasks:
    - name: start spark master
      shell: sh {{ deploy_dir }}/spark/sbin/start-master.sh

- hosts: spark_slaves
  tags:
    - spark_slaves
  tasks:
    - name: start spark slave
      shell: sh {{ deploy_dir }}/spark/sbin/start-slave.sh

================================================
FILE: stop.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

# The playbook of TiDB
#
# Stops every component of the cluster. Each component play stops the
# service through the supervise script or the systemd unit (selected by
# `process_supervision`) and then waits until its port is closed.

# Static configuration checks, run on the control machine.
- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

# Checks that need facts from the managed hosts.
- name: gather all facts, and check dest
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

# node_exporter
- hosts: monitored_servers
  tags:
    - node_exporter
  tasks:
    - name: stop node_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - node_exporter
      when: process_supervision == 'supervise'

    - name: stop node_exporter by systemd
      systemd: name={{ item }} state=stopped
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - node_exporter-{{ node_exporter_port }}.service

    - name: wait until the node_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ node_exporter_port }}"
        state: stopped
        msg: "the node_exporter port {{ node_exporter_port }} is not down"

# blackbox_exporter
- hosts: monitored_servers
  tags:
    - blackbox_exporter
  tasks:
    - name: stop blackbox_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - blackbox_exporter
      when: process_supervision == 'supervise'

    # Only the blackbox_exporter unit is stopped here, so the task name
    # mentions only that unit.
    - name: stop blackbox_exporter by systemd
      systemd: name={{ item }} state=stopped
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - blackbox_exporter-{{ blackbox_exporter_port }}.service

    - name: wait until the blackbox_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ blackbox_exporter_port }}"
        state: stopped
        msg: "the blackbox_exporter port {{ blackbox_exporter_port }} is not down"

# alertmanager
- hosts: alertmanager_servers
  tags:
    - alertmanager
  tasks:
    - name: stop alertmanager by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_alertmanager.sh
      when: process_supervision == 'supervise'

    - name: stop alertmanager by systemd
      systemd: name=alertmanager-{{ alertmanager_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the alertmanager port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ alertmanager_port }}"
        state: stopped
        msg: "the alertmanager port {{ alertmanager_port }} is not down"

# pushgateway
- hosts: monitoring_servers
  tags:
    - pushgateway
  tasks:
    - name: stop pushgateway by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - pushgateway
      when: process_supervision == 'supervise'

    - name: stop pushgateway by systemd
      systemd: name={{ item }} state=stopped
      when: process_supervision == 'systemd'
      become: true
      with_items:
        - pushgateway-{{ pushgateway_port }}.service

    - name: wait until the pushgateway port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pushgateway_port }}"
        state: stopped
        msg: "the pushgateway port {{ pushgateway_port }} is not down"

# prometheus
- hosts: monitoring_servers
  tags:
    - prometheus
  tasks:
    - name: stop prometheus by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - prometheus
      when: process_supervision == 'supervise'

    - name: stop prometheus by systemd
      systemd: name={{ item }} state=stopped
      when: process_supervision == 'systemd'
      become: true
      with_items:
        - prometheus-{{ prometheus_port }}.service

    - name: wait until the prometheus port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ prometheus_port }}"
        state: stopped
        msg: "the prometheus port {{ prometheus_port }} is not down"

# kafka_exporter: every task is additionally gated on enable_binlog.
- hosts: kafka_exporter_servers
  tags:
    - kafka_exporter
  tasks:
    - name: stop kafka_exporter by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_kafka_exporter.sh
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: stop kafka_exporter by systemd
      become: true
      systemd: name=kafka_exporter-{{ kafka_exporter_port }}.service state=stopped enabled=no
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the kafka_exporter port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ kafka_exporter_port }}"
        state: stopped
        msg: "the kafka_exporter port {{ kafka_exporter_port }} is not down"
      when: enable_binlog|default(false)

# TiDB
- hosts: tidb_servers
  tags:
    - tidb
  tasks:
    - name: stop TiDB by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - tidb

    - name: stop TiDB by systemd
      systemd: name=tidb-{{ tidb_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiDB port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_port }}"
        state: stopped
        msg: "the TiDB port {{ tidb_port }} is not down"

# pump: every task is additionally gated on enable_binlog.
- hosts: pump_servers
  tags:
    - pump
  tasks:
    - name: stop pump by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'
      with_items:
        - pump

    - name: stop pump by systemd
      systemd: name=pump-{{ pump_port }}.service state=stopped
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the pump port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pump_port }}"
        state: stopped
        msg: "the pump port {{ pump_port }} is not down"
      when: enable_binlog|default(false)

# TiFlash: the systemd unit is named after tcp_port, while the port check
# uses http_port.
- hosts: tiflash_servers
  tags:
    - tiflash
  tasks:
    - name: stop TiFlash by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - tiflash

    - name: stop TiFlash by systemd
      systemd: name=tiflash-{{ tcp_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiFlash port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ http_port }}"
        state: stopped
        msg: "the TiFlash port {{ http_port }} is not down"

# TiKV
- hosts: tikv_servers
  tags:
    - tikv
  tasks:
    - name: stop TiKV by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - tikv

    - name: stop TiKV by systemd
      systemd: name=tikv-{{ tikv_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiKV port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_port }}"
        state: stopped
        msg: "the TiKV port {{ tikv_port }} is not down"

    # Read the pid file for display only; a missing file is tolerated.
    - command: cat {{ deploy_dir }}/status/tikv.pid
      register: old_tikv_pid
      ignore_errors: yes
      changed_when: false

    - name: display old tikv pid
      debug:
        msg: "tikv binary or docker pid: {{ old_tikv_pid.stdout }}"

# PD
- hosts: pd_servers
  tags:
    - pd
  tasks:
    - name: stop PD by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - pd

    - name: stop PD by systemd
      systemd: name=pd-{{ pd_client_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the PD port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pd_client_port }}"
        state: stopped
        msg: "the PD port {{ pd_client_port }} is not down"

# Grafana
- hosts: grafana_servers
  tags:
    - grafana
  tasks:
    - name: stop grafana by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - grafana

    - name: stop grafana by systemd
      systemd: name=grafana-{{ grafana_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the grafana port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ grafana_port }}"
        state: stopped
        msg: "the grafana port {{ grafana_port }} is not down"

================================================
FILE: stop_drainer.yml
================================================
---
# Copyright 2018 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

# The playbook of TiDB

# Static configuration checks, run on the control machine.
- name: check config locally
  hosts: localhost
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_static

# Checks that need facts from the managed hosts.
- name: gather all facts, and check dest
  hosts: all
  any_errors_fatal: true
  tags:
    - always
  roles:
    - check_config_dynamic

# Stop drainer. Every task is gated on enable_binlog.
- hosts: drainer_servers
  tags:
    - drainer
  tasks:
    - name: stop drainer by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_drainer.sh
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: stop drainer by systemd
      systemd: name=drainer-{{ drainer_port }}.service state=stopped enabled=no
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the drainer port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ drainer_port }}"
        state: stopped
        msg: "the drainer port {{ drainer_port }} is not down"
      when: enable_binlog|default(false)

================================================
FILE: stop_spark.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

# The playbook of TiDB

# Stop the Spark slaves first, then the master.
- hosts: spark_slaves
  tags: [spark_slaves]
  tasks:
    - name: stop spark slave
      shell: "sh {{ deploy_dir }}/spark/sbin/stop-slave.sh"

- hosts: spark_master
  tags: [spark_master]
  tasks:
    - name: stop spark master
      shell: "sh {{ deploy_dir }}/spark/sbin/stop-master.sh"

================================================
FILE: templates/grafana.dest.json.j2
================================================
[{{ grafana_dest_config | to_json }}]

================================================
FILE: unsafe_cleanup.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.
# The playbook of TiDB
#
# Destroys a deployment: asks for confirmation, stops everything via
# stop.yml, removes containers (docker deployments) and systemd unit files,
# then deletes the data directories and finally deploy_dir itself.

- hosts: localhost
  tasks:
    - name: confirm operation
      pause:
        prompt: "The operation will clear the cluster and the data will be deleted (Ctrl-c C or enter to continue, Ctrl-c A to Abort)"

- import_playbook: stop.yml

- import_playbook: unsafe_cleanup_container.yml
  when: deployment_method == 'docker'

# The "clean systemd config" tasks below delete the unit files that belong to
# each host group; they only run for systemd-supervised deployments.
- hosts: monitored_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - node_exporter-{{ node_exporter_port }}.service
        - blackbox_exporter-{{ blackbox_exporter_port }}.service

- hosts: alertmanager_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - alertmanager-{{ alertmanager_port }}.service

- hosts: monitoring_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - pushgateway-{{ pushgateway_port }}.service
        - prometheus-{{ prometheus_port }}.service

- hosts: kafka_exporter_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - kafka_exporter-{{ kafka_exporter_port }}.service

- hosts: tidb_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - tidb-{{ tidb_port }}.service

- hosts: pump_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'
      with_items:
        - pump-{{ pump_port }}.service

# TiKV: also remove the optional separate wal / raftdb directories.
- hosts: tikv_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - tikv-{{ tikv_port }}.service

    - name: cleaning up wal dir
      file: path={{ wal_dir }} state=absent
      when: "wal_dir is defined"

    - name: cleaning up raftdb dir
      file: path={{ raftdb_path }} state=absent
      when: "raftdb_path is defined"

# TiFlash: unit file, raft directory and data directories are all handled in
# this single play.
- hosts: tiflash_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - tiflash-{{ tcp_port }}.service

    - name: cleaning up raft dir
      file: path={{ raft_data_dir }} state=absent
      when: "raft_data_dir is defined"

    # data_dir may list several comma-separated paths; remove each of them.
    - name: cleaning up data dir
      file: path={{ item }} state=absent
      with_items: "{{ data_dir | split_string(',') }}"
      when: "data_dir is defined"

- hosts: pd_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - pd-{{ pd_client_port }}.service

- hosts: grafana_servers
  tasks:
    - name: clean systemd config
      file: path="/etc/systemd/system/{{ item }}" state=absent
      become: true
      when: process_supervision == 'systemd'
      with_items:
        - grafana-{{ grafana_port }}.service

# Finally wipe the deployment directory on every host.
- hosts: all
  tasks:
    # Fail fast on an empty or root deploy_dir: the rm below would otherwise
    # expand to `rm -rf /*`.
    - name: check deploy_dir is safe to remove
      fail:
        msg: "deploy_dir '{{ deploy_dir | default('') }}' is empty or '/', refusing to clean it up"
      when: (deploy_dir | default('') | string | trim) in ['', '/']

    - name: using rm command to clean up deploy_dir
      shell: "rm -rf {{ deploy_dir }}/*"

    - name: cleaning up deploy dir
      file: path={{ deploy_dir }} state=absent

================================================
FILE: unsafe_cleanup_container.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

# The playbook of TiDB

# Remove the docker containers of each component, one play per host group.
- hosts: monitored_servers
  tasks:
    - name: remove node_exporter/blackbox_exporter container
      with_items:
        - "node_exporter-{{ node_exporter_port }}"
        - "blackbox_exporter-{{ blackbox_exporter_port }}"
      docker_container:
        state: absent
        name: "{{ item }}"

- hosts: monitoring_servers
  tasks:
    - name: remove pushgateway and prometheus container
      with_items:
        - "pushgateway-{{ pushgateway_port }}"
        - "prometheus-{{ prometheus_port }}"
      docker_container:
        state: absent
        name: "{{ item }}"

- hosts: alertmanager_servers
  tasks:
    - name: remove alertmanager container
      docker_container:
        state: absent
        name: "alertmanager-{{ alertmanager_port }}"

- hosts: pd_servers
  tasks:
    - name: remove pd container
      docker_container:
        state: absent
        name: pd

- hosts: tikv_servers
  tasks:
    - name: remove TiKV container
      docker_container:
        state: absent
        name: "tikv-{{ tikv_port }}"

# The pump container is removed from the tidb_servers hosts, and only when
# binlog is enabled.
- hosts: tidb_servers
  tags: [tidb]
  tasks:
    - name: remove tidb container
      docker_container:
        state: absent
        name: "tidb-{{ tidb_port }}"

    - name: remove pump container
      when: enable_binlog|default(false)
      docker_container:
        state: absent
        name: pump

- hosts: grafana_servers
  tasks:
    - name: remove grafana container
      docker_container:
        state: absent
        name: "grafana-{{ grafana_port }}"

================================================
FILE: unsafe_cleanup_data.yml
================================================
---
# Copyright 2016 PingCAP, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# See the License for the specific language governing permissions and
# limitations under the License.

# The playbook of TiDB
#
# Wipes the database data but keeps the deployment: after confirmation each
# component is stopped, its data directories are deleted and (except for
# pump) recreated empty with mode 0755.

- hosts: localhost
  tasks:
    - name: confirm operation
      pause:
        prompt: "The database data will be deleted (Ctrl-c C or enter to continue, Ctrl-c A to Abort)"

# Static configuration checks, run on the control machine.
- name: check config locally
  hosts: localhost
  tags:
    - always
  roles:
    - check_config_static

# Checks that need facts from the managed hosts.
- name: gather all facts, and check dest
  hosts: all
  tags:
    - always
  roles:
    - check_config_dynamic

# TiDB is only stopped; no TiDB directory is removed by this play.
- name: TiDB cluster
  hosts: tidb_servers
  tags:
    - tidb
  tasks:
    - name: stop TiDB by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - tidb

    - name: stop TiDB by systemd
      systemd: name=tidb-{{ tidb_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiDB port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tidb_port }}"
        state: stopped
        msg: "the TiDB port {{ tidb_port }} is not down"

# pump: every task is gated on enable_binlog.
- name: pump cluster
  hosts: pump_servers
  tags:
    - pump
  tasks:
    - name: stop pump by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      with_items:
        - pump
      when:
        - enable_binlog|default(false)
        - process_supervision == 'supervise'

    - name: stop pump by systemd
      systemd: name=pump-{{ pump_port }}.service state=stopped
      become: true
      when:
        - enable_binlog|default(false)
        - process_supervision == 'systemd'

    - name: wait until the pump port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pump_port }}"
        state: stopped
        msg: "the pump port {{ pump_port }} is not down"
      when: enable_binlog|default(false)

    - name: clean pump data
      file: path={{ pump_data_dir }} state=absent
      when: enable_binlog|default(false)

# TiKV: data dir plus the optional separate wal / raftdb directories.
- name: TiKV cluster
  hosts: tikv_servers
  tasks:
    - name: stop TiKV by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - tikv

    - name: stop TiKV by systemd
      systemd: name=tikv-{{ tikv_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiKV port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ tikv_port }}"
        state: stopped
        msg: "the TiKV port {{ tikv_port }} is not down"

    - name: clean TiKV data
      file: path={{ tikv_data_dir }} state=absent

    - name: create new TiKV data dir
      file: path={{ tikv_data_dir }} state=directory mode=0755

    - name: clean TiKV wal data
      file: path={{ wal_dir }} state=absent
      when: wal_dir is defined

    - name: create new TiKV wal data dir
      file: path={{ wal_dir }} state=directory mode=0755
      when: wal_dir is defined

    - name: clean TiKV raftdb data
      file: path={{ raftdb_path }} state=absent
      when: raftdb_path is defined

    - name: create new TiKV raftdb data dir
      file: path={{ raftdb_path }} state=directory mode=0755
      when: raftdb_path is defined

# TiFlash: the systemd unit is named after tcp_port, while the port check
# uses http_port.
- name: TiFlash cluster
  hosts: tiflash_servers
  tasks:
    - name: stop TiFlash by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - tiflash

    - name: stop TiFlash by systemd
      systemd: name=tiflash-{{ tcp_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the TiFlash port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ http_port }}"
        state: stopped
        msg: "the TiFlash port {{ http_port }} is not down"

    - name: clean TiFlash raft data dir
      file: path={{ raft_data_dir }} state=absent
      when: raft_data_dir is defined

    - name: create new TiFlash raft data dir
      file: path={{ raft_data_dir }} state=directory mode=0755
      when: raft_data_dir is defined

    # data_dir may list several comma-separated paths; remove each of them.
    - name: clean TiFlash data dir
      file: path={{ item }} state=absent
      with_items: "{{ data_dir | split_string(',') }}"
      when: data_dir is defined

    # Recreate every path that was removed above. Passing the raw
    # comma-separated data_dir as one path would create a single directory
    # whose name contains the commas.
    - name: create new TiFlash data dir
      file: path={{ item }} state=directory mode=0755
      with_items: "{{ data_dir | split_string(',') }}"
      when: data_dir is defined

# PD
- name: PD cluster
  hosts: pd_servers
  tasks:
    - name: stop PD by supervise
      shell: cd {{ deploy_dir }}/scripts && ./stop_{{ item }}.sh
      when: process_supervision == 'supervise'
      with_items:
        - pd

    - name: stop PD by systemd
      systemd: name=pd-{{ pd_client_port }}.service state=stopped
      become: true
      when: process_supervision == 'systemd'

    - name: wait until the PD port is down
      wait_for:
        host: "{{ ansible_host }}"
        port: "{{ pd_client_port }}"
        state: stopped
        msg: "the PD port {{ pd_client_port }} is not down"

    - name: clean PD data
      file: path={{ pd_data_dir }} state=absent

    - name: create new PD data dir
      file: path={{ pd_data_dir }} state=directory mode=0755