Repository: Vonng/pg_exporter Branch: main Commit: e303f2ad915c Files: 158 Total size: 1.5 MB Directory structure: gitextract_340wwud1/ ├── .github/ │ └── workflows/ │ ├── release.yaml │ └── test-release.yaml ├── .gitignore ├── .goreleaser.yml ├── Dockerfile ├── Dockerfile.goreleaser ├── LICENSE ├── Makefile ├── README.md ├── config/ │ ├── 0000-doc.yml │ ├── 0110-pg.yml │ ├── 0120-pg_meta.yml │ ├── 0130-pg_setting.yml │ ├── 0210-pg_repl.yml │ ├── 0220-pg_sync_standby.yml │ ├── 0230-pg_downstream.yml │ ├── 0240-pg_slot.yml │ ├── 0250-pg_recv.yml │ ├── 0260-pg_sub.yml │ ├── 0270-pg_origin.yml │ ├── 0300-pg_io.yml │ ├── 0310-pg_size.yml │ ├── 0320-pg_archiver.yml │ ├── 0330-pg_bgwriter.yml │ ├── 0331-pg_checkpointer.yml │ ├── 0340-pg_ssl.yml │ ├── 0350-pg_checkpoint.yml │ ├── 0355-pg_timeline.yml │ ├── 0360-pg_recovery.yml │ ├── 0370-pg_slru.yml │ ├── 0380-pg_shmem.yml │ ├── 0390-pg_wal.yml │ ├── 0410-pg_activity.yml │ ├── 0420-pg_wait.yml │ ├── 0430-pg_backend.yml │ ├── 0440-pg_xact.yml │ ├── 0450-pg_lock.yml │ ├── 0460-pg_query.yml │ ├── 0510-pg_vacuuming.yml │ ├── 0520-pg_indexing.yml │ ├── 0530-pg_clustering.yml │ ├── 0540-pg_backup.yml │ ├── 0610-pg_db.yml │ ├── 0620-pg_db_confl.yml │ ├── 0640-pg_pubrel.yml │ ├── 0650-pg_subrel.yml │ ├── 0700-pg_table.yml │ ├── 0710-pg_index.yml │ ├── 0720-pg_func.yml │ ├── 0730-pg_seq.yml │ ├── 0740-pg_relkind.yml │ ├── 0750-pg_defpart.yml │ ├── 0810-pg_table_size.yml │ ├── 0820-pg_table_bloat.yml │ ├── 0830-pg_index_bloat.yml │ ├── 0910-pgbouncer_list.yml │ ├── 0920-pgbouncer_database.yml │ ├── 0930-pgbouncer_stat.yml │ ├── 0940-pgbouncer_pool.yml │ ├── 1000-pg_wait_event.yml │ ├── 1800-pg_tsdb_hypertable.yml │ ├── 1900-pg_citus.yml │ └── 2000-pg_heartbeat.yml ├── docker/ │ ├── .dockerignore │ ├── README.md │ ├── build.sh │ └── release.sh ├── exporter/ │ ├── arg.go │ ├── args_normalize.go │ ├── args_normalize_test.go │ ├── collector.go │ ├── column.go │ ├── concurrency_test.go │ ├── config.go │ ├── config_coverage_pg9_test.go │ 
├── config_coverage_test.go │ ├── config_merged_test.go │ ├── config_style_test.go │ ├── config_test.go │ ├── exporter.go │ ├── exporter_handlers_opts_test.go │ ├── global.go │ ├── health_state_test.go │ ├── main.go │ ├── metrics_lifecycle_test.go │ ├── pgurl.go │ ├── pgurl_test.go │ ├── predicate_cache_test.go │ ├── probehealth_pgbouncer_test.go │ ├── prom_validate.go │ ├── query.go │ ├── query_column_test.go │ ├── reload_signals_unix.go │ ├── reload_signals_windows.go │ ├── reload_test.go │ ├── server.go │ ├── server_exporter_test.go │ ├── testmain_test.go │ ├── utils.go │ ├── utils_test.go │ ├── validate_labels.go │ └── validate_labels_test.go ├── go.mod ├── go.sum ├── hugo.yaml ├── legacy/ │ ├── README.md │ ├── config/ │ │ ├── 0000-doc.yml │ │ ├── 0110-pg.yml │ │ ├── 0120-pg_meta.yml │ │ ├── 0130-pg_setting.yml │ │ ├── 0210-pg_repl.yml │ │ ├── 0220-pg_sync_standby.yml │ │ ├── 0230-pg_downstream.yml │ │ ├── 0240-pg_slot.yml │ │ ├── 0250-pg_recv.yml │ │ ├── 0270-pg_origin.yml │ │ ├── 0310-pg_size.yml │ │ ├── 0320-pg_archiver.yml │ │ ├── 0330-pg_bgwriter.yml │ │ ├── 0331-pg_checkpointer.yml │ │ ├── 0340-pg_ssl.yml │ │ ├── 0350-pg_checkpoint.yml │ │ ├── 0355-pg_timeline.yml │ │ ├── 0360-pg_recovery.yml │ │ ├── 0410-pg_activity.yml │ │ ├── 0420-pg_wait.yml │ │ ├── 0440-pg_xact.yml │ │ ├── 0450-pg_lock.yml │ │ ├── 0460-pg_query.yml │ │ ├── 0610-pg_db.yml │ │ ├── 0620-pg_db_confl.yml │ │ ├── 0700-pg_table.yml │ │ ├── 0710-pg_index.yml │ │ ├── 0720-pg_func.yml │ │ ├── 0740-pg_relkind.yml │ │ ├── 0810-pg_table_size.yml │ │ ├── 0820-pg_table_bloat.yml │ │ ├── 0830-pg_index_bloat.yml │ │ ├── 0910-pgbouncer_list.yml │ │ ├── 0920-pgbouncer_database.yml │ │ ├── 0930-pgbouncer_stat.yml │ │ ├── 0940-pgbouncer_pool.yml │ │ ├── 1800-pg_tsdb_hypertable.yml │ │ ├── 1900-pg_citus.yml │ │ └── 2000-pg_heartbeat.yml │ └── pg_exporter.yml ├── main.go ├── monitor/ │ ├── initdb.sh │ ├── pgrds-instance.json │ └── pgsql-exporter.json ├── package/ │ ├── nfpm-amd64-deb.yaml │ ├── 
nfpm-amd64-rpm.yaml │ ├── nfpm-arm64-deb.yaml │ ├── nfpm-arm64-rpm.yaml │ ├── pg_exporter.default │ ├── pg_exporter.service │ └── preinstall.sh └── pg_exporter.yml ================================================ FILE CONTENTS ================================================ ================================================ FILE: .github/workflows/release.yaml ================================================ name: Release on: push: tags: - 'v*.*.*' permissions: contents: write jobs: release: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Go uses: actions/setup-go@v5 with: go-version-file: 'go.mod' cache: true - name: Run unit tests run: go test ./... - name: Run go vet run: go vet ./... - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Login to Docker Hub uses: docker/login-action@v3 with: username: ${{ secrets.DOCKERHUB_USERNAME }} password: ${{ secrets.DOCKERHUB_TOKEN }} - name: Run GoReleaser uses: goreleaser/goreleaser-action@v6 with: distribution: goreleaser version: latest args: release --clean env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - name: Upload artifacts uses: actions/upload-artifact@v4 if: always() with: name: dist path: dist/ ================================================ FILE: .github/workflows/test-release.yaml ================================================ name: Test Release on: workflow_dispatch: # 允许手动触发 pull_request: paths: - '.goreleaser.yml' - '.github/workflows/release.yaml' - '.github/workflows/test-release.yaml' permissions: contents: read jobs: test: runs-on: ubuntu-latest steps: - name: Checkout uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Go uses: actions/setup-go@v5 with: go-version-file: 'go.mod' cache: true - name: Set up QEMU uses: docker/setup-qemu-action@v3 - name: Set up Docker Buildx uses: docker/setup-buildx-action@v3 - name: Run unit tests run: go test ./... 
- name: Run go vet run: go vet ./... - name: Test GoReleaser config uses: goreleaser/goreleaser-action@v6 with: distribution: goreleaser version: latest args: check - name: Build snapshot uses: goreleaser/goreleaser-action@v6 with: distribution: goreleaser version: latest args: release --snapshot --clean --skip=publish,docker - name: List artifacts run: | echo "Generated artifacts:" ls -lh dist/*.tar.gz || true echo "" echo "Checksums:" cat dist/checksums.txt || true ================================================ FILE: .gitignore ================================================ # binary files pg_exporter # tmp files test/ deploy/ upload.sh temp/ dist/ .DS_Store # IDE files .vscode/ .idea/ .code/ .claude .codex/ .codex_tmp/ pg_exporter.iml CLAUDE.md .hugo_build.lock public/ resources/ tmp/ ================================================ FILE: .goreleaser.yml ================================================ version: 2 env: - CGO_ENABLED=0 before: hooks: - go mod download - go mod tidy builds: - id: pg_exporter main: ./main.go binary: pg_exporter goos: - linux - darwin - windows goarch: - amd64 - arm64 - ppc64le goarm: - 6 - 7 goamd64: - v1 ignore: # Darwin only supports amd64 and arm64 - goos: darwin goarch: ppc64le # Windows only supports amd64 and 386 - goos: windows goarch: arm64 - goos: windows goarch: arm - goos: windows goarch: ppc64le ldflags: - -s -w - -extldflags "-static" - -X 'pg_exporter/exporter.Version={{.Version}}' - -X 'pg_exporter/exporter.Branch={{.Branch}}' - -X 'pg_exporter/exporter.Revision={{.ShortCommit}}' - -X 'pg_exporter/exporter.BuildDate={{.Date}}' flags: - -a archives: - id: pg_exporter name_template: >- {{ .ProjectName }}-{{ .Version }}.{{ .Os }}- {{- if eq .Arch "amd64" }}amd64 {{- else if eq .Arch "386" }}386 {{- else if eq .Arch "arm64" }}arm64 {{- else if eq .Arch "arm" }}armv{{ .Arm }} {{- else if eq .Arch "ppc64le" }}ppc64le {{- else }}{{ .Arch }}{{ end }} files: - pg_exporter.yml - LICENSE - package/pg_exporter.default - 
package/pg_exporter.service nfpms: - id: pg_exporter_rpm package_name: pg_exporter file_name_template: >- {{ .PackageName }}-{{ .Version }}-{{ .Release }}. {{- if eq .Arch "amd64" }}x86_64 {{- else if eq .Arch "arm64" }}aarch64 {{- else }}{{ .Arch }}{{ end }} vendor: PGSTY homepage: https://pigsty.io/docs/pg_exporter maintainer: Ruohang Feng description: | Prometheus exporter for PostgreSQL / Pgbouncer server metrics. Supported version: Postgres9.4 - 17+ & Pgbouncer 1.8 - 1.24+ Part of Project Pigsty -- Battery Included PostgreSQL Distribution with ultimate observability support: https://pigsty.io/docs license: Apache-2.0 formats: - rpm bindir: /usr/bin release: "1" section: database priority: optional contents: - src: pg_exporter.yml dst: /etc/pg_exporter.yml type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.default dst: /etc/default/pg_exporter type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.service dst: /usr/lib/systemd/system/pg_exporter.service type: config - src: LICENSE dst: /usr/share/doc/pg_exporter/LICENSE file_info: mode: 0644 scripts: preinstall: package/preinstall.sh rpm: compression: gzip prefixes: - /usr/bin - id: pg_exporter_deb package_name: pg-exporter file_name_template: >- {{ .PackageName }}_{{ .Version }}-{{ .Release }}_ {{- if eq .Arch "amd64" }}amd64 {{- else if eq .Arch "arm64" }}arm64 {{- else }}{{ .Arch }}{{ end }} vendor: PGSTY homepage: https://pigsty.io/docs/pg_exporter maintainer: Ruohang Feng description: | Prometheus exporter for PostgreSQL / Pgbouncer server metrics. 
Supported version: Postgres9.4 - 17+ & Pgbouncer 1.8 - 1.24+ Part of Project Pigsty -- Battery Included PostgreSQL Distribution with ultimate observability support: https://pigsty.io/docs license: Apache-2.0 formats: - deb bindir: /usr/bin release: "1" section: database priority: optional contents: - src: pg_exporter.yml dst: /etc/pg_exporter.yml type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.default dst: /etc/default/pg_exporter type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.service dst: /lib/systemd/system/pg_exporter.service type: config - src: LICENSE dst: /usr/share/doc/pg_exporter/LICENSE file_info: mode: 0644 scripts: preinstall: package/preinstall.sh checksum: name_template: 'checksums.txt' algorithm: sha256 snapshot: version_template: "{{ .Tag }}-next" changelog: sort: asc filters: exclude: - '^docs:' - '^test:' - '^chore:' - 'Merge pull request' - 'Merge branch' release: github: owner: pgsty name: pg_exporter draft: false prerelease: false mode: replace # Replace existing release with same tag replace_existing_artifacts: true # Replace existing artifacts name_template: "{{.ProjectName}}-v{{.Version}}" disable: false discussion_category_name: "" # Skip discussion creation announce: skip: true # Skip all announcements # Docker configuration for multi-arch images dockers: - id: pg_exporter_amd64 ids: - pg_exporter goos: linux goarch: amd64 image_templates: - "pgsty/pg_exporter:{{ .Version }}-amd64" - "pgsty/pg_exporter:latest-amd64" dockerfile: Dockerfile.goreleaser use: buildx build_flag_templates: - "--platform=linux/amd64" - "--label=org.opencontainers.image.version={{.Version}}" - "--label=org.opencontainers.image.created={{.Date}}" - "--label=org.opencontainers.image.revision={{.FullCommit}}" extra_files: - pg_exporter.yml - LICENSE - id: pg_exporter_arm64 ids: - pg_exporter goos: linux goarch: arm64 image_templates: - 
"pgsty/pg_exporter:{{ .Version }}-arm64" - "pgsty/pg_exporter:latest-arm64" dockerfile: Dockerfile.goreleaser use: buildx build_flag_templates: - "--platform=linux/arm64" - "--label=org.opencontainers.image.version={{.Version}}" - "--label=org.opencontainers.image.created={{.Date}}" - "--label=org.opencontainers.image.revision={{.FullCommit}}" extra_files: - pg_exporter.yml - LICENSE docker_manifests: - name_template: "pgsty/pg_exporter:{{ .Version }}" image_templates: - "pgsty/pg_exporter:{{ .Version }}-amd64" - "pgsty/pg_exporter:{{ .Version }}-arm64" - name_template: "pgsty/pg_exporter:latest" image_templates: - "pgsty/pg_exporter:latest-amd64" - "pgsty/pg_exporter:latest-arm64" ================================================ FILE: Dockerfile ================================================ # syntax=docker/dockerfile:1 FROM golang:1.26.2-alpine AS builder-env ARG GOPROXY=https://proxy.golang.org,direct ARG GOSUMDB=sum.golang.org ENV GOPROXY=${GOPROXY} ENV GOSUMDB=${GOSUMDB} # Build a self-contained pg_exporter container with a clean environment and no # dependencies. # # build with # # docker buildx build -f Dockerfile --tag pg_exporter . # WORKDIR /build COPY go.mod go.sum ./ RUN \ --mount=type=cache,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ CGO_ENABLED=0 GOOS=linux go mod download COPY . /build RUN \ --mount=type=cache,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ CGO_ENABLED=0 GOOS=linux go build -a -o /pg_exporter . 
FROM scratch LABEL org.opencontainers.image.authors="Ruohang Feng , Craig Ringer " \ org.opencontainers.image.url="https://github.com/pgsty/pg_exporter" \ org.opencontainers.image.source="https://github.com/pgsty/pg_exporter" \ org.opencontainers.image.licenses="Apache-2.0" \ org.opencontainers.image.title="pg_exporter" \ org.opencontainers.image.description="PostgreSQL/Pgbouncer metrics exporter for Prometheus" WORKDIR /bin COPY --from=builder-env /pg_exporter /bin/pg_exporter COPY pg_exporter.yml /etc/pg_exporter.yml EXPOSE 9630/tcp ENTRYPOINT ["/bin/pg_exporter"] ================================================ FILE: Dockerfile.goreleaser ================================================ # Dockerfile for goreleaser # This uses pre-built binaries from goreleaser instead of building from source FROM scratch LABEL org.opencontainers.image.authors="Ruohang Feng " \ org.opencontainers.image.url="https://github.com/pgsty/pg_exporter" \ org.opencontainers.image.source="https://github.com/pgsty/pg_exporter" \ org.opencontainers.image.licenses="Apache-2.0" \ org.opencontainers.image.title="pg_exporter" \ org.opencontainers.image.description="PostgreSQL/Pgbouncer metrics exporter for Prometheus" WORKDIR /bin COPY pg_exporter /bin/pg_exporter COPY pg_exporter.yml /etc/pg_exporter.yml COPY LICENSE /LICENSE EXPOSE 9630/tcp ENTRYPOINT ["/bin/pg_exporter"] ================================================ FILE: LICENSE ================================================ Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. 
"Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the 
Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. 
Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. 
To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [2019-2025] [Ruohang Feng](rh@vonng.com) Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
================================================ FILE: Makefile ================================================ #==============================================================# # File : Makefile # Mtime : 2025-08-14 # License : Apache-2.0 @ https://github.com/pgsty/pg_exporter # Copyright : 2018-2026 Ruohang Feng / Vonng (rh@vonng.com) #==============================================================# VERSION ?= v1.2.2 BUILD_DATE := $(shell date '+%Y%m%d%H%M%S') GIT_BRANCH := $(shell git rev-parse --abbrev-ref HEAD 2>/dev/null || echo "unknown") GIT_REVISION := $(shell git rev-parse --short HEAD 2>/dev/null || echo "HEAD") LDFLAGS_META := -X 'pg_exporter/exporter.Version=$(VERSION)' \ -X 'pg_exporter/exporter.Branch=$(GIT_BRANCH)' \ -X 'pg_exporter/exporter.Revision=$(GIT_REVISION)' \ -X 'pg_exporter/exporter.BuildDate=$(BUILD_DATE)' LDFLAGS_STATIC := -s -w -extldflags \"-static\" $(LDFLAGS_META) # Release Dir LINUX_AMD_DIR:=dist/$(VERSION)/pg_exporter-$(VERSION).linux-amd64 LINUX_ARM_DIR:=dist/$(VERSION)/pg_exporter-$(VERSION).linux-arm64 DARWIN_AMD_DIR:=dist/$(VERSION)/pg_exporter-$(VERSION).darwin-amd64 DARWIN_ARM_DIR:=dist/$(VERSION)/pg_exporter-$(VERSION).darwin-arm64 WINDOWS_DIR:=dist/$(VERSION)/pg_exporter-$(VERSION).windows-amd64 ############################################################### # Shortcuts # ############################################################### build: go build -ldflags "$(LDFLAGS_META)" -o pg_exporter clean: rm -rf pg_exporter build-darwin-amd64: CGO_ENABLED=0 GOOS=darwin GOARCH=amd64 go build -a -ldflags "$(LDFLAGS_STATIC)" -o pg_exporter build-darwin-arm64: CGO_ENABLED=0 GOOS=darwin GOARCH=arm64 go build -a -ldflags "$(LDFLAGS_STATIC)" -o pg_exporter build-linux-amd64: CGO_ENABLED=0 GOOS=linux GOARCH=amd64 go build -a -ldflags "$(LDFLAGS_STATIC)" -o pg_exporter build-linux-arm64: CGO_ENABLED=0 GOOS=linux GOARCH=arm64 go build -a -ldflags "$(LDFLAGS_STATIC)" -o pg_exporter r: release release: release-linux release-darwin 
release-linux: linux-amd64 linux-arm64 linux-amd64: clean build-linux-amd64 rm -rf $(LINUX_AMD_DIR) && mkdir -p $(LINUX_AMD_DIR) nfpm package --packager rpm --config package/nfpm-amd64-rpm.yaml --target dist/$(VERSION) nfpm package --packager deb --config package/nfpm-amd64-deb.yaml --target dist/$(VERSION) cp pg_exporter $(LINUX_AMD_DIR)/pg_exporter cp pg_exporter.yml $(LINUX_AMD_DIR)/pg_exporter.yml cp LICENSE $(LINUX_AMD_DIR)/LICENSE tar -czf dist/$(VERSION)/pg_exporter-$(VERSION).linux-amd64.tar.gz -C dist/$(VERSION) pg_exporter-$(VERSION).linux-amd64 rm -rf $(LINUX_AMD_DIR) linux-arm64: clean build-linux-arm64 rm -rf $(LINUX_ARM_DIR) && mkdir -p $(LINUX_ARM_DIR) nfpm package --packager rpm --config package/nfpm-arm64-rpm.yaml --target dist/$(VERSION) nfpm package --packager deb --config package/nfpm-arm64-deb.yaml --target dist/$(VERSION) cp pg_exporter $(LINUX_ARM_DIR)/pg_exporter cp pg_exporter.yml $(LINUX_ARM_DIR)/pg_exporter.yml cp LICENSE $(LINUX_ARM_DIR)/LICENSE tar -czf dist/$(VERSION)/pg_exporter-$(VERSION).linux-arm64.tar.gz -C dist/$(VERSION) pg_exporter-$(VERSION).linux-arm64 rm -rf $(LINUX_ARM_DIR) release-darwin: darwin-amd64 darwin-arm64 darwin-amd64: clean build-darwin-amd64 rm -rf $(DARWIN_AMD_DIR) && mkdir -p $(DARWIN_AMD_DIR) cp pg_exporter $(DARWIN_AMD_DIR)/pg_exporter cp pg_exporter.yml $(DARWIN_AMD_DIR)/pg_exporter.yml cp LICENSE $(DARWIN_AMD_DIR)/LICENSE tar -czf dist/$(VERSION)/pg_exporter-$(VERSION).darwin-amd64.tar.gz -C dist/$(VERSION) pg_exporter-$(VERSION).darwin-amd64 rm -rf $(DARWIN_AMD_DIR) darwin-arm64: clean build-darwin-arm64 rm -rf $(DARWIN_ARM_DIR) && mkdir -p $(DARWIN_ARM_DIR) cp pg_exporter $(DARWIN_ARM_DIR)/pg_exporter cp pg_exporter.yml $(DARWIN_ARM_DIR)/pg_exporter.yml cp LICENSE $(DARWIN_ARM_DIR)/LICENSE tar -czf dist/$(VERSION)/pg_exporter-$(VERSION).darwin-arm64.tar.gz -C dist/$(VERSION) pg_exporter-$(VERSION).darwin-arm64 rm -rf $(DARWIN_ARM_DIR) ############################################################### # 
Configuration # ############################################################### # generate merged config from separated configuration conf: rm -rf pg_exporter.yml cat config/*.yml >> pg_exporter.yml # generate legacy merged config for PostgreSQL 9.1 - 9.6 conf9: rm -rf legacy/pg_exporter.yml cat legacy/config/*.yml >> legacy/pg_exporter.yml # Backward-compatible alias (deprecated) conf-pg9: conf9 ############################################################### # Release # ############################################################### release-dir: mkdir -p dist/$(VERSION) release-clean: rm -rf dist/$(VERSION) ############################################################### # GoReleaser # ############################################################### # Install goreleaser if not present goreleaser-install: @which goreleaser > /dev/null || (echo "Installing goreleaser..." && go install github.com/goreleaser/goreleaser/v2@latest) # Build snapshot release (without publishing) goreleaser-snapshot: goreleaser-install goreleaser release --snapshot --clean --skip=publish # Build release locally (without git tag) goreleaser-build: goreleaser-install goreleaser build --snapshot --clean # Build release locally without snapshot suffix (requires clean git) goreleaser-local: goreleaser-install goreleaser release --clean --skip=publish # Release with goreleaser (requires git tag) goreleaser-release: goreleaser-install goreleaser release --clean # Test release (creates prerelease, no notifications) goreleaser-test-release: goreleaser-install @echo "Creating test release (prerelease mode, no notifications)..." goreleaser release --clean # Production release (set prerelease to false in config first) goreleaser-prod-release: goreleaser-install @echo "Creating production release (will notify subscribers if announce.skip is false)..." 
goreleaser release --clean # Check goreleaser configuration goreleaser-check: goreleaser-install goreleaser check # New main release task using goreleaser release-new: goreleaser-release # build docker image docker: docker-build docker-build: ./docker/build.sh docker-release: ./docker/release.sh ############################################################### # Develop # ############################################################### install: build sudo install -m 0755 pg_exporter /usr/bin/pg_exporter uninstall: sudo rm -rf /usr/bin/pg_exporter runb: ./pg_exporter --log.level=info --config=pg_exporter.yml --auto-discovery run: go run main.go --log.level=info --config=pg_exporter.yml --auto-discovery debug: go run main.go --log.level=debug --config=pg_exporter.yml --auto-discovery curl: curl localhost:9630/metrics | grep -v '#' | grep pg_ upload: ./upload.sh d: dev dev: hugo serve .PHONY: build clean build-darwin build-linux\ release release-darwin release-linux release-windows docker docker-build docker-release \ install uninstall debug curl upload \ goreleaser-install goreleaser-snapshot goreleaser-build goreleaser-release goreleaser-test-release \ goreleaser-check release-new goreleaser-local ================================================ FILE: README.md ================================================

PG Exporter Logo

# PG EXPORTER [![Website: https://pigsty.io/docs/pg_exporter](https://img.shields.io/badge/website-pigsty.io/docs/pg_exporter-slategray?style=flat&logo=cilium&logoColor=white)](https://pigsty.io/docs/pg_exporter) [![DockerHub: pgsty/pg_exporter](https://img.shields.io/badge/docker-pgsty/pg_exporter-slategray?style=flat&logo=docker&logoColor=white)](https://hub.docker.com/r/pgsty/pg_exporter) [![Version: 1.2.2](https://img.shields.io/badge/version-1.2.2-slategray?style=flat&logo=cilium&logoColor=white)](https://github.com/pgsty/pg_exporter/releases/tag/v1.2.2) [![License: Apache-2.0](https://img.shields.io/github/license/pgsty/pg_exporter?logo=opensourceinitiative&logoColor=green&color=slategray)](https://github.com/pgsty/pg_exporter/blob/main/LICENSE) [![GitHub Stars](https://img.shields.io/github/stars/pgsty/pg_exporter?style=flat&logo=github&logoColor=black&color=slategray)](https://star-history.com/#pgsty/pg_exporter&Date) [![Go Report Card](https://goreportcard.com/badge/github.com/pgsty/pg_exporter)](https://goreportcard.com/report/github.com/pgsty/pg_exporter) > **Advanced [PostgreSQL](https://www.postgresql.org) & [pgBouncer](https://www.pgbouncer.org/) metrics [exporter](https://prometheus.io/docs/instrumenting/exporters/) for [Prometheus](https://prometheus.io/)** PG Exporter brings the ultimate monitoring experience to your PostgreSQL with **declarative config**, **dynamic planning**, and **customizable collectors**. It provides **600+** metrics and ~3K time series per instance, covering everything you'll need for PostgreSQL observability. Check [**https://demo.pigsty.io**](https://demo.pigsty.io/ui/) for a live demo, which is built upon this exporter by [**Pigsty**](https://pigsty.io).
DocsQuick StartFeaturesUsageAPIDeploymentCollectorsDemo

[![pigsty-dashboard](https://pigsty.io/img/pigsty/dashboard.jpg)](https://demo.pigsty.io) -------- ## Features - **Highly Customizable**: Define almost all metrics through declarative YAML configs - **Full Coverage**: Monitor PostgreSQL (10-18+) and pgBouncer (1.8-1.25+) in a single exporter - **Fine-grained Control**: Configure timeout, caching, skip conditions, and fatality per collector - **Dynamic Planning**: Define multiple query branches based on different conditions - **Self-monitoring**: Rich metrics about pg_exporter [itself](https://demo.pigsty.io/d/pgsql-exporter) for complete observability - **Production-Ready**: Battle-tested in real-world environments across 12K+ cores for 6+ years - **Auto-discovery**: Automatically discover and monitor multiple databases within an instance - **Health Check APIs**: Comprehensive HTTP endpoints for service health and traffic routing - **Extension Support**: `timescaledb`, `citus`, `pg_stat_statements`, `pg_wait_sampling`,... - **Local-first URL behavior**: Built for on-host deployment, with implicit local target fallback and automatic `sslmode=disable` when omitted > Also support PG 9.x with [legacy config bundle](legacy/). -------- ## Quick Start RPM / DEB / Tarball available in the GitHub [release page](https://github.com/pgsty/pg_exporter/releases), and Pigsty's YUM / APT [Infra Repo](https://pigsty.io/docs/repo/infra). To run this exporter, you need to pass the postgres/pgbouncer URL via env or arg: ```bash PG_EXPORTER_URL='postgres://user:pass@host:port/postgres' pg_exporter curl http://localhost:9630/metrics # access metrics ``` There are built-in metrics such as `pg_up`, `pg_version`, `pg_in_recovery`, `pg_exporter_build_info`, and exporter self-metrics under `pg_exporter_*` (disable with `--disable-intro`). **All other metrics are defined in the [`pg_exporter.yml`](pg_exporter.yml) config file**. There are two monitoring dashboards in the [`monitor/`](monitor/) directory. 
You can use [**Pigsty**](https://pigsty.io) to monitor existing PostgreSQL cluster or RDS, it will setup pg_exporter for you. -------- ## Usage ```bash usage: pg_exporter [] Flags: -h, --[no-]help Show context-sensitive help (also try --help-long and --help-man). -u, --url=URL postgres target url -c, --config=CONFIG path to config dir or file --web.listen-address=:9630 ... Addresses on which to expose metrics and web interface. --web.config.file="" Path to configuration file that can enable TLS or authentication. -l, --label="" constant lables:comma separated list of label=value pair ($PG_EXPORTER_LABEL) -t, --tag="" tags,comma separated list of server tag ($PG_EXPORTER_TAG) -C, --[no-]disable-cache force not using cache ($PG_EXPORTER_DISABLE_CACHE) -m, --[no-]disable-intro disable internal/exporter self metrics ($PG_EXPORTER_DISABLE_INTRO) -a, --[no-]auto-discovery automatically scrape all database for given server ($PG_EXPORTER_AUTO_DISCOVERY) -x, --exclude-database="template0,template1,postgres" excluded databases when enabling auto-discovery ($PG_EXPORTER_EXCLUDE_DATABASE) -i, --include-database="" included databases when enabling auto-discovery ($PG_EXPORTER_INCLUDE_DATABASE) -n, --namespace="" prefix of built-in metrics, (pg|pgbouncer) by default ($PG_EXPORTER_NAMESPACE) -f, --[no-]fail-fast fail fast instead of waiting during start-up ($PG_EXPORTER_FAIL_FAST) -T, --connect-timeout=100 connect timeout in ms, 100 by default ($PG_EXPORTER_CONNECT_TIMEOUT) -P, --web.telemetry-path="/metrics" URL path under which to expose metrics. ($PG_EXPORTER_TELEMETRY_PATH) -D, --[no-]dry-run dry run and print raw configs -E, --[no-]explain explain server planned queries --log.level="info" log level: debug|info|warn|error] --log.format="logfmt" log format: logfmt|json --[no-]version Show application version. ``` Parameters could be given via command-line args or environment variables. 
| CLI Arg | Environment Variable | Default Value | |------------------------|--------------------------------|----------------------------------| | `--url` | `PG_EXPORTER_URL` | `postgresql:///?sslmode=disable` | | `--config` | `PG_EXPORTER_CONFIG` | `pg_exporter.yml` | | `--label` | `PG_EXPORTER_LABEL` | | | `--tag` | `PG_EXPORTER_TAG` | | | `--auto-discovery` | `PG_EXPORTER_AUTO_DISCOVERY` | `true` | | `--disable-cache` | `PG_EXPORTER_DISABLE_CACHE` | `false` | | `--fail-fast` | `PG_EXPORTER_FAIL_FAST` | `false` | | `--exclude-database` | `PG_EXPORTER_EXCLUDE_DATABASE` | | | `--include-database` | `PG_EXPORTER_INCLUDE_DATABASE` | | | `--namespace` | `PG_EXPORTER_NAMESPACE` | `pg\|pgbouncer` | | `--connect-timeout` | `PG_EXPORTER_CONNECT_TIMEOUT` | `100` | | `--dry-run` | | `false` | | `--explain` | | `false` | | `--log.level` | | `info` | | `--log.format` | | `logfmt` | | `--web.listen-address` | | `:9630` | | `--web.config.file` | | `""` | | `--web.telemetry-path` | `PG_EXPORTER_TELEMETRY_PATH` | `/metrics` | ### Connection URL Defaults - If `--url` / `PG_EXPORTER_URL` is not provided, pg_exporter falls back to a local-first default URL: `postgresql:///?sslmode=disable`. - If `sslmode` is not explicitly set in the URL, pg_exporter injects `sslmode=disable` by default. - This is an intentional design choice for common on-host deployments (`pg_exporter` and PostgreSQL/PgBouncer on the same machine), where loopback TLS adds overhead with little practical gain. - If you need TLS for remote targets, provide `sslmode` explicitly in the connection URL (for example: `sslmode=require`, `verify-ca`, `verify-full`). 
------ ## API PG Exporter provides a rich set of HTTP endpoints: Here are `pg_exporter` REST APIs ```bash # Fetch metrics (customizable) curl localhost:9630/metrics # Reload configuration curl -X POST localhost:9630/reload # Explain configuration curl localhost:9630/explain # Print Statistics curl localhost:9630/stat # Aliveness health check (200 up, 503 down) curl localhost:9630/up curl localhost:9630/health curl localhost:9630/liveness curl localhost:9630/readiness # traffic route health check ### 200 if not in recovery, 404 if in recovery, 503 if server is down curl localhost:9630/primary curl localhost:9630/leader curl localhost:9630/master curl localhost:9630/read-write curl localhost:9630/rw ### 200 if in recovery, 404 if not in recovery, 503 if server is down curl localhost:9630/replica curl localhost:9630/standby curl localhost:9630/read-only curl localhost:9630/ro ### 200 if server is ready for read traffic (including primary), 503 if server is down curl localhost:9630/read ``` -------- ## Build To build a static stand-alone binary for docker scratch ```bash CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-extldflags "-static"' -o pg_exporter ``` Or [download](https://github.com/pgsty/pg_exporter/releases) the latest prebuilt binaries from release pages. We also have pre-packaged RPM / DEB packages in the [Pigsty Infra Repo](https://pigsty.io/docs/repo/infra/) -------- ## Docker You can find pre-built amd64/arm64 docker images here: [pgsty/pg_exporter](https://hub.docker.com/r/pgsty/pg_exporter) -------- ## Deployment Redhat rpm and Debian/Ubuntu deb packages are made with `nfpm` for `x86/arm64`: * `/usr/bin/pg_exporter`: the pg_exporter binary. 
* [`/etc/pg_exporter.yml`](pg_exporter.yml): the config file * [`/usr/lib/systemd/system/pg_exporter.service`](package/pg_exporter.service): systemd service file * [`/etc/default/pg_exporter`](package/pg_exporter.default): systemd service envs & options Which is also available on Pigsty's [Infra Repo](https://pigsty.io/docs/repo/infra). ------ ## Collectors Configs lie in the core of `pg_exporter`. Actually, this project contains more lines of YAML than go. * A monolith battery-included config file: [`pg_exporter.yml`](pg_exporter.yml) * Separated metrics definition in [`config/collector`](config/) * Example of how to write a config file: [`doc.yml`](config/0000-doc.yml) * Legacy config bundle for PostgreSQL 9.1 - 9.6: [`legacy/`](legacy/) ([`legacy/README.md`](legacy/README.md)) Current `pg_exporter` is shipped with the following metrics collector definition files - [0000-doc.yml](config/0000-doc.yml) - [0110-pg.yml](config/0110-pg.yml) - [0120-pg_meta.yml](config/0120-pg_meta.yml) - [0130-pg_setting.yml](config/0130-pg_setting.yml) - [0210-pg_repl.yml](config/0210-pg_repl.yml) - [0220-pg_sync_standby.yml](config/0220-pg_sync_standby.yml) - [0230-pg_downstream.yml](config/0230-pg_downstream.yml) - [0240-pg_slot.yml](config/0240-pg_slot.yml) - [0250-pg_recv.yml](config/0250-pg_recv.yml) - [0260-pg_sub.yml](config/0260-pg_sub.yml) - [0270-pg_origin.yml](config/0270-pg_origin.yml) - [0300-pg_io.yml](config/0300-pg_io.yml) - [0310-pg_size.yml](config/0310-pg_size.yml) - [0320-pg_archiver.yml](config/0320-pg_archiver.yml) - [0330-pg_bgwriter.yml](config/0330-pg_bgwriter.yml) - [0331-pg_checkpointer.yml](config/0331-pg_checkpointer.yml) - [0340-pg_ssl.yml](config/0340-pg_ssl.yml) - [0350-pg_checkpoint.yml](config/0350-pg_checkpoint.yml) - [0355-pg_timeline.yml](config/0355-pg_timeline.yml) - [0360-pg_recovery.yml](config/0360-pg_recovery.yml) - [0370-pg_slru.yml](config/0370-pg_slru.yml) - [0380-pg_shmem.yml](config/0380-pg_shmem.yml) - 
[0390-pg_wal.yml](config/0390-pg_wal.yml) - [0410-pg_activity.yml](config/0410-pg_activity.yml) - [0420-pg_wait.yml](config/0420-pg_wait.yml) - [0430-pg_backend.yml](config/0430-pg_backend.yml) - [0440-pg_xact.yml](config/0440-pg_xact.yml) - [0450-pg_lock.yml](config/0450-pg_lock.yml) - [0460-pg_query.yml](config/0460-pg_query.yml) - [0510-pg_vacuuming.yml](config/0510-pg_vacuuming.yml) - [0520-pg_indexing.yml](config/0520-pg_indexing.yml) - [0530-pg_clustering.yml](config/0530-pg_clustering.yml) - [0540-pg_backup.yml](config/0540-pg_backup.yml) - [0610-pg_db.yml](config/0610-pg_db.yml) - [0620-pg_db_confl.yml](config/0620-pg_db_confl.yml) - [0640-pg_pubrel.yml](config/0640-pg_pubrel.yml) - [0650-pg_subrel.yml](config/0650-pg_subrel.yml) - [0700-pg_table.yml](config/0700-pg_table.yml) - [0710-pg_index.yml](config/0710-pg_index.yml) - [0720-pg_func.yml](config/0720-pg_func.yml) - [0730-pg_seq.yml](config/0730-pg_seq.yml) - [0740-pg_relkind.yml](config/0740-pg_relkind.yml) - [0750-pg_defpart.yml](config/0750-pg_defpart.yml) - [0810-pg_table_size.yml](config/0810-pg_table_size.yml) - [0820-pg_table_bloat.yml](config/0820-pg_table_bloat.yml) - [0830-pg_index_bloat.yml](config/0830-pg_index_bloat.yml) - [0910-pgbouncer_list.yml](config/0910-pgbouncer_list.yml) - [0920-pgbouncer_database.yml](config/0920-pgbouncer_database.yml) - [0930-pgbouncer_stat.yml](config/0930-pgbouncer_stat.yml) - [0940-pgbouncer_pool.yml](config/0940-pgbouncer_pool.yml) - [1000-pg_wait_event.yml](config/1000-pg_wait_event.yml) - [1800-pg_tsdb_hypertable.yml](config/1800-pg_tsdb_hypertable.yml) - [1900-pg_citus.yml](config/1900-pg_citus.yml) - [2000-pg_heartbeat.yml](config/2000-pg_heartbeat.yml) > #### Note > > Supported version: PostgreSQL 10, 11, 12, 13, 14, 15, 16, 17, 18+ > > But you can still get PostgreSQL 9.1 - 9.6 support by switching to the [`legacy/pg_exporter.yml`](legacy/pg_exporter.yml) config `pg_exporter` will generate approximately 600 metrics for a completely new database 
cluster. For a real-world database with 10 ~ 100 tables, it may generate several 1k ~ 10k metrics. You may need to modify or disable some database-level metrics on a database with several thousand or more tables to complete the scrape in time. Config files are using YAML format, there are lots of examples in the [conf](https://github.com/pgsty/pg_exporter/tree/main/config/collector) dir. and here is a [sample](config/0000-doc.yml) config. ``` #==============================================================# # 1. Config File #==============================================================# # The configuration file for pg_exporter is a YAML file. # Default configurations are retrieved via following precedence: # 1. command line args: --config= # 2. environment variables: PG_EXPORTER_CONFIG= # 3. pg_exporter.yml (Current directory) # 4. /etc/pg_exporter.yml (config file) # 5. /etc/pg_exporter (config dir) #==============================================================# # 2. Config Format #==============================================================# # pg_exporter config could be a single YAML file, or a directory containing a series of separated YAML files. # Each YAML config file consists of one or more metrics Collector definition, which are top-level objects. # If a directory is provided, all YAML files in that directory (non-recursive; subdirectories are ignored) will be merged in alphabetic order. # Collector definition examples are shown below. #==============================================================# # 3. Collector Example #==============================================================# # # Here is an example of a metrics collector definition # pg_primary_only: # Collector branch name. Must be UNIQUE among the entire configuration # name: pg # Collector namespace, used as METRIC PREFIX, set to branch name by default, can be override # # the same namespace may contain multiple collector branches. 
It`s the user`s responsibility # # to make sure that AT MOST ONE collector is picked for each namespace. # # desc: PostgreSQL basic information (on primary) # Collector description # query: | # Metrics Query SQL # # SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp, # pg_current_wal_lsn() - '0/0' AS lsn, # pg_current_wal_insert_lsn() - '0/0' AS insert_lsn, # pg_current_wal_lsn() - '0/0' AS write_lsn, # pg_current_wal_flush_lsn() - '0/0' AS flush_lsn, # extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime, # extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time, # pg_is_in_backup() AS is_in_backup, # extract(EPOCH FROM now() - pg_backup_start_time()) AS backup_time; # # # [OPTIONAL] metadata fields, control collector behavior # ttl: 10 # Cache TTL: in seconds, how long will pg_exporter cache this collector`s query result. # timeout: 0.1 # Query Timeout: in seconds, queries that exceed this limit will be canceled. # min_version: 100000 # minimal supported version, boundary IS included. 
In server version number format, # max_version: 130000 # maximal supported version, boundary NOT included, In server version number format # fatal: false # Collector marked `fatal` fails, the entire scrape will abort immediately and marked as failed # skip: false # Collector marked `skip` will not be installed during the planning procedure # # tags: [cluster, primary] # Collector tags, used for planning and scheduling # # # tags are list of strings, which could be: # # * `cluster` marks this query as cluster level, so it will only execute once for the same PostgreSQL Server # # * `primary` or `master` mark this query can only run on a primary instance (WILL NOT execute if pg_is_in_recovery()) # # * `standby` or `replica` mark this query can only run on a replica instance (WILL execute if pg_is_in_recovery()) # # some special tag prefix have special interpretation: # # * `dbname:` means this query will ONLY be executed on database with name `` # # * `username:` means this query will only be executed when connect with user `` # # * `extension:` means this query will only be executed when extension `` is installed # # * `schema:` means this query will only by executed when schema `` exist # # * `not:` means this query WILL NOT be executed when exporter is tagged with `` # # * `` means this query WILL be executed when exporter is tagged with `` # # ( could not be cluster,primary,standby,master,replica,etc...) # # # One or more "predicate queries" may be defined for a metric query. These # # are run before the main metric query (after any cache hit check). If all # # of them, when run sequentially, return a single row with a single column # # boolean true result, the main metric query is executed. If any of them # # return false or return zero rows, the main query is skipped. If any # # predicate query returns more than one row, a non-boolean result, or fails # # with an error, the whole query is marked failed. 
Predicate queries can be # # used to check for the presence of specific functions, tables, extensions, # # settings, and vendor-specific pg features before running the main query. # # predicate_queries: # - name: predicate query name # predicate_query: | # SELECT EXISTS (SELECT 1 FROM information_schema.routines WHERE routine_schema = 'pg_catalog' AND routine_name = 'pg_backup_start_time'); # # metrics: # List of returned columns, each column must have a `name` and `usage`, `rename` and `description` are optional # - timestamp: # Column name, should be exactly the same as returned column name # usage: GAUGE # Metric type, `usage` could be # * DISCARD: completely ignoring this field # * LABEL: use columnName=columnValue as a label in metric # * GAUGE: Mark column as a gauge metric, full name will be `_` # * COUNTER: Same as above, except it is a counter rather than a gauge. # rename: ts # [OPTIONAL] Alias, optional, the alias will be used instead of the column name # description: xxxx # [OPTIONAL] Description of the column, will be used as a metric description # default: 0 # [OPTIONAL] Default value, will be used when column is NULL # scale: 1000 # [OPTIONAL] Scale the value by this factor # - lsn: # usage: COUNTER # description: log sequence number, current write location (on primary) # - insert_lsn: # usage: COUNTER # description: primary only, location of current wal inserting # - write_lsn: # usage: COUNTER # description: primary only, location of current wal writing # - flush_lsn: # usage: COUNTER # description: primary only, location of current wal syncing # - uptime: # usage: GAUGE # description: seconds since postmaster start # - conf_reload_time: # usage: GAUGE # description: seconds since last configuration reload # - is_in_backup: # usage: GAUGE # description: 1 if backup is in progress # - backup_time: # usage: GAUGE # description: seconds since the current backup start. null if don`t have one # # .... 
# you can also use rename & scale to customize the metric name and value: # - checkpoint_write_time: # rename: write_time # usage: COUNTER # scale: 1e-3 # description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds #==============================================================# # 4. Collector Presets #==============================================================# # pg_exporter is shipped with a series of preset collectors (already numbered and ordered by filename) # # 1xx Basic metrics: basic info, metadata, settings # 2xx Replication metrics: replication, walreceiver, downstream, sync standby, slots, subscription # 3xx Persist metrics: size, wal, background writer, checkpointer, ssl, checkpoint, recovery, slru cache, shmem usage # 4xx Activity metrics: backend count group by state, wait event, locks, xacts, queries # 5xx Progress metrics: clustering, vacuuming, indexing, basebackup, copy # 6xx Database metrics: pg_database, publication, subscription # 7xx Object metrics: pg_class, table, index, function, sequence, default partition # 8xx Optional metrics: optional metrics collector (disable by default, slow queries) # 9xx Pgbouncer metrics: metrics from pgbouncer admin database `pgbouncer` # # 100-599 Metrics for entire database cluster (scrape once) # 600-899 Metrics for single database instance (scrape for each database ,except for pg_db itself) #==============================================================# # 5. Cache TTL #==============================================================# # Cache can be used for reducing query overhead, it can be enabled by setting a non-zero value for `ttl` # It is highly recommended to use cache to avoid duplicate scrapes. Especially when you got multiple Prometheus # scraping the same instance with slow monitoring queries. 
Setting `ttl` to zero or leaving blank will disable # result caching, which is the default behavior # # TTL has to be smaller than your scrape interval. 15s scrape interval and 10s TTL is a good start for # production environment. Some expensive monitoring queries (such as size/bloat check) will have longer `ttl` # which can also be used as a mechanism to achieve `different scrape frequency` #==============================================================# # 6. Query Timeout #==============================================================# # Collectors can be configured with an optional Timeout. If the collector's query executes more than that # timeout, it will be canceled immediately. Setting the `timeout` to 0 or leaving blank will reset it to # default timeout 0.1 (100ms). Setting it to any negative number will disable the query timeout feature. # All queries have a default timeout of 100ms, if exceeded, the query will be canceled immediately to avoid # avalanche. You can explicitly overwrite that option. But beware: in some extreme cases, if all your # timeouts sum up greater than your scrape/cache interval (usually 15s), the queries may still be jammed. # or, you can just disable potential slow queries. #==============================================================# # 7. Version Compatibility #==============================================================# # Each collector has two optional version compatibility parameters: `min_version` and `max_version`. # These two parameters specify the version compatibility of the collector. If target postgres/pgbouncer's # version is less than `min_version`, or higher than `max_version`, the collector will not be installed. # These two parameters are using PostgreSQL server version number format, which is a 6-digit integer # format as :. 
# For example, 090600 stands for 9.6, and 120100 stands for 12.1 # And beware that version compatibility range is left-inclusive right exclusive: [min, max), set to zero or # leaving blank will act as -inf or +inf #==============================================================# # 8. Fatality #==============================================================# # If a collector marked with `fatal` fails, the entire scrape operation will be marked as failed and key metrics # `pg_up` / `pgbouncer_up` will be reset to 0. It is always a good practice to set up AT LEAST ONE fatal # collector for pg_exporter. `pg.pg_primary_only` and `pgbouncer_list` are the default fatal collectors. # # If a collector without `fatal` flag fails, it will increase global fail counters. But the scrape operation # will carry on. The entire scrape result will not be marked as failed, thus will not affect the `_up` metric. #==============================================================# # 9. Skip #==============================================================# # Collector with `skip` flag set to true will NOT be installed. # This could be a handy option to disable collectors #==============================================================# # 10. Tags and Planning #==============================================================# # Tags are designed for collector planning & schedule. It can be handy to customize which queries run # on which instances. 
And thus you can use one-single monolith config for multiple environments # # Tags are a list of strings, each string could be: # Pre-defined special tags # * `cluster` marks this collector as cluster level, so it will ONLY BE EXECUTED ONCE for the same PostgreSQL Server # * `primary` or `master` mark this collector as primary-only, so it WILL NOT work iff pg_is_in_recovery() # * `standby` or `replica` mark this collector as replica-only, so it WILL work iff pg_is_in_recovery() # Special tag prefix which have different interpretation: # * `dbname:` means this collector will ONLY work on database with name `` # * `username:` means this collector will ONLY work when connect with user `` # * `extension:` means this collector will ONLY work when extension `` is installed # * `schema:` means this collector will only work when schema `` exists # Customized positive tags (filter) and negative tags (taint) # * `not:` means this collector WILL NOT work when exporter is tagged with `` # * `` means this query WILL work if exporter is tagged with `` (special tags not included) # # pg_exporter will trigger the Planning procedure after connecting to the target. It will gather database facts # and match them with tags and other metadata (such as supported version range). Collector will only # be installed if and only if it is compatible with the target server. ``` -------------------- ## About Author: [Vonng](https://vonng.com/en) ([rh@vonng.com](mailto:rh@vonng.com)) Contributors: https://github.com/pgsty/pg_exporter/graphs/contributors License: [Apache-2.0](LICENSE) Copyright: 2018-2026 rh@vonng.com

PG Exporter Logo

================================================ FILE: config/0000-doc.yml ================================================ #==============================================================# # Desc : pg_exporter metrics collector definition # Ver : PostgreSQL 10 ~ 18+ and pgbouncer 1.9~1.25+ # Ctime : 2019-12-09 # Mtime : 2026-03-21 # Homepage : https://pigsty.io # Author : Ruohang Feng (rh@vonng.com) # License : Apache-2.0 @ https://github.com/pgsty/pg_exporter # Copyright : 2018-2026 Ruohang Feng / Vonng (rh@vonng.com) #==============================================================# #==============================================================# # 1. Config File #==============================================================# # The configuration file for pg_exporter is a YAML file. # Default configurations are retrieved via following precedence: # 1. command line args: --config= # 2. environment variables: PG_EXPORTER_CONFIG= # 3. pg_exporter.yml (Current directory) # 4. /etc/pg_exporter.yml (config file) # 5. /etc/pg_exporter (config dir) #==============================================================# # 2. Config Format #==============================================================# # pg_exporter config could be a single YAML file, or a directory containing a series of separated YAML files. # Each YAML config file consists of one or more metrics Collector definition, which are top-level objects. # If a directory is provided, all YAML in that directory will be merged in alphabetic order. # Collector definition examples are shown below. #==============================================================# # 3. Collector Example #==============================================================# # # Here is an example of a metrics collector definition # pg_primary_only: # Collector branch name. 
Must be UNIQUE among the entire configuration # name: pg # Collector namespace, used as METRIC PREFIX, set to branch name by default, can be override # # the same namespace may contain multiple collector branches. It`s the user`s responsibility # # to make sure that AT MOST ONE collector is picked for each namespace. # # desc: PostgreSQL basic information (on primary) # Collector description # query: | # Metrics Query SQL # # SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp, # pg_current_wal_lsn() - '0/0' AS lsn, # pg_current_wal_insert_lsn() - '0/0' AS insert_lsn, # pg_current_wal_lsn() - '0/0' AS write_lsn, # pg_current_wal_flush_lsn() - '0/0' AS flush_lsn, # extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime, # extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time, # pg_is_in_backup() AS is_in_backup, # extract(EPOCH FROM now() - pg_backup_start_time()) AS backup_time; # # # [OPTIONAL] metadata fields, control collector behavior # ttl: 10 # Cache TTL: in seconds, how long will pg_exporter cache this collector`s query result. # timeout: 0.1 # Query Timeout: in seconds, queries that exceed this limit will be canceled. # min_version: 100000 # minimal supported version, boundary IS included. 
In server version number format, # max_version: 130000 # maximal supported version, boundary NOT included, In server version number format # fatal: false # Collector marked `fatal` fails, the entire scrape will abort immediately and marked as failed # skip: false # Collector marked `skip` will not be installed during the planning procedure # # tags: [cluster, primary] # Collector tags, used for planning and scheduling # # # tags are list of strings, which could be: # # * `cluster` marks this query as cluster level, so it will only execute once for the same PostgreSQL Server # # * `primary` or `master` mark this query can only run on a primary instance (WILL NOT execute if pg_is_in_recovery()) # # * `standby` or `replica` mark this query can only run on a replica instance (WILL execute if pg_is_in_recovery()) # # some special tag prefix have special interpretation: # # * `dbname:` means this query will ONLY be executed on database with name `` # # * `username:` means this query will only be executed when connect with user `` # # * `extension:` means this query will only be executed when extension `` is installed # # * `schema:` means this query will only by executed when schema `` exist # # * `not:` means this query WILL NOT be executed when exporter is tagged with `` # # * `` means this query WILL be executed when exporter is tagged with `` # # ( could not be cluster,primary,standby,master,replica,etc...) # # # One or more "predicate queries" may be defined for a metric query. These # # are run before the main metric query (after any cache hit check). If all # # of them, when run sequentially, return a single row with a single column # # boolean true result, the main metric query is executed. If any of them # # return false or return zero rows, the main query is skipped. If any # # predicate query returns more than one row, a non-boolean result, or fails # # with an error, the whole query is marked failed. 
Predicate queries can be # # used to check for the presence of specific functions, tables, extensions, # # settings, and vendor-specific pg features before running the main query. # # predicate_queries: # - name: predicate query name # predicate_query: | # SELECT EXISTS (SELECT 1 FROM information_schema.routines WHERE routine_schema = 'pg_catalog' AND routine_name = 'pg_backup_start_time'); # # metrics: # List of returned columns, each column must have a `name` and `usage`, `rename` and `description` are optional # - timestamp: # Column name, should be exactly the same as returned column name # usage: GAUGE # Metric type, `usage` could be # * DISCARD: completely ignoring this field # * LABEL: use columnName=columnValue as a label in metric # * GAUGE: Mark column as a gauge metric, full name will be `_` # * COUNTER: Same as above, except it is a counter rather than a gauge. # rename: ts # [OPTIONAL] Alias, optional, the alias will be used instead of the column name # description: xxxx # [OPTIONAL] Description of the column, will be used as a metric description # default: 0 # [OPTIONAL] Default value, will be used when column is NULL # scale: 1000 # [OPTIONAL] Scale the value by this factor # - lsn: # usage: COUNTER # description: log sequence number, current write location (on primary) # - insert_lsn: # usage: COUNTER # description: primary only, location of current wal inserting # - write_lsn: # usage: COUNTER # description: primary only, location of current wal writing # - flush_lsn: # usage: COUNTER # description: primary only, location of current wal syncing # - uptime: # usage: GAUGE # description: seconds since postmaster start # - conf_reload_time: # usage: GAUGE # description: seconds since last configuration reload # - is_in_backup: # usage: GAUGE # description: 1 if backup is in progress # - backup_time: # usage: GAUGE # description: seconds since the current backup start. null if don`t have one # # .... 
# you can also use rename & scale to customize the metric name and value: # - checkpoint_write_time: # rename: write_time # usage: COUNTER # scale: 1e-3 # description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds #==============================================================# # 4. Collector Presets #==============================================================# # pg_exporter is shipped with a series of preset collectors (already numbered and ordered by filename) # # 1xx Basic metrics: basic info, metadata, settings # 2xx Replication metrics: replication, walreceiver, downstream, sync standby, slots, subscription # 3xx Persist metrics: size, wal, background writer, checkpointer, ssl, checkpoint, recovery, slru cache, shmem usage # 4xx Activity metrics: backend count group by state, wait event, locks, xacts, queries # 5xx Progress metrics: clustering, vacuuming, indexing, basebackup, copy # 6xx Database metrics: pg_database, publication, subscription # 7xx Object metrics: pg_class, table, index, function, sequence, default partition # 8xx Optional metrics: optional metrics collector (disable by default, slow queries) # 9xx Pgbouncer metrics: metrics from pgbouncer admin database `pgbouncer` # # 100-599 Metrics for entire database cluster (scrape once) # 600-899 Metrics for single database instance (scrape for each database ,except for pg_db itself) #==============================================================# # 5. Cache TTL #==============================================================# # Cache can be used for reducing query overhead, it can be enabled by setting a non-zero value for `ttl` # It is highly recommended to use cache to avoid duplicate scrapes. Especially when you got multiple Prometheus # scraping the same instance with slow monitoring queries. 
# Setting `ttl` to zero or leaving blank will disable
# result caching, which is the default behavior
#
# TTL has to be smaller than your scrape interval. 15s scrape interval and 10s TTL is a good start for
# production environments. Some expensive monitoring queries (such as size/bloat check) will have a longer `ttl`
# which can also be used as a mechanism to achieve `different scrape frequency`
#==============================================================#
# 6. Query Timeout
#==============================================================#
# Collectors can be configured with an optional Timeout. If the collector's query executes for more than that
# timeout, it will be canceled immediately. Setting the `timeout` to 0 or leaving blank will reset it to the
# default timeout 0.1 (100ms). Setting it to any negative number will disable the query timeout feature.
# All queries have a default timeout of 100ms, if exceeded, the query will be canceled immediately to avoid
# avalanche. You can explicitly overwrite that option. But beware: in some extreme cases, if all your
# timeouts sum up to more than your scrape/cache interval (usually 15s), the queries may still be jammed.
# Or, you can just disable potentially slow queries.
#==============================================================#
# 7. Version Compatibility
#==============================================================#
# Each collector has two optional version compatibility parameters: `min_version` and `max_version`.
# These two parameters specify the version compatibility of the collector. If the target postgres/pgbouncer
# version is less than `min_version`, or no less than `max_version`, the collector will not be installed.
# These two parameters use the PostgreSQL server version number format, which is a 6-digit integer
# formatted as `<major><minor><patch>`, two digits each.
# For example, 090600 stands for 9.6, and 120100 stands for 12.1
# And beware that the version compatibility range is left-inclusive right-exclusive: [min, max), set to zero or
# leaving blank will act as -inf or +inf
#==============================================================#
# 8. Fatality
#==============================================================#
# If a collector marked with `fatal` fails, the entire scrape operation will be marked as failed and key metrics
# `pg_up` / `pgbouncer_up` will be reset to 0. It is always a good practice to set up AT LEAST ONE fatal
# collector for pg_exporter. `pg.pg_primary_only` and `pgbouncer_list` are the default fatal collectors.
#
# If a collector without the `fatal` flag fails, it will increase global fail counters. But the scrape operation
# will carry on. The entire scrape result will not be marked as failed, thus will not affect the `_up` metric.
#==============================================================#
# 9. Skip
#==============================================================#
# A collector with the `skip` flag set to true will NOT be installed.
# This could be a handy option to disable collectors
#==============================================================#
# 10. Tags and Planning
#==============================================================#
# Tags are designed for collector planning & scheduling. It can be handy to customize which queries run
# on which instances.
# And thus you can use one single monolith config for multiple environments
#
# Tags are a list of strings, each string could be:
# Pre-defined special tags
# * `cluster` marks this collector as cluster level, so it will ONLY BE EXECUTED ONCE for the same PostgreSQL Server
# * `primary` or `master` mark this collector as primary-only, so it WILL NOT work iff pg_is_in_recovery()
# * `standby` or `replica` mark this collector as replica-only, so it WILL work iff pg_is_in_recovery()
# Special tag prefixes which have different interpretation:
# * `dbname:<dbname>` means this collector will ONLY work on the database with name `<dbname>`
# * `username:<username>` means this collector will ONLY work when connecting with user `<username>`
# * `extension:<extname>` means this collector will ONLY work when extension `<extname>` is installed
# * `schema:<nspname>` means this collector will only work when schema `<nspname>` exists
# Customized positive tags (filter) and negative tags (taint)
# * `not:<negtag>` means this collector WILL NOT work when the exporter is tagged with `<negtag>`
# * `<tag>` means this query WILL work if the exporter is tagged with `<tag>` (special tags not included)
#
# pg_exporter will trigger the Planning procedure after connecting to the target. It will gather database facts
# and match them with tags and other metadata (such as the supported version range). A collector will be
# installed if and only if it is compatible with the target server.
================================================
FILE: config/0110-pg.yml
================================================
#==============================================================#
# 0110 pg
#==============================================================#
# Primary-side basics: current/insert/flush WAL LSN positions, uptime,
# and config reload time. Marked `fatal`: per the doc header, a failing
# fatal collector marks the whole scrape failed and resets pg_up to 0.
# Replica-only columns are emitted as NULL so both variants expose the
# same metric set under the shared branding name `pg`.
pg_primary_only:
  name: pg
  desc: PostgreSQL basic information (on primary)
  query: |-
    SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
           extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
           extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time,
           pg_current_wal_lsn() - '0/0' AS lsn,
           pg_current_wal_insert_lsn() - '0/0' AS insert_lsn,
           pg_current_wal_lsn() - '0/0' AS write_lsn,
           pg_current_wal_flush_lsn() - '0/0' AS flush_lsn,
           NULL::BIGINT AS receive_lsn,
           NULL::BIGINT AS replay_lsn,
           extract(EPOCH FROM pg_conf_load_time()) AS reload_time,
           extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
           NULL::FLOAT AS last_replay_time,
           0::FLOAT AS lag,
           pg_is_in_recovery() AS is_in_recovery,
           FALSE AS is_wal_replay_paused;
  tags: [ cluster, primary ]
  ttl: 1
  min_version: 100000
  fatal: true
  skip: false
  metrics:
    - timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" }
    - uptime: { usage: GAUGE ,description: "seconds since postmaster start" }
    - boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" }
    - lsn: { usage: COUNTER ,description: "log sequence number, current write location" }
    - insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" }
    - write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" }
    - flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" }
    - receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" }
    - replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" }
    - reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" }
    - conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" }
    - last_replay_time: { usage: GAUGE ,description: "time when last transaction been replayed" }
    - lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" }
    - is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" }
    - is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" }

# Replica-side counterpart: receive/replay LSNs and replay lag; the
# primary-only columns (insert/write/flush) are emitted as NULL.
# Lag is 0 when received == replayed (fully caught up), otherwise the
# time since the last replayed transaction. Also `fatal`.
pg_replica_only:
  name: pg
  desc: PostgreSQL basic information (on replica)
  query: |-
    SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
           extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
           extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time,
           pg_last_wal_replay_lsn() - '0/0' AS lsn,
           NULL::BIGINT AS insert_lsn,
           NULL::BIGINT AS write_lsn,
           NULL::BIGINT AS flush_lsn,
           pg_last_wal_receive_lsn() - '0/0' AS receive_lsn,
           pg_last_wal_replay_lsn() - '0/0' AS replay_lsn,
           extract(EPOCH FROM pg_conf_load_time()) AS reload_time,
           extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
           extract(EPOCH FROM pg_last_xact_replay_timestamp()) AS last_replay_time,
           CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0
                ELSE EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS lag,
           pg_is_in_recovery() AS is_in_recovery,
           pg_is_wal_replay_paused() AS is_wal_replay_paused;
  tags: [ cluster, replica ]
  ttl: 1
  min_version: 100000
  fatal: true
  skip: false
  metrics:
    - timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" }
    - uptime: { usage: GAUGE ,description: "seconds since postmaster start" }
    - boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" }
    - lsn: { usage: COUNTER ,description: "log sequence number, current write location" }
    - insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" }
    - write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" }
    - flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" }
    - receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" }
    - replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" }
    - reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" }
    - conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" }
    - last_replay_time: { usage: GAUGE ,description: "time when last transaction been replayed" }
    - lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" }
    - is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" }
    - is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" }

================================================
FILE: config/0120-pg_meta.yml
================================================
#==============================================================#
# 0120 pg_meta
#==============================================================#
# Cluster identity & version labels, emitted as a constant-1 gauge.
# The 13+ variant additionally exposes primary_conninfo (readable by
# non-superusers since PG13); the 10-12 variant emits 'N/A' instead.
pg_meta_13:
  name: pg_meta
  desc: PostgreSQL meta info for pg 13+, with extra primary conninfo
  query: |
    SELECT (SELECT system_identifier FROM pg_control_system()) AS cluster_id,
           current_setting('cluster_name') AS cluster_name,
           current_setting('port') AS listen_port,
           current_setting('data_directory', true) AS data_dir,
           current_setting('config_file', true) AS conf_path,
           current_setting('hba_file', true) AS hba_path,
           current_setting('wal_level') AS wal_level,
           current_setting('server_encoding') AS encoding,
           current_setting('server_version') AS version,
           current_setting('server_version_num') AS ver_num,
           version() AS ver_str,
           current_setting('shared_preload_libraries', true) AS extensions,
           current_setting('primary_conninfo', true) AS primary_conninfo,
           1 AS info
  ttl: 10
  min_version: 130000
  tags: [ cluster ]
  metrics:
    - cluster_id: { usage: LABEL ,description: "cluster system identifier" }
    - cluster_name: { usage: LABEL ,description: "cluster name" }
    - listen_port: { usage: LABEL ,description: "listen port" }
    - data_dir: { usage: LABEL ,description: "path to data directory" }
    - conf_path: { usage: LABEL ,description: "path to postgresql.conf" }
    - hba_path: { usage: LABEL ,description: "path to pg_hba.conf" }
    - wal_level: { usage: LABEL ,description: "wal level" }
    - encoding: { usage: LABEL ,description: "server encoding" }
    - version: { usage: LABEL ,description: "server version in human-readable format" }
    - ver_num: { usage: LABEL ,description: "server version number in machine-readable format" }
    - ver_str: { usage: LABEL ,description: "complete version string" }
    - extensions: { usage: LABEL ,description: "server installed preload libraries" }
    - primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" }
    - info: { usage: GAUGE ,description: "constant 1" }

# Variant for PG 10-12 ([090600, 130000)): primary_conninfo is not
# readable via current_setting here, so a literal 'N/A' keeps the
# label set identical across versions.
pg_meta_10:
  name: pg_meta
  desc: PostgreSQL meta info
  query: |
    SELECT (SELECT system_identifier FROM pg_control_system()) AS cluster_id,
           current_setting('cluster_name') AS cluster_name,
           current_setting('port') AS listen_port,
           current_setting('data_directory', true) AS data_dir,
           current_setting('config_file', true) AS conf_path,
           current_setting('hba_file', true) AS hba_path,
           current_setting('wal_level') AS wal_level,
           current_setting('server_encoding') AS encoding,
           current_setting('server_version') AS version,
           current_setting('server_version_num') AS ver_num,
           version() AS ver_str,
           current_setting('shared_preload_libraries', true) AS extensions,
           'N/A' AS primary_conninfo,
           1 AS info
  ttl: 10
  min_version: 090600
  max_version: 130000
  tags: [ cluster ]
  metrics:
    - cluster_id: { usage: LABEL ,description: "cluster system identifier" }
    - cluster_name: { usage: LABEL ,description: "cluster name" }
    - listen_port: { usage: LABEL ,description: "listen port" }
    - data_dir: { usage: LABEL ,description: "path to data directory" }
    - conf_path: { usage: LABEL ,description: "path to postgresql.conf" }
    - hba_path: { usage: LABEL ,description: "path to pg_hba.conf" }
    - wal_level: { usage: LABEL ,description: "wal level" }
    - encoding: { usage: LABEL ,description: "server encoding" }
    - version: { usage: LABEL ,description: "server version in human-readable format" }
    - ver_num: { usage: LABEL ,description: "server version number in machine-readable format" }
    - ver_str: { usage: LABEL ,description: "complete version string" }
    - extensions: { usage: LABEL ,description: "server installed preload libraries" }
    - primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" }
    - info: { usage: GAUGE ,description: "constant 1" }

================================================
FILE: config/0130-pg_setting.yml
================================================
#==============================================================#
# 0130 pg_setting
#==============================================================#
# Key PostgreSQL configuration parameters
# All parameters use current_setting(name, missing_ok) for version safety
# Parameters introduced after PG10 use missing_ok=true to return NULL on older versions
# Enum-like settings are encoded to integers via CASE (see each metric's
# description for the mapping); sizes are normalized to bytes with
# pg_size_bytes(); on/off booleans become 1/0.
pg_setting:
  name: pg_setting
  desc: PostgreSQL shared configuration parameters (shared across all databases)
  query: |
    SELECT current_setting('max_connections')::int AS max_connections,
           current_setting('max_prepared_transactions')::int AS max_prepared_transactions,
           current_setting('max_locks_per_transaction')::int AS max_locks_per_transaction,
           current_setting('max_worker_processes')::int AS max_worker_processes,
           current_setting('max_parallel_workers')::int AS max_parallel_workers,
           current_setting('max_parallel_workers_per_gather')::int AS max_parallel_workers_per_gather,
           current_setting('max_parallel_maintenance_workers', true)::int AS max_parallel_maintenance_workers,
           current_setting('max_replication_slots')::int AS max_replication_slots,
           current_setting('max_wal_senders')::int AS max_wal_senders,
           current_setting('block_size')::int AS block_size,
           current_setting('wal_block_size')::int AS wal_block_size,
           pg_size_bytes(current_setting('segment_size')) AS segment_size,
           pg_size_bytes(current_setting('wal_segment_size')) AS wal_segment_size,
           CASE current_setting('data_checksums') WHEN 'on' THEN 1 ELSE 0 END AS data_checksums,
           CASE current_setting('wal_log_hints') WHEN 'on' THEN 1 ELSE 0 END AS wal_log_hints,
           CASE current_setting('fsync') WHEN 'on' THEN 1 ELSE 0 END AS fsync,
           CASE current_setting('full_page_writes') WHEN 'on' THEN 1 ELSE 0 END AS full_page_writes,
           CASE current_setting('wal_level') WHEN 'logical' THEN 3 WHEN 'replica' THEN 2 WHEN 'minimal' THEN 1 ELSE 0 END AS wal_level,
           pg_size_bytes(current_setting('min_wal_size')) AS min_wal_size,
           pg_size_bytes(current_setting('max_wal_size')) AS max_wal_size,
           pg_size_bytes(current_setting('max_slot_wal_keep_size', true)) AS max_slot_wal_keep_size,
           pg_size_bytes(current_setting('shared_buffers')) AS shared_buffers,
           pg_size_bytes(current_setting('work_mem')) AS work_mem,
           pg_size_bytes(current_setting('maintenance_work_mem')) AS maintenance_work_mem,
           pg_size_bytes(current_setting('effective_cache_size')) AS effective_cache_size,
           pg_size_bytes(current_setting('shared_memory_size', true)) AS shared_memory_size,
           CASE current_setting('huge_pages_status', true) WHEN 'on' THEN 1 WHEN 'off' THEN 0 WHEN 'unknown' THEN -1 ELSE NULL END AS hugepage_status,
           current_setting('shared_memory_size_in_huge_pages', true)::int AS hugepage_count,
           CASE current_setting('archive_mode') WHEN 'off' THEN 0 WHEN 'on' THEN 1 WHEN 'always' THEN 2 ELSE -1 END AS archive_mode,
           CASE current_setting('autovacuum') WHEN 'on' THEN 1 ELSE 0 END AS autovacuum,
           current_setting('autovacuum_max_workers')::int AS autovacuum_max_workers,
           extract(epoch from current_setting('checkpoint_timeout')::interval)::int AS checkpoint_timeout,
           current_setting('checkpoint_completion_target')::float AS checkpoint_completion_target,
           CASE current_setting('hot_standby') WHEN 'on' THEN 1 ELSE 0 END AS hot_standby,
           CASE current_setting('synchronous_commit') WHEN 'off' THEN 0 WHEN 'local' THEN 1 WHEN 'remote_write' THEN 2 WHEN 'on' THEN 3 WHEN 'remote_apply' THEN 4 ELSE -1 END AS synchronous_commit,
           CASE current_setting('io_method', true) WHEN 'sync' THEN 0 WHEN 'worker' THEN 1 WHEN 'io_uring' THEN 2 ELSE NULL END AS io_method;
  ttl: 10
  min_version: 100000
  tags: [ cluster ]
  metrics:
    - max_connections: { usage: GAUGE ,description: "maximum number of concurrent connections to the database server" }
    - max_prepared_transactions: { usage: GAUGE ,description: "maximum number of transactions that can be in the prepared state simultaneously" }
    - max_locks_per_transaction: { usage: GAUGE ,description: "maximum number of locks per transaction" }
    - max_worker_processes: { usage: GAUGE ,description: "maximum number of background processes" }
    - max_parallel_workers: { usage: GAUGE ,description: "maximum number of parallel workers that can be active at one time" }
    - max_parallel_workers_per_gather: { usage: GAUGE ,description: "maximum number of parallel workers per Gather node" }
    - max_parallel_maintenance_workers: { usage: GAUGE ,description: "maximum number of parallel maintenance workers (PG11+, NULL on older)" }
    - max_replication_slots: { usage: GAUGE ,description: "maximum number of replication slots" }
    - max_wal_senders: { usage: GAUGE ,description: "maximum number of concurrent WAL sender connections" }
    - block_size: { usage: GAUGE ,description: "database block size in bytes (default 8192)" }
    - wal_block_size: { usage: GAUGE ,description: "WAL block size in bytes" }
    - segment_size: { usage: GAUGE ,description: "database file segment size in bytes" }
    - wal_segment_size: { usage: GAUGE ,description: "WAL segment size in bytes" }
    - data_checksums: { usage: GAUGE ,description: "data checksums enabled, 1=on 0=off" }
    - wal_log_hints: { usage: GAUGE ,description: "WAL log hints enabled, 1=on 0=off" }
    - fsync: { usage: GAUGE ,description: "fsync enabled (CRITICAL for data safety), 1=on 0=off" }
    - full_page_writes: { usage: GAUGE ,description: "full page writes enabled, 1=on 0=off" }
    - wal_level: { usage: GAUGE ,description: "WAL level, 1=minimal 2=replica 3=logical" }
    - min_wal_size: { usage: GAUGE ,description: "minimum WAL size in bytes" }
    - max_wal_size: { usage: GAUGE ,description: "maximum WAL size in bytes" }
    - max_slot_wal_keep_size: { usage: GAUGE ,description: "maximum WAL size retained by replication slots in bytes (PG13+, NULL on older)" }
    - shared_buffers: { usage: GAUGE ,description: "shared buffer size in bytes" }
    - work_mem: { usage: GAUGE ,description: "work memory size in bytes" }
    - maintenance_work_mem: { usage: GAUGE ,description: "maintenance work memory size in bytes" }
    - effective_cache_size: { usage: GAUGE ,description: "planner's assumption about effective OS cache size in bytes" }
    - shared_memory_size: { usage: GAUGE ,description: "total shared memory size in bytes (PG13+, NULL on older)" }
    - hugepage_status: { usage: GAUGE ,description: "huge pages status, 1=on 0=off -1=unknown NULL=unavailable (PG14+)" }
    - hugepage_count: { usage: GAUGE ,description: "number of huge pages needed for shared memory (PG14+, NULL on older)" }
    - archive_mode: { usage: GAUGE ,description: "archive mode, 0=off 1=on 2=always" }
    - autovacuum: { usage: GAUGE ,description: "autovacuum enabled, 1=on 0=off" }
    - autovacuum_max_workers: { usage: GAUGE ,description: "maximum number of autovacuum worker processes" }
    - checkpoint_timeout: { usage: GAUGE ,description: "checkpoint timeout in seconds" }
    - checkpoint_completion_target: { usage: GAUGE ,description: "checkpoint completion target (0.0-1.0)" }
    - hot_standby: { usage: GAUGE ,description: "hot standby mode enabled, 1=on 0=off" }
    - synchronous_commit: { usage: GAUGE ,description: "synchronous commit level, 0=off 1=local 2=remote_write 3=on 4=remote_apply" }
    - io_method: { usage: GAUGE ,description: "I/O method (PG18+), 0=sync 1=worker 2=io_uring NULL=unavailable" }
================================================
FILE: config/0210-pg_repl.yml
================================================
#==============================================================#
# 0210 pg_repl
#==============================================================#
# Per-walsender replication status from pg_stat_replication, one row per
# connected standby. The subquery `current` picks the comparison LSN:
# replay LSN when this server is itself in recovery, write LSN otherwise,
# so the *_diff columns measure lag against the right baseline.
# The 12+ variant adds reply_time (column introduced in PG12).
pg_repl_12:
  name: pg_repl
  desc: PostgreSQL replication stat metrics 12+
  query: |
    SELECT application_name AS appname,
           usename,
           coalesce(client_addr::TEXT,'localhost') AS address,
           pid::TEXT,
           client_port,
           CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
           CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state,
           sync_priority,
           backend_xmin::TEXT::BIGINT AS backend_xmin,
           current.lsn - '0/0' AS lsn,
           current.lsn - sent_lsn AS sent_diff,
           current.lsn - write_lsn AS write_diff,
           current.lsn - flush_lsn AS flush_diff,
           current.lsn - replay_lsn AS replay_diff,
           sent_lsn - '0/0' AS sent_lsn,
           write_lsn - '0/0' AS write_lsn,
           flush_lsn - '0/0' AS flush_lsn,
           replay_lsn - '0/0' AS replay_lsn,
           coalesce(extract(EPOCH FROM write_lag), 0) AS write_lag,
           coalesce(extract(EPOCH FROM flush_lag), 0) AS flush_lag,
           coalesce(extract(EPOCH FROM replay_lag), 0) AS replay_lag,
           extract(EPOCH FROM current_timestamp) AS "time",
           extract(EPOCH FROM backend_start) AS launch_time,
           extract(EPOCH FROM reply_time) AS reply_time
    FROM pg_stat_replication,
         (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END AS lsn) current;
  ttl: 10
  min_version: 120000
  tags: [ cluster ]
  metrics:
    - appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" }
    - usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" }
    - address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" }
    - pid: { usage: LABEL ,description: "Process ID of the WAL sender process" }
    - client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" }
    - state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" }
    - sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" }
    - sync_priority: { usage: GAUGE ,description: "Priority of this standby server for being chosen as the synchronous standby" }
    - backend_xmin: { usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback." }
    - lsn: { usage: COUNTER ,description: "Current log position on this server" }
    - sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" }
    - write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" }
    - flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" }
    - replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" }
    - sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" }
    - write_lsn: { usage: COUNTER ,description: "Last write-ahead log location written to disk by this standby server" }
    - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" }
    - replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" }
    - write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it" }
    - flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it" }
    - replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it" }
    - time: { usage: COUNTER ,description: "Current timestamp in unix epoch" }
    - launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client connected to this WAL sender" }
    - reply_time: { usage: GAUGE ,description: "Send time of last reply message received from standby server" }

# Variant for PG 10/11 ([100000, 120000)): identical to pg_repl_12
# except reply_time is absent (not available before PG12).
pg_repl_10:
  name: pg_repl
  desc: PostgreSQL replication stat metrics v10 v11
  query: |
    SELECT application_name AS appname,
           usename,
           coalesce(client_addr::TEXT,'localhost') AS address,
           pid::TEXT,
           client_port,
           CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
           CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state,
           sync_priority,
           backend_xmin::TEXT::BIGINT AS backend_xmin,
           current.lsn - '0/0' AS lsn,
           current.lsn - sent_lsn AS sent_diff,
           current.lsn - write_lsn AS write_diff,
           current.lsn - flush_lsn AS flush_diff,
           current.lsn - replay_lsn AS replay_diff,
           sent_lsn - '0/0' AS sent_lsn,
           write_lsn - '0/0' AS write_lsn,
           flush_lsn - '0/0' AS flush_lsn,
           replay_lsn - '0/0' AS replay_lsn,
           coalesce(extract(EPOCH FROM write_lag), 0) AS write_lag,
           coalesce(extract(EPOCH FROM flush_lag), 0) AS flush_lag,
           coalesce(extract(EPOCH FROM replay_lag), 0) AS replay_lag,
           extract(EPOCH FROM current_timestamp) AS "time",
           extract(EPOCH FROM backend_start) AS launch_time
    FROM pg_stat_replication,
         (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END AS lsn) current;
  ttl: 10
  min_version: 100000
  max_version: 120000
  tags: [ cluster ]
  metrics:
    - appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" }
    - usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" }
    - address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" }
    - pid: { usage: LABEL ,description: "Process ID of the WAL sender process" }
    - client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" }
    - state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" }
    - sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" }
    - sync_priority: { usage: GAUGE ,description: "Priority of this standby server for being chosen as the synchronous standby" }
    - backend_xmin: { usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback." }
    - lsn: { usage: COUNTER ,description: "Current log position on this server" }
    - sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" }
    - write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" }
    - flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" }
    - replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" }
    - sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" }
    - write_lsn: { usage: COUNTER ,description: "Last write-ahead log location written to disk by this standby server" }
    - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" }
    - replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" }
    - write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it" }
    - flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it" }
    - replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it" }
    - time: { usage: COUNTER ,description: "Current timestamp in unix epoch" }
    - launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client connected to this WAL sender" }

================================================
FILE: config/0220-pg_sync_standby.yml
================================================
#==============================================================#
# 0220 pg_sync_standby
#==============================================================#
# Exposes synchronous_standby_names as a label plus an enabled flag
# (1 iff the setting is non-empty).
pg_sync_standby:
  name: pg_sync_standby
  desc: PostgreSQL synchronous standby status and names
  query: |
    SELECT CASE WHEN names <> '' THEN names ELSE '' END AS names,
           CASE WHEN names <> '' THEN 1 ELSE 0 END AS enabled
    FROM (SELECT current_setting('synchronous_standby_names') AS names) n;
  ttl: 10
  min_version: 090400
  tags: [ cluster ]
  metrics:
    - names: { usage: LABEL ,description: "List of standby servers that can support synchronous replication, if not enabled" }
    - enabled: { usage: GAUGE ,description: "Synchronous commit enabled, 1 if enabled, 0 if disabled" }

================================================
FILE: config/0230-pg_downstream.yml
================================================
#==============================================================#
# 0230 pg_downstream
#==============================================================#
pg_downstream:
  name: pg_downstream
  desc: PostgreSQL replication client count group by state
query: | SELECT l.state, coalesce(count, 0 ) AS count FROM unnest(ARRAY ['streaming','startup','catchup', 'backup', 'stopping']) l(state) LEFT JOIN (SELECT state, count(*) AS count FROM pg_stat_replication GROUP BY state)r ON l.state = r.state; ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - state: { usage: LABEL ,description: "Replication client state, could be one of startup|catchup|streaming|backup|stopping" } - count: { usage: GAUGE ,description: "Count of corresponding state" } ================================================ FILE: config/0240-pg_slot.yml ================================================ #==============================================================# # 0240 pg_slot #==============================================================# pg_slot_17: name: pg_slot desc: PostgreSQL replication slot metrics v17, slot also exists on standby query: |- SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid, active,temporary,two_phase,conflicting,failover,synced, xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes, safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status, spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time, extract(EPOCH FROM inactive_since) AS inactive_since, CASE invalidation_reason WHEN 'wal_removed' THEN 1 WHEN 'rows_removed' THEN 2 WHEN 'wal_level_insufficient' THEN 3 ELSE 0 END AS invalidation_reason FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name; ttl: 10 min_version: 170000 tags: [ cluster ] metrics: - slot_name: { usage: LABEL ,description: "A unique, 
cluster-wide identifier for the replication slot" }
    - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
    - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
    - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
    - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
    - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." }
    - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
    - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." }
    - two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." }
    - conflicting: { usage: GAUGE ,description: "True(1) if this logical slot conflicted with recovery. Always NULL for physical slots." }
    - failover: { usage: GAUGE ,description: "True(1) if this is a logical slot enabled to be synced to the standbys" }
    - synced: { usage: GAUGE ,description: "True(1) if this is a logical slot that was synced from a primary server" }
    - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
    - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
    - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
    - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
    - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }
    - safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" }
    - wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" }
    - spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" }
    - spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" }
    - spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" }
    - stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" }
    - stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" }
    - stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" }
    - total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" }
    - total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" }
    - reset_time: { usage: GAUGE ,description: "When statistics were last reset" }
    - invalidation_reason: { usage: GAUGE ,description: "ok=0, wal_removed=1, rows_removed=2, wal_level_insufficient=3" }
    - inactive_since: { usage: GAUGE ,description: "The time when the slot became inactive" }

pg_slot_16:
  name: pg_slot
  desc: PostgreSQL replication slot metrics v16 with conflicting, now slot also exists on standby
  query: |-
    SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid,
    active,temporary,two_phase,conflicting,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin,
    restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn,
    CASE WHEN
pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes, safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status, spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name; ttl: 10 min_version: 160000 max_version: 170000 tags: [ cluster ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." } - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" } - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." } - two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." } - conflicting: { usage: GAUGE ,description: "True if this logical slot conflicted with recovery. Always NULL for physical slots." } - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." 
} - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." } - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." } - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" } - safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" } - wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" } - spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" } - spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" } - spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" } - stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" } - total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" } - total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } pg_slot_14: name: pg_slot desc: PostgreSQL replication slot metrics v14 with pg_stat_replication_slots metrics query: |- SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid, 
active,temporary,two_phase,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes, safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status, spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name; ttl: 10 min_version: 140000 max_version: 160000 tags: [ cluster, primary ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." } - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" } - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." } - two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." } - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." 
} - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." } - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." } - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" } - safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" } - wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" } - spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" } - spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" } - spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" } - stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" } - total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" } - total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } pg_slot_13: name: pg_slot desc: PostgreSQL replication slot metrics v13 (wal safe size and status) query: |- SELECT slot_name, slot_type, plugin, database AS datname,datoid,active_pid, 
active,temporary,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes, safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status FROM pg_replication_slots; ttl: 10 min_version: 130000 max_version: 140000 tags: [ cluster, primary ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." } - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" } - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." } - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." } - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." 
} - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." } - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" } - safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" } - wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" } pg_slot_10: name: pg_slot desc: PostgreSQL replication slot metrics 10 ~ 12 query: |- SELECT slot_name, slot_type, plugin, database AS datname,datoid,active_pid, active,temporary,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes FROM pg_replication_slots; ttl: 10 min_version: 100000 max_version: 130000 tags: [ cluster, primary ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." 
}
    - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
    - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." }
    - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
    - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
    - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
    - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
    - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }


================================================
FILE: config/0250-pg_recv.yml
================================================
#==============================================================#
# 0250 pg_recv
#==============================================================#
pg_recv_13:
  name: pg_recv
  desc: PostgreSQL walreceiver metrics 13+
  query: |-
    SELECT coalesce(sender_host, (regexp_match(conninfo, '.*host=(\S+).*'))[1]) AS sender_host,
    coalesce(sender_port::TEXT, (regexp_match(conninfo, '.*port=(\S+).*'))[1]) AS sender_port,
    coalesce(slot_name, 'NULL') AS slot_name, pid,
    CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
    receive_start_lsn - '0/0' AS init_lsn,receive_start_tli AS init_tli,
    flushed_lsn - '0/0' AS flush_lsn,written_lsn - '0/0' AS write_lsn, received_tli AS flush_tli,
    latest_end_lsn - '0/0' AS reported_lsn,
    last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time
    FROM pg_stat_wal_receiver;
  ttl: 10
  min_version: 130000
  tags: [ cluster, replica ]
  metrics:
    - sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" }
    - sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." }
    - slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" }
    - pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" }
    - state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" }
    - init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" }
    - init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" }
    - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" }
    - write_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and written to disk, but not flushed." }
    - flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" }
    - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
    - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
    - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
    - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
    - time: { usage: GAUGE ,description: "Time of current snapshot" }

pg_recv_11:
  name: pg_recv
  desc: PostgreSQL walreceiver metrics (11-12)
  query: |-
    SELECT coalesce(sender_host, (regexp_match(conninfo, '.*host=(\S+).*'))[1]) AS sender_host,
    coalesce(sender_port::TEXT, (regexp_match(conninfo, '.*port=(\S+).*'))[1]) AS sender_port,
    coalesce(slot_name, 'NULL') AS slot_name, pid,
    CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN
'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, receive_start_lsn - '0/0' AS init_lsn,receive_start_tli AS init_tli, received_lsn - '0/0' AS flush_lsn, received_tli AS flush_tli, latest_end_lsn - '0/0' AS reported_lsn, last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time FROM pg_stat_wal_receiver; ttl: 10 tags: [ cluster, replica ] min_version: 110000 max_version: 130000 metrics: - sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" } - sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." } - slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" } - pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" } - state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" } - init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" } - init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" } - flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } - time: { usage: GAUGE ,description: "Time of current 
snapshot" } pg_recv_10: name: pg_recv desc: PostgreSQL walreceiver metrics (10) query: |- SELECT (regexp_match(conninfo, '.*host=(\S+).*'))[1] AS sender_host, (regexp_match(conninfo, '.*port=(\S+).*'))[1] AS sender_port, coalesce(slot_name, 'NULL') AS slot_name, pid, CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, receive_start_lsn - '0/0' AS init_lsn,receive_start_tli AS init_tli, received_lsn - '0/0' AS flush_lsn, received_tli AS flush_tli, latest_end_lsn - '0/0' AS reported_lsn, last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time FROM pg_stat_wal_receiver; ttl: 10 tags: [ cluster, replica ] min_version: 100000 max_version: 110000 metrics: - sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" } - sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." 
}
    - slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" }
    - pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" }
    - state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" }
    - init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" }
    - init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" }
    - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" }
    - flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" }
    - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
    - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
    - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
    - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
    - time: { usage: GAUGE ,description: "Time of current snapshot" }


================================================
FILE: config/0260-pg_sub.yml
================================================
#==============================================================#
# 0260 pg_sub
#==============================================================#
pg_sub_16:
  name: pg_sub
  desc: PostgreSQL subscription statistics (16+)
  query: |-
    SELECT s1.subname, subid AS id, pid, received_lsn, reported_lsn, msg_send_time, msg_recv_time, reported_time, apply_error_count, sync_error_count
    FROM (SELECT subname, subid, pid, received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn,
    extract(epoch from last_msg_send_time) AS msg_send_time, extract(epoch from last_msg_receipt_time) AS msg_recv_time,
    extract(epoch from latest_end_time) AS reported_time FROM pg_stat_subscription WHERE relid IS NULL AND leader_pid IS NULL) s1
    LEFT OUTER JOIN pg_stat_subscription_stats s2 USING(subid);
  ttl: 10
  min_version: 160000
  tags: [ cluster ]
  metrics:
    - subname: { usage: LABEL ,description: "Name of this subscription" }
    - id: { usage: GAUGE ,description: "OID of the subscription" }
    - pid: { usage: GAUGE ,description: "Process ID of the subscription leader apply worker" }
    - received_lsn: { usage: COUNTER ,description: "Last write-ahead log location received" }
    - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
    - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
    - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
    - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
    - apply_error_count: { usage: COUNTER ,description: "Number of times an error occurred while applying changes" }
    - sync_error_count: { usage: COUNTER ,description: "Number of times an error occurred during the initial table synchronization" }

pg_sub_15:
  name: pg_sub
  desc: PostgreSQL subscription statistics (15)
  query: |-
    SELECT s1.subname, subid AS id, pid, received_lsn, reported_lsn, msg_send_time, msg_recv_time, reported_time, apply_error_count, sync_error_count
    FROM (SELECT subname, subid, pid, received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn,
    extract(epoch from last_msg_send_time) AS msg_send_time, extract(epoch from last_msg_receipt_time) AS msg_recv_time,
    extract(epoch from latest_end_time) AS reported_time FROM pg_stat_subscription WHERE relid ISNULL) s1
    LEFT OUTER JOIN pg_stat_subscription_stats s2 USING(subid);
  ttl: 10
  min_version: 150000
  max_version: 160000
  tags: [ cluster ]
  metrics:
    - subname: { usage: LABEL ,description:
"Name of this subscription" } - id: { usage: GAUGE ,description: "OID of the subscription" } - pid: { usage: GAUGE ,description: "Process ID of the subscription main apply worker process" } - received_lsn: { usage: COUNTER ,description: "Last write-ahead log location received" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } - apply_error_count: { usage: COUNTER ,description: "Number of times an error occurred while applying changes." } - sync_error_count: { usage: COUNTER ,description: "Number of times an error occurred during the initial table synchronization" } pg_sub_10: name: pg_sub desc: PostgreSQL subscription statistics (10-14) query: |- SELECT subname, subid AS id, pid, received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn, extract(epoch from last_msg_send_time) AS msg_send_time, extract(epoch from last_msg_receipt_time) AS msg_recv_time, extract(epoch from latest_end_time) AS reported_time FROM pg_stat_subscription WHERE relid ISNULL; ttl: 10 min_version: 100000 max_version: 150000 tags: [ cluster ] metrics: - subname: { usage: LABEL ,description: "Name of this subscription" } - id: { usage: GAUGE ,description: "OID of the subscription" } - pid: { usage: GAUGE ,description: "Process ID of the subscription main apply worker process" } - received_lsn: { usage: COUNTER ,description: "Last write-ahead log location received" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - 
msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } ================================================ FILE: config/0270-pg_origin.yml ================================================ #==============================================================# # 0270 pg_origin #==============================================================# # skip by default, require additional privilege setup # GRANT SELECT ON pg_replication_origin, pg_replication_origin_status TO pg_monitor; pg_origin: name: pg_origin desc: PostgreSQL replay state (approximate) for a certain origin query: SELECT roname, remote_lsn - '0/0' AS remote_lsn, local_lsn - '0/0' AS local_lsn FROM pg_replication_origin o LEFT JOIN pg_replication_origin_status os ON o.roident = os.local_id; ttl: 10 min_version: 090500 skip: true tags: [ cluster ] metrics: - roname: { usage: LABEL ,description: "The external, user defined, name of a replication origin." } - remote_lsn: { usage: COUNTER ,description: "The origin node's LSN up to which data has been replicated." } - local_lsn: { usage: COUNTER ,description: "This node's LSN at which remote_lsn has been replicated." 
}


================================================
FILE: config/0300-pg_io.yml
================================================
#==============================================================#
# 0300 pg_io
#==============================================================#
pg_io_18:
  name: pg_io
  desc: PostgreSQL I/O stats since v18
  query: |-
    SELECT backend_type AS "type",object,context,reads,read_bytes,read_time,writes,write_bytes,write_time,writebacks,writeback_time,
    extends,extend_bytes,extend_time,hits,evictions,reuses,fsyncs,fsync_time,extract(EPOCH FROM stats_reset) AS reset_time
    FROM pg_stat_io;
  ttl: 10
  timeout: 1
  min_version: 180000
  tags: [ cluster ]
  metrics:
    - type: { usage: LABEL ,description: "Type of backend" }
    - object: { usage: LABEL ,description: "Target object of an I/O operation, relation or temp" }
    - context: { usage: LABEL ,description: "The context of an I/O operation. normal,vacuum,bulkread,bulkwrite" }
    - reads: { usage: COUNTER ,default: 0 ,description: "Number of read operations, each of the size specified in op_bytes." }
    - read_bytes: { usage: COUNTER ,default: 0 ,description: "Number of read bytes" }
    - read_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in read operations in seconds" }
    - writes: { usage: COUNTER ,default: 0 ,description: "Number of write operations, each of the size specified in op_bytes." }
    - write_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in write operations in seconds" }
    - write_bytes: { usage: COUNTER ,default: 0 ,description: "Number of write bytes" }
    - writebacks: { usage: COUNTER ,default: 0 ,description: "Number of units of size op_bytes which the process requested the kernel write out to permanent storage." }
    - writeback_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in writeback operations in seconds" }
    - extends: { usage: COUNTER ,default: 0 ,description: "Number of relation extend operations, each of the size specified in op_bytes." }
    - extend_bytes: { usage: COUNTER ,default: 0 ,description: "Number of extend bytes" }
    - extend_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in extend operations in seconds" }
    - hits: { usage: COUNTER ,default: 0 ,description: "The number of times a desired block was found in a shared buffer." }
    - evictions: { usage: COUNTER ,default: 0 ,description: "Number of times a block has been written out from a shared or local buffer" }
    - reuses: { usage: COUNTER ,default: 0 ,description: "The number of times an existing buffer is reused" }
    - fsyncs: { usage: COUNTER ,default: 0 ,description: "Number of fsync calls. These are only tracked in context normal" }
    - fsync_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in fsync operations in seconds" }
    - reset_time: { usage: GAUGE ,description: "Timestamp at which these statistics were last reset" }

pg_io_16:
  name: pg_io
  desc: PostgreSQL I/O stats
  query: |-
    SELECT backend_type AS "type", object, context, reads, read_time,writes,write_time,writebacks,writeback_time,extends,
    extend_time,hits,evictions,reuses,fsyncs,fsync_time,extract(EPOCH FROM stats_reset) AS reset_time
    FROM pg_stat_io;
  ttl: 10
  timeout: 1
  min_version: 160000
  max_version: 180000
  tags: [ cluster ]
  metrics:
    - type: { usage: LABEL ,description: "Type of backend" }
    - object: { usage: LABEL ,description: "Target object of an I/O operation, relation or temp" }
    - context: { usage: LABEL ,description: "The context of an I/O operation. normal,vacuum,bulkread,bulkwrite" }
    - reads: { usage: COUNTER ,default: 0 ,description: "Number of read operations, each of the size specified in op_bytes." }
    - read_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in read operations in seconds" }
    - writes: { usage: COUNTER ,default: 0 ,description: "Number of write operations, each of the size specified in op_bytes."
} - write_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in write operations in seconds" } - writebacks: { usage: COUNTER ,default: 0 ,description: "Number of units of size op_bytes which the process requested the kernel write out to permanent storage." } - writeback_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in writeback operations in seconds" } - extends: { usage: COUNTER ,default: 0 ,description: "Number of relation extend operations, each of the size specified in op_bytes." } - extend_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in extend operations in seconds" } - hits: { usage: COUNTER ,default: 0 ,description: "The number of times a desired block was found in a shared buffer." } - evictions: { usage: COUNTER ,default: 0 ,description: "Number of times a block has been written out from a shared or local buffer" } - reuses: { usage: COUNTER ,default: 0 ,description: "The number of times an existing buffer is reused" } - fsyncs: { usage: COUNTER ,default: 0 ,description: "Number of fsync calls. 
These are only tracked in context normal" } - fsync_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in fsync operations in seconds" } - reset_time: { usage: GAUGE ,description: "Timestamp at which these statistics were last reset" } ================================================ FILE: config/0310-pg_size.yml ================================================ #==============================================================# # 0310 pg_size #==============================================================# pg_size: name: pg_size desc: PostgreSQL Database, WAL, Log size since v10 query: |- SELECT datname, pg_database_size(oid) AS bytes FROM pg_database UNION ALL SELECT 'log', CASE WHEN current_setting('logging_collector') = 'on' THEN COALESCE((SELECT SUM(size) FROM pg_catalog.pg_ls_logdir()), 0) ELSE 0 END UNION ALL SELECT 'wal', COALESCE((SELECT SUM(size) FROM pg_catalog.pg_ls_waldir()), 0); ttl: 60 timeout: 1 min_version: 100000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Database name, or special category wal, or log" } - bytes: { usage: GAUGE ,description: "File size in bytes" } ================================================ FILE: config/0320-pg_archiver.yml ================================================ #==============================================================# # 0320 pg_archiver #==============================================================# pg_archiver: name: pg_archiver desc: PostgreSQL archiver process statistics query: |- SELECT archived_count AS finish_count,failed_count, extract(epoch FROM last_archived_time) AS finish_time, extract(epoch FROM last_failed_time) AS failed_time, extract(epoch FROM stats_reset) AS reset_time FROM pg_stat_archiver; ttl: 60 min_version: 090400 tags: [ cluster ] metrics: - finish_count: { usage: COUNTER ,description: "Number of WAL files that have been successfully archived" } - failed_count: { usage: COUNTER ,description: "Number of failed attempts for archiving WAL 
files" } - finish_time: { usage: GAUGE ,description: "Time of the last successful archive operation" } - failed_time: { usage: GAUGE ,description: "Time of the last failed archival operation" } - reset_time: { usage: GAUGE ,description: "Time at which archive statistics were last reset" } ================================================ FILE: config/0330-pg_bgwriter.yml ================================================ #==============================================================# # 0330 pg_bgwriter #==============================================================# # https://pgpedia.info/p/pg_stat_bgwriter.html pg_bgwriter_17: name: pg_bgwriter desc: "PostgreSQL background writer metrics PG 17+" query: SELECT buffers_clean, maxwritten_clean, buffers_alloc, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 170000 tags: [ cluster ] metrics: - buffers_clean: { usage: COUNTER ,description: "Number of buffers written by the background writer" } - maxwritten_clean: { usage: COUNTER ,description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" } - buffers_alloc: { usage: COUNTER ,description: "Number of buffers allocated" } - reset_time: { usage: GAUGE ,description: "Time at which bgwriter statistics were last reset" } pg_bgwriter_10: name: pg_bgwriter desc: "PostgreSQL background writer metrics (PG 9.4-16)" query: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, buffers_clean, buffers_backend, maxwritten_clean, buffers_backend_fsync, buffers_alloc, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 090400 max_version: 170000 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,description: "Number of requested checkpoints that have been performed" } - 
checkpoint_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - checkpoint_sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - buffers_checkpoint: { usage: COUNTER ,description: "Number of buffers written during checkpoints" } - buffers_clean: { usage: COUNTER ,description: "Number of buffers written by the background writer" } - buffers_backend: { usage: COUNTER ,description: "Number of buffers written directly by a backend" } - maxwritten_clean: { usage: COUNTER ,description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" } - buffers_backend_fsync: { usage: COUNTER ,description: "Number of times a backend had to execute its own fsync call" } - buffers_alloc: { usage: COUNTER ,description: "Number of buffers allocated" } - reset_time: { usage: GAUGE ,description: "Time at which bgwriter statistics were last reset" } ================================================ FILE: config/0331-pg_checkpointer.yml ================================================ #==============================================================# # 0331 pg_checkpointer #==============================================================# pg_checkpointer_18: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 18+" query: SELECT num_timed, num_requested, num_done, restartpoints_timed, restartpoints_req, restartpoints_done, write_time, sync_time, buffers_written, slru_written, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_checkpointer; ttl: 10 min_version: 180000 tags: [ cluster ] metrics: - num_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - num_requested: { usage: COUNTER 
,rename: req ,description: "Number of requested checkpoints that have been performed" } - num_done: { usage: COUNTER ,rename: done ,description: "Number of checkpoints that have been performed" } - restartpoints_timed: { usage: COUNTER ,description: "Number of scheduled restartpoints due to timeout or after a failed attempt to perform it" } - restartpoints_req: { usage: COUNTER ,description: "Number of requested restartpoints" } - restartpoints_done: { usage: COUNTER ,description: "Number of restartpoints that have been performed" } - write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - buffers_written: { usage: COUNTER ,description: "Number of buffers written during checkpoints and restartpoints" } - slru_written: { usage: COUNTER ,description: "Number of SLRU buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } pg_checkpointer_17: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 17" query: SELECT num_timed, num_requested, restartpoints_timed, restartpoints_req, restartpoints_done, write_time, sync_time, buffers_written, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_checkpointer; ttl: 10 min_version: 170000 max_version: 180000 tags: [ cluster ] metrics: - num_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - num_requested: { usage: COUNTER ,rename: req ,description: "Number of requested checkpoints that have been performed" } - restartpoints_timed: { usage: COUNTER ,description: "Number of scheduled restartpoints due to timeout or after a 
failed attempt to perform it" } - restartpoints_req: { usage: COUNTER ,description: "Number of requested restartpoints" } - restartpoints_done: { usage: COUNTER ,description: "Number of restartpoints that have been performed" } - write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - buffers_written: { usage: COUNTER ,description: "Number of buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } pg_checkpointer_10: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 9.4-16" query: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 090400 max_version: 170000 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,rename: req ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,rename: write_time ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - checkpoint_sync_time: { usage: COUNTER ,rename: sync_time ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - buffers_checkpoint: { usage: COUNTER ,rename: buffers_written ,description: "Number of buffers written during checkpoints 
and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } ================================================ FILE: config/0340-pg_ssl.yml ================================================ #==============================================================# # 0340 pg_ssl #==============================================================# pg_ssl: name: pg_ssl desc: PostgreSQL SSL client connection count query: | SELECT count(*) FILTER (WHERE ssl) AS enabled, count(*) FILTER ( WHERE NOT ssl) AS disabled FROM pg_stat_ssl; ttl: 10 min_version: 090500 tags: [ cluster ] metrics: - enabled: { usage: GAUGE ,description: "Number of client connection that use ssl" } - disabled: { usage: GAUGE ,description: "Number of client connection that does not use ssl" } ================================================ FILE: config/0350-pg_checkpoint.yml ================================================ #==============================================================# # 0350 pg_checkpoint #==============================================================# pg_checkpoint: name: pg_checkpoint desc: checkpoint information from pg_control_checkpoint since 10 query: |- SELECT checkpoint_lsn - '0/0' AS checkpoint_lsn, redo_lsn - '0/0' AS redo_lsn, timeline_id AS tli, prev_timeline_id AS prev_tli, full_page_writes, split_part(next_xid, ':', 1) AS next_xid_epoch, split_part(next_xid, ':', 2) AS next_xid, next_oid::BIGINT, next_multixact_id::text::BIGINT, next_multi_offset::text::BIGINT, oldest_xid::text::BIGINT, oldest_xid_dbid::text::BIGINT, oldest_active_xid::text::BIGINT, oldest_multi_xid::text::BIGINT, oldest_multi_dbid::BIGINT, oldest_commit_ts_xid::text::BIGINT, newest_commit_ts_xid::text::BIGINT, checkpoint_time AS time, extract(epoch from now() - checkpoint_time) AS elapse FROM pg_control_checkpoint(); ttl: 60 min_version: 100000 tags: [ cluster ] metrics: - checkpoint_lsn: { usage: COUNTER ,description: "Latest checkpoint location" } - 
redo_lsn: { usage: COUNTER ,description: "Latest checkpoint's REDO location" } - tli: { usage: COUNTER ,description: "Latest checkpoint's TimeLineID" } - prev_tli: { usage: COUNTER ,description: "Latest checkpoint's PrevTimeLineID" } - full_page_writes: { usage: GAUGE ,description: "Latest checkpoint's full_page_writes enabled" } - next_xid_epoch: { usage: COUNTER ,description: "Latest checkpoint's NextXID epoch" } - next_xid: { usage: COUNTER ,description: "Latest checkpoint's NextXID xid" } - next_oid: { usage: COUNTER ,description: "Latest checkpoint's NextOID" } - next_multixact_id: { usage: COUNTER ,description: "Latest checkpoint's NextMultiXactId" } - next_multi_offset: { usage: COUNTER ,description: "Latest checkpoint's NextMultiOffset" } - oldest_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestXID" } - oldest_xid_dbid: { usage: GAUGE ,description: "Latest checkpoint's oldestXID's DB OID" } - oldest_active_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestActiveXID" } - oldest_multi_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestMultiXid" } - oldest_multi_dbid: { usage: GAUGE ,description: "Latest checkpoint's oldestMulti's DB OID" } - oldest_commit_ts_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestCommitTsXid" } - newest_commit_ts_xid: { usage: COUNTER ,description: "Latest checkpoint's newestCommitTsXid" } - time: { usage: COUNTER ,description: "Time of latest checkpoint" } - elapse: { usage: GAUGE ,description: "Seconds elapsed since latest checkpoint in seconds" } ================================================ FILE: config/0355-pg_timeline.yml ================================================ #==============================================================# # 0355 pg_timeline #==============================================================# pg_timeline: name: pg_timeline desc: Current timeline ID from primary or replica query: | SELECT COALESCE( (SELECT received_tli FROM pg_stat_wal_receiver), 
(SELECT timeline_id FROM pg_control_checkpoint()) ) AS id; ttl: 10 min_version: 100000 tags: [ cluster ] metrics: - id: { usage: GAUGE ,description: "Current timeline ID" } ================================================ FILE: config/0360-pg_recovery.yml ================================================ #==============================================================# # 0360 pg_recovery #==============================================================# pg_recovery: name: pg_recovery desc: PostgreSQL control recovery metrics (9.6+) query: | SELECT min_recovery_end_timeline AS min_timeline, min_recovery_end_lsn - '0/0' AS min_lsn, backup_start_lsn - '0/0' AS backup_start_lsn, backup_end_lsn - '0/0' AS backup_end_lsn, end_of_backup_record_required AS require_record FROM pg_control_recovery(); ttl: 10 min_version: 090600 tags: [ cluster, replica ] metrics: - min_timeline: { usage: COUNTER ,description: "Min recovery ending loc's timeline" } - min_lsn: { usage: COUNTER ,description: "Minimum recovery ending location" } - backup_start_lsn: { usage: COUNTER ,description: "Backup start location" } - backup_end_lsn: { usage: COUNTER ,description: "Backup end location" } - require_record: { usage: GAUGE ,description: "End-of-backup record required" } pg_recovery_prefetch: name: pg_recovery_prefetch desc: PostgreSQL recovery prefetch metrics (15+) query: SELECT prefetch,hit,skip_init,skip_new,skip_fpw,skip_rep,wal_distance,block_distance,io_depth,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_recovery_prefetch; ttl: 10 min_version: 150000 tags: [ cluster, replica ] metrics: - prefetch: { usage: COUNTER ,description: "Number of blocks prefetched because they were not in the buffer pool" } - hit: { usage: COUNTER ,description: "Number of blocks not prefetched because they were already in the buffer pool" } - skip_init: { usage: COUNTER ,description: "Number of blocks not prefetched because they would be zero-initialized" } - skip_new: { usage: COUNTER ,description: 
"Number of blocks not prefetched because they didn't exist yet" } - skip_fpw: { usage: COUNTER ,description: "Number of blocks not prefetched because a full page image was included in the WAL" } - skip_rep: { usage: COUNTER ,description: "Number of blocks not prefetched because they were already recently prefetched" } - wal_distance: { usage: GAUGE ,description: "How many bytes ahead the prefetcher is looking" } - block_distance: { usage: GAUGE ,description: "How many blocks ahead the prefetcher is looking" } - io_depth: { usage: GAUGE ,description: "How many prefetches have been initiated but are not yet known to have completed" } - reset_time: { usage: GAUGE ,description: "Time at which these recovery prefetch statistics were last reset" } ================================================ FILE: config/0370-pg_slru.yml ================================================ #==============================================================# # 0370 pg_slru #==============================================================# pg_slru_13: name: pg_slru desc: PostgreSQL simple-least-recently-used (SLRU) cache statistics v13 query: SELECT name, blks_zeroed, blks_hit, blks_read, blks_written, blks_exists, flushes, truncates, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_slru; ttl: 60 min_version: 130000 tags: [ cluster ] metrics: - name: { usage: LABEL ,description: "Name of the SLRU" } - blks_zeroed: { usage: COUNTER ,description: "Number of blocks zeroed during initializations" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the SLRU, so that a read was not necessary" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read for this SLRU" } - blks_written: { usage: COUNTER ,description: "Number of disk blocks written for this SLRU" } - blks_exists: { usage: COUNTER ,description: "Number of blocks checked for existence for this SLRU" } - flushes: { usage: COUNTER ,description: "Number of flushes of dirty 
data for this SLRU" } - truncates: { usage: COUNTER ,description: "Number of truncates for this SLRU" } - reset_time: { usage: GAUGE ,description: "Time at which these statistics were last reset" } ================================================ FILE: config/0380-pg_shmem.yml ================================================ #==============================================================# # 0380 pg_shmem #==============================================================# # pg_shmem require su privilege to work. Disable it or create auxiliary function with su before use: # CREATE OR REPLACE FUNCTION monitor.pg_shmem() RETURNS SETOF pg_shmem_allocations AS $$ SELECT * FROM pg_shmem_allocations;$$ LANGUAGE SQL SECURITY DEFINER; pg_shmem: name: pg_shmem desc: Allocations made from the server's main shared memory segment query: SELECT coalesce(name, 'Free') AS name, off AS offset, size, allocated_size FROM monitor.pg_shmem(); ttl: 60 min_version: 130000 skip: true # disable it by default tags: [cluster, "schema:monitor" ] metrics: - name: { usage: LABEL ,description: "Name of the shared memory allocation" } - offset: { usage: GAUGE ,description: "The offset at which the allocation starts" } - size: { usage: GAUGE ,description: "Size of the allocation" } - allocated_size: { usage: GAUGE ,description: "Size of the allocation including padding" } ================================================ FILE: config/0390-pg_wal.yml ================================================ #==============================================================# # 0390 pg_wal #==============================================================# pg_wal_18: name: pg_wal desc: PostgreSQL WAL statistics since v18 with some col removed query: SELECT wal_records AS records, wal_fpi AS fpi, wal_bytes AS bytes, wal_buffers_full AS buffers_full,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_wal; ttl: 10 tags: [ cluster ] min_version: 180000 metrics: - records: { usage: COUNTER ,description: "Total number 
of WAL records generated" } - fpi: { usage: COUNTER ,description: "Total number of WAL full page images generated" } - bytes: { usage: COUNTER ,description: "Total amount of WAL generated in bytes" } - buffers_full: { usage: COUNTER ,description: "Number of times WAL data was written to disk because WAL buffers became full" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } pg_wal_14: name: pg_wal desc: PostgreSQL WAL statistics since v14 query: SELECT wal_records AS records, wal_fpi AS fpi, wal_bytes AS bytes, wal_buffers_full AS buffers_full, wal_write AS write, wal_sync AS sync, wal_write_time AS write_time, wal_sync_time AS sync_time, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_wal; ttl: 10 tags: [ cluster ] min_version: 140000 max_version: 180000 metrics: - records: { usage: COUNTER ,description: "Total number of WAL records generated" } - fpi: { usage: COUNTER ,description: "Total number of WAL full page images generated" } - bytes: { usage: COUNTER ,description: "Total amount of WAL generated in bytes" } - buffers_full: { usage: COUNTER ,description: "Number of times WAL data was written to disk because WAL buffers became full" } - write: { usage: COUNTER ,description: "Number of times WAL buffers were written out to disk via XLogWrite request." 
} - sync: { usage: COUNTER ,description: "Number of times WAL files were synced to disk via issue_xlog_fsync request" } - write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time spent writing WAL buffers to disk via XLogWrite request in seconds" } - sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time spent syncing WAL files to disk via issue_xlog_fsync request, in seconds" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } ================================================ FILE: config/0410-pg_activity.yml ================================================ #==============================================================# # 0410 pg_activity #==============================================================# pg_activity: name: pg_activity desc: PostgreSQL backend activity group by database and state query: |- SELECT datname, state, coalesce(count, 0) AS count, coalesce(max_duration, 0) AS max_duration, coalesce(max_tx_duration, 0) AS max_tx_duration, coalesce(max_conn_duration, 0) AS max_conn_duration FROM (SELECT d.datname, a.state FROM pg_database d, unnest(ARRAY ['active','idle','idle in transaction','idle in transaction (aborted)','fastpath function call','disabled']) a(state) WHERE d.datallowconn AND NOT d.datistemplate) base LEFT JOIN (SELECT datname, state, count(*) AS count, max(extract(epoch from now() - state_change)) AS max_duration, max(extract(epoch from now() - xact_start)) AS max_tx_duration, max(extract(epoch from now() - backend_start)) AS max_conn_duration FROM pg_stat_activity WHERE pid <> pg_backend_pid() GROUP BY 1,2) data USING (datname,state); ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" } - state: { usage: LABEL ,description: "Current overall state of this backend." 
} - count: { usage: GAUGE ,description: "Count of connection among (datname,state)" } - max_duration: { usage: GAUGE ,description: "Max duration since last state change among (datname, state)" } - max_tx_duration: { usage: GAUGE ,description: "Max transaction duration since state change among (datname, state)" } - max_conn_duration: { usage: GAUGE ,description: "Max backend session duration since state change among (datname, state)" } ================================================ FILE: config/0420-pg_wait.yml ================================================ #==============================================================# # 0420 pg_wait #==============================================================# pg_wait: name: pg_wait desc: PostgreSQL backend client count group by wait event type since 9.6 query: | SELECT coalesce(datname, '_system') AS datname, coalesce(wait_event_type, 'Running') AS event, count(*) AS count FROM pg_stat_activity GROUP BY 1, 2; ttl: 10 min_version: 090600 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database, _system for global process" } - event: { usage: LABEL ,description: "Wait event type" } - count: { usage: GAUGE ,description: "Count of WaitEvent on target database" } ================================================ FILE: config/0430-pg_backend.yml ================================================ #==============================================================# # 0430 pg_backend #==============================================================# pg_backend: name: pg_backend desc: PostgreSQL backend process count group by backend type since 10 query: SELECT backend_type AS "type", count(*) AS count FROM pg_stat_activity GROUP BY backend_type; ttl: 10 min_version: 100000 tags: [ cluster ] metrics: - type: { usage: LABEL ,description: "Database backend process type" } - count: { usage: GAUGE ,description: "Database backend process count by backend_type" } ================================================ 
FILE: config/0440-pg_xact.yml ================================================ #==============================================================# # 0440 pg_xact #==============================================================# pg_xact: name: pg_xact desc: PostgreSQL transaction identifier metrics query: WITH snap(v) AS (SELECT txid_current_snapshot()), xset(v) AS (SELECT txid_snapshot_xip(v) FROM snap), xnum(v) AS (SELECT count(*) from xset), xmin(v) AS (SELECT txid_snapshot_xmin(v) FROM snap), xmax(v) AS (SELECT txid_snapshot_xmax(v) FROM snap) SELECT xmin.v AS xmin, xmax.v AS xmax, xnum.v AS xnum FROM xmin, xmax, xnum; ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - xmin: { usage: COUNTER ,description: "Earliest txid that is still active" } - xmax: { usage: COUNTER ,description: "First as-yet-unassigned txid. txid >= this are invisible." } - xnum: { usage: GAUGE ,description: "Current active transaction count" } ================================================ FILE: config/0450-pg_lock.yml ================================================ #==============================================================# # 0450 pg_lock #==============================================================# pg_lock: name: pg_lock desc: PostgreSQL lock distribution by mode and database query: | SELECT datname, mode, coalesce(count, 0) AS count FROM (SELECT d.oid AS database, d.datname, l.mode FROM pg_database d, unnest(ARRAY ['AccessShareLock','RowShareLock','RowExclusiveLock','ShareUpdateExclusiveLock', 'ShareLock','ShareRowExclusiveLock','ExclusiveLock','AccessExclusiveLock']) l(mode) WHERE d.datallowconn AND NOT d.datistemplate) base LEFT JOIN (SELECT database, mode, count(*) AS count FROM pg_locks WHERE database IS NOT NULL GROUP BY 1, 2) cnt USING (database, mode); ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" } - mode: { usage: LABEL ,description: "Name of the lock mode held or 
desired by this process" } - count: { usage: GAUGE ,description: "Number of locks of corresponding mode and database" }

================================================
FILE: config/0460-pg_query.yml
================================================
#==============================================================#
# 0460 pg_query
#==============================================================#
# Per-statement metrics aggregated by (database, queryid).
# Conventions shared by all three version branches:
#   - pg_stat_statements(false): showtext=false, skip fetching query text (cheaper)
#   - userid != 10: exclude statements run by the bootstrap superuser (OID 10, usually 'postgres')
#   - calls > 4: drop rarely-executed statements to bound cardinality
#   - top 128 statements by calls are kept (ORDER BY 3 DESC LIMIT 128)
#   - *_time columns are reported by pg_stat_statements in milliseconds; scale: 1e-3 converts to seconds
pg_query_17:
  name: pg_query
  desc: PostgreSQL Query metrics, require pg_stat_statements installed, 17+
  # PG17 renamed blk_read_time/blk_write_time to shared_blk_read_time/shared_blk_write_time,
  # hence the separate branch for 17+.
  query: |-
    SELECT datname, queryid AS query, sum(calls) AS calls, sum(rows) AS rows,
    sum(total_exec_time) AS exec_time, sum(shared_blk_read_time) + sum(shared_blk_write_time) AS io_time, sum(wal_bytes) AS wal_bytes
    ,sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read, sum(shared_blks_dirtied) AS sblk_dirtied, sum(shared_blks_written) AS sblk_written
    FROM pg_stat_statements(false) s JOIN pg_database d ON s.dbid = d.oid
    WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128;
  ttl: 10
  timeout: 2
  min_version: 170000
  tags: [ cluster, "extension:pg_stat_statements" ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of database" }
    - query: { usage: LABEL ,description: "QueryID generated from internal hash code, computed from the statement's parse tree" }
    - calls: { usage: COUNTER ,description: "Number of times the statement was executed" }
    - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" }
    - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" }
    - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds" }
    - wal_bytes: { usage: COUNTER ,description: "Total amount of WAL bytes generated by the statement" }
    - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" }
    - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" }
    - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement" }
    - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" }

pg_query_13:
  name: pg_query
  desc: PostgreSQL Query metrics, require pg_stat_statements installed, 13 - 16
  # 13-16 branch: total_exec_time / blk_read_time / blk_write_time column names;
  # wal_bytes is available (added in PG13). max_version is exclusive (< 170000).
  query: |-
    SELECT datname, queryid AS query, sum(calls) AS calls, sum(rows) AS rows,
    sum(total_exec_time) AS exec_time, sum(blk_read_time) + sum(blk_write_time) AS io_time, sum(wal_bytes) AS wal_bytes
    ,sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read, sum(shared_blks_dirtied) AS sblk_dirtied, sum(shared_blks_written) AS sblk_written
    FROM pg_stat_statements(false) s JOIN pg_database d ON s.dbid = d.oid
    WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128;
  ttl: 10
  timeout: 2
  min_version: 130000
  max_version: 170000
  tags: [ cluster, "extension:pg_stat_statements" ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of database" }
    - query: { usage: LABEL ,description: "QueryID generated from internal hash code, computed from the statement's parse tree" }
    - calls: { usage: COUNTER ,description: "Number of times the statement was executed" }
    - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" }
    - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" }
    - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds" }
    - wal_bytes: { usage: COUNTER ,description: "Total amount of WAL bytes generated by the statement" }
    - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" }
    - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" }
    - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement" }
    - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" }

pg_query_10:
  name: pg_query
  desc: PostgreSQL query statement metrics, require pg_stat_statements installed, 9.4 ~ 12
  # Pre-13 branch: the column is total_time (renamed total_exec_time in 13),
  # and wal_bytes does not exist yet, so it is absent from query and metrics.
  query: |-
    SELECT datname, queryid AS query, sum(calls) AS calls, sum(rows) AS rows,
    sum(total_time) AS exec_time, sum(blk_read_time) + sum(blk_write_time) AS io_time,
    sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read, sum(shared_blks_dirtied) AS sblk_dirtied, sum(shared_blks_written) AS sblk_written
    FROM pg_stat_statements(false) s JOIN pg_database d ON s.dbid = d.oid
    WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128;
  ttl: 10
  timeout: 2
  # NOTE(review): was 090400 — a leading-zero integer is octal in YAML 1.1 (and
  # `090400` is not even valid octal, so loaders disagree on its type). Written
  # unpadded, matching pg_db_confl_15's `min_version: 90100`; value unchanged (9.4).
  min_version: 90400
  max_version: 130000
  tags: [ cluster, "extension:pg_stat_statements" ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of database" }
    - query: { usage: LABEL ,description: "QueryID generated from internal hash code, computed from the statement's parse tree" }
    - calls: { usage: COUNTER ,description: "Number of times the statement was executed" }
    - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" }
    - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" }
    - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds" }
    - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" }
    - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" }
    - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement" }
    - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" }

================================================
FILE: config/0510-pg_vacuuming.yml
================================================
#==============================================================#
# 0510 pg_vacuuming
#==============================================================#
pg_vacuuming_18:
  name: pg_vacuuming
  desc: PostgreSQL vacuum progress 18+
  # progress is the fraction of heap blocks processed: blks_scanned/total while
  # 'scanning heap', blks_vacuumed/total while 'vacuuming heap', NULL otherwise.
  # PG18 adds dead_tuple_bytes / delay_time to pg_stat_progress_vacuum.
  query: |-
    SELECT datname, pid, relid::RegClass AS relname,
    CASE phase WHEN 'scanning heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_scanned / heap_blks_total ELSE 0.0 END)
    WHEN 'vacuuming heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_vacuumed / heap_blks_total ELSE 0 END)
    ELSE NULL END AS progress, indexes_total, indexes_processed, dead_tuple_bytes, delay_time
    FROM pg_stat_progress_vacuum;
  ttl: 10
  min_version: 180000
  tags: [ cluster, primary ]
  metrics:
    - datname: { usage: LABEL ,description: "database name" }
    - pid: { usage: LABEL ,description: "process id of vacuum worker" }
    - relname: { usage: LABEL ,description: "relation name of vacuuming table" }
    - progress: { usage: GAUGE ,description: "vacuum progress ratio (0-1) based on heap blocks scanned/vacuumed" }
    - indexes_total: { usage: GAUGE ,description: "total number of indexes that will be vacuumed or cleaned up" }
    - indexes_processed: { usage: GAUGE ,description: "number of indexes that have been vacuumed or cleaned up" }
    - dead_tuple_bytes: { usage: GAUGE ,description: "total size of dead tuples collected since the beginning of vacuum in bytes" }
    # delay_time is reported in milliseconds; scale 1e-3 converts to seconds
    - delay_time: { usage: COUNTER ,scale: 1e-3 ,description: "total time spent sleeping due to cost-based delay in seconds" }

pg_vacuuming_17:
  name: pg_vacuuming
  desc: PostgreSQL vacuum progress 17 (with index progress tracking)
  # Same as the 18+ branch minus delay_time (not available before PG18).
  # max_version is exclusive (< 180000).
  query: |-
    SELECT datname, pid, relid::RegClass AS relname,
    CASE phase WHEN 'scanning heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_scanned / heap_blks_total ELSE 0.0 END)
    WHEN 'vacuuming heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_vacuumed / heap_blks_total ELSE 0 END)
    ELSE NULL END AS progress, indexes_total, indexes_processed, dead_tuple_bytes
    FROM pg_stat_progress_vacuum;
  ttl: 10
  min_version: 170000
  max_version: 180000
  tags: [ cluster, primary ]
  metrics:
    - datname: { usage: LABEL ,description: "database name" }
    - pid: { usage: LABEL ,description: "process id of vacuum worker" }
    - relname: { usage: LABEL ,description: "relation name of vacuuming table" }
    - progress: { usage: GAUGE ,description: "vacuum progress ratio (0-1) based on heap blocks scanned/vacuumed" }
    - indexes_total: { usage: GAUGE ,description: "total number of indexes that will be vacuumed or cleaned up" }
    - indexes_processed: { usage: GAUGE ,description: "number of indexes that have been vacuumed or cleaned up" }
    - dead_tuple_bytes: { usage: GAUGE ,description: "total size of dead tuples collected since the beginning of vacuum in bytes" }

pg_vacuuming_12:
  name: pg_vacuuming
  desc: PostgreSQL vacuum progress 12-16
  # Oldest branch: only the heap-block progress ratio is collected.
  query: |-
    SELECT datname, pid, relid::RegClass AS relname,
    CASE phase WHEN 'scanning heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_scanned / heap_blks_total ELSE 0.0 END)
    WHEN 'vacuuming heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_vacuumed / heap_blks_total ELSE 0 END)
    ELSE NULL END AS progress
    FROM pg_stat_progress_vacuum;
  ttl: 10
  min_version: 120000
  max_version: 170000
  tags: [ cluster, primary ]
  metrics:
    - datname: { usage: LABEL ,description: "database name" }
    - pid: { usage: LABEL ,description: "process id of vacuum worker" }
    - relname: { usage: LABEL ,description: "relation name of vacuuming table" }
    - progress: { usage: GAUGE ,description: "vacuum progress ratio (0-1) based on heap blocks scanned/vacuumed" }

================================================
FILE: config/0520-pg_indexing.yml
================================================
#==============================================================#
# 0520 pg_indexing
#==============================================================#
pg_indexing:
  name: pg_indexing
  desc: PostgreSQL index creating progress (v12+)
  # Each ratio guards against division by zero, yielding NULL when the
  # corresponding *_total counter is zero.
  query: |-
    SELECT datname, pid, relid::RegClass AS relname,
    (CASE WHEN blocks_total > 0 THEN 1.0 * blocks_done / blocks_total ELSE NULL END) AS blocks,
    (CASE WHEN tuples_total > 0 THEN 1.0 * tuples_done / tuples_total ELSE NULL END) AS tuples,
    (CASE WHEN partitions_total > 0 THEN 1.0 * partitions_done / partitions_total ELSE NULL END) AS partitions,
    (CASE WHEN lockers_total > 0 THEN 1.0 * lockers_done / lockers_total ELSE NULL END) AS lockers
    FROM pg_stat_progress_create_index pspci;
  ttl: 10
  min_version: 120000
  tags: [ cluster, primary ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database" }
    - pid: { usage: LABEL ,description: "Process id of indexing table" }
    - relname: { usage: LABEL ,description: "Relation name of indexed table" }
    - blocks: { usage: GAUGE ,description: "Percent of blocks been proceeded" }
    - tuples: { usage: GAUGE ,description: "Percent of tuples been proceeded" }
    - partitions: { usage: GAUGE ,description: "Percent of partitions been proceeded" }
    - lockers: { usage: GAUGE ,description: "Percent of lockers been proceeded" }

================================================
FILE: config/0530-pg_clustering.yml
================================================
#==============================================================#
# 0530 pg_clustering
#==============================================================#
pg_clustering:
  name: pg_clustering
  desc: PostgreSQL cluster or vacuum full progress (v12+)
  # Uses the raw pg_stat_get_progress_info('cluster') params rather than the
  # pg_stat_progress_cluster view; per PG docs param4=heap_tuples_scanned,
  # param6=heap_blks_total, param7=heap_blks_scanned — TODO confirm mapping
  # still holds on newer major versions.
  query: SELECT datname, pid, relid::RegClass AS relname, param4 AS tup_scan, CASE WHEN param6 > 0 THEN 1.0 * param7 / param6 ELSE 0 END AS progress FROM pg_stat_get_progress_info('cluster') s LEFT JOIN pg_database d ON s.datid = d.oid;
  ttl: 10
  min_version: 120000
  tags: [ cluster, primary ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of database been clustering" }
    - pid: { usage: LABEL ,description: "Process id of indexing table" }
    - relname: { usage: LABEL ,description: "Relation name of indexed table" }
    - tup_scan: { usage: GAUGE ,description: "How much tuple been scanned" }
    - progress: { usage: GAUGE ,description: "Progress of heap been processed" }

================================================
FILE: config/0540-pg_backup.yml
================================================
#==============================================================#
# 0540 pg_backup
#==============================================================#
pg_backup:
  name: pg_backup
  desc: PostgreSQL basebackup progress since 13
  # Raw progress params; per pg_stat_progress_basebackup docs param1=phase,
  # param2=backup_total (-1 when estimate disabled, mapped to NULL here),
  # param3=backup_streamed. Command name lookup is case-insensitive.
  query: SELECT pid, param1 AS phase, CASE param2 WHEN -1::integer THEN NULL::bigint ELSE param2 END AS total_bytes, param3 AS sent_bytes FROM pg_stat_get_progress_info('BASEBACKUP');
  ttl: 10
  min_version: 130000
  tags: [ cluster ]
  metrics:
    - pid: { usage: LABEL ,description: "process id of basebackup sender" }
    - phase: { usage: GAUGE ,description: "Phase encoded in 0~5 initial, wait checkpoint, estimate, streaming, waiting archive, transfer archive" }
    - total_bytes: { usage: GAUGE ,description: "Total amount of data that will be streamed" }
    - sent_bytes: { usage: GAUGE ,description: "Amount of data streamed" }

================================================
FILE: config/0610-pg_db.yml
================================================
#==============================================================#
# 0610 pg_db
#==============================================================#
pg_db_18: name: pg_db desc: PostgreSQL database stats from pg_stat_database v18 query: |- SELECT d.datname, datid,age(datfrozenxid) AS age, datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid, numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified, conflicts,temp_files,temp_bytes,deadlocks,coalesce(checksum_failures, -1) AS cks_fails, checksum_last_failure AS 
cks_fail_time,blk_read_time,blk_write_time, session_time,active_time,idle_in_transaction_time AS ixact_time,sessions,sessions_abandoned,sessions_fatal,sessions_killed,parallel_workers_to_launch,parallel_workers_launched, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid; ttl: 10 min_version: 180000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database" } - datid: { usage: GAUGE ,description: "OID of the database" } - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" } - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" } - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." } - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." } - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" } - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" } - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" } - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" } - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" } - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" } - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" } - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in 
this database" } - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" } - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" } - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" } - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" } - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" } - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database" } - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." } - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database" } - cks_fails: { usage: COUNTER ,description: "Number of data page checksum failures detected in this database, -1 for not enabled" } - cks_fail_time: { usage: GAUGE ,description: "Time at which the last data page checksum failure was detected in this database" } - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" } - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" } - session_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent by database sessions in this database, in seconds" } - active_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent executing SQL statements in this database, in seconds" } - ixact_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent idling while in a transaction in this database, in seconds" } - sessions: { usage: COUNTER ,description: "Total number of sessions established to this database" } - sessions_abandoned: { usage: COUNTER ,description: "Number of database sessions to 
this database that were terminated because connection to the client was lost" } - sessions_fatal: { usage: COUNTER ,description: "Number of database sessions to this database that were terminated by fatal errors" } - sessions_killed: { usage: COUNTER ,description: "Number of database sessions to this database that were terminated by operator intervention" } - parallel_workers_to_launch: { usage: COUNTER ,description: "Number of parallel workers planned to be launched by queries on this database" } - parallel_workers_launched: { usage: COUNTER ,description: "Number of parallel workers launched by queries on this database" } - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" } pg_db_14: name: pg_db desc: PostgreSQL database stats from pg_stat_database v14 (with 7 new time & session metrics) query: |- SELECT d.datname, datid,age(datfrozenxid) AS age, datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid, numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified, conflicts,temp_files,temp_bytes,deadlocks,coalesce(checksum_failures, -1) AS cks_fails, checksum_last_failure AS cks_fail_time,blk_read_time,blk_write_time, session_time,active_time,idle_in_transaction_time AS ixact_time,sessions,sessions_abandoned,sessions_fatal,sessions_killed,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid; ttl: 10 min_version: 140000 max_version: 180000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database" } - datid: { usage: GAUGE ,description: "OID of the database" } - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" } - is_template: { usage: GAUGE 
,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" } - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." } - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." } - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" } - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" } - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" } - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" } - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" } - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" } - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" } - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" } - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" } - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" } - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" } - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" } - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" } - temp_files: { usage: COUNTER ,description: "Number of temporary files created by 
queries in this database" } - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." } - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database" } - cks_fails: { usage: COUNTER ,description: "Number of data page checksum failures detected in this database, -1 for not enabled" } - cks_fail_time: { usage: GAUGE ,description: "Time at which the last data page checksum failure was detected in this database" } - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" } - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" } - session_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent by database sessions in this database, in seconds" } - active_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent executing SQL statements in this database, in seconds" } - ixact_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent idling while in a transaction in this database, in seconds" } - sessions: { usage: COUNTER ,description: "Total number of sessions established to this database" } - sessions_abandoned: { usage: COUNTER ,description: "Number of database sessions to this database that were terminated because connection to the client was lost" } - sessions_fatal: { usage: COUNTER ,description: "Number of database sessions to this database that were terminated by fatal errors" } - sessions_killed: { usage: COUNTER ,description: "Number of database sessions to this database that were terminated by operator intervention" } - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" } pg_db_12: name: pg_db desc: PostgreSQL database stats from pg_stat_database v12 v13 (with 2 new checksum metrics) query: |- SELECT d.datname, datid,age(datfrozenxid) AS age, 
datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid, numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified, conflicts,temp_files,temp_bytes,deadlocks,coalesce(checksum_failures, -1) AS cks_fails, checksum_last_failure AS cks_fail_time,blk_read_time,blk_write_time, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid; ttl: 10 min_version: 120000 max_version: 140000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database" } - datid: { usage: GAUGE ,description: "OID of the database" } - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" } - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" } - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." } - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." 
} - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" } - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" } - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" } - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" } - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" } - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" } - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" } - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" } - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" } - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" } - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" } - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" } - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" } - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database" } - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." 
} - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database" } - cks_fails: { usage: COUNTER ,description: "Number of data page checksum failures detected in this database, -1 for not enabled" } - cks_fail_time: { usage: GAUGE ,description: "Time at which the last data page checksum failure was detected in this database" } - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" } - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" } - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" } pg_db_10: name: pg_db desc: PostgreSQL database stats from pg_stat_database v10 v11 (actually since 9.2) query: |- SELECT d.datname, datid,age(datfrozenxid) AS age, datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid, numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total, blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified, conflicts,temp_files,temp_bytes,deadlocks,blk_read_time,blk_write_time, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid; ttl: 10 min_version: 090200 max_version: 120000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database" } - datid: { usage: GAUGE ,description: "OID of the database" } - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" } - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" } - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." 
} - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." } - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" } - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" } - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" } - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" } - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" } - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" } - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" } - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" } - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" } - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" } - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" } - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" } - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" } - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database" } - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." 
} - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database" } - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" } - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" } - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" } ================================================ FILE: config/0620-pg_db_confl.yml ================================================ #==============================================================# # 0620 pg_db_confl #==============================================================# # https://pgpedia.info/p/pg_stat_database_conflicts.html pg_db_confl_16: name: pg_db_confl desc: PostgreSQL database conflicts metrics for PG16+ query: SELECT * FROM pg_stat_database_conflicts; ttl: 10 min_version: 160000 tags: [ cluster, replica ] metrics: - datid: { usage: DISCARD } - datname: { usage: LABEL ,description: "Name of this database" } - confl_tablespace: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to dropped tablespaces" } - confl_lock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to lock timeouts" } - confl_snapshot: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to old snapshots" } - confl_bufferpin: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to pinned buffers" } - confl_deadlock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to deadlocks" } - confl_active_logicalslot: { usage: COUNTER ,description: "Number of uses of logical slots in this database that have been canceled due to old snapshots or too low a wal_level on the primary" } pg_db_confl_15: name: 
pg_db_confl desc: PostgreSQL database conflicts metrics for pg 9.1 - 15 query: SELECT * FROM pg_stat_database_conflicts; ttl: 10 min_version: 90100 max_version: 160000 tags: [ cluster, replica ] metrics: - datid: { usage: DISCARD } - datname: { usage: LABEL ,description: "Name of this database" } - confl_tablespace: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to dropped tablespaces" } - confl_lock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to lock timeouts" } - confl_snapshot: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to old snapshots" } - confl_bufferpin: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to pinned buffers" } - confl_deadlock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to deadlocks" } ================================================ FILE: config/0640-pg_pubrel.yml ================================================ #==============================================================# # 0640 pg_pubrel #==============================================================# pg_pubrel: name: pg_pubrel desc: PostgreSQL publication and relation count query: SELECT CURRENT_CATALOG AS datname, pubname, count(*) AS count FROM pg_publication p, LATERAL pg_get_publication_tables(pubname) GROUP BY pubname; ttl: 10 min_version: 100000 metrics: - datname: { usage: LABEL ,description: "Name of the database which publication belonged" } - pubname: { usage: LABEL ,description: "Name of the publication" } - count: { usage: GAUGE ,description: "Count of relation in the publication" } ================================================ FILE: config/0650-pg_subrel.yml ================================================ #==============================================================# # 0650 pg_subrel 
#==============================================================#
pg_subrel:
  name: pg_subrel
  desc: PostgreSQL subscripted relation group by state
  query: SELECT CURRENT_CATALOG AS datname, subname, srsubstate::TEXT AS state, count(*) AS count FROM pg_subscription_rel sr LEFT JOIN pg_stat_subscription ss ON sr.srsubid = ss.subid GROUP BY 2, 3;
  ttl: 10
  min_version: 100000
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database which publication belonged" }
    - subname: { usage: LABEL ,description: "Name of the subscription" }
    - state: { usage: LABEL ,description: "State of table in subscription, i=initialize, d=data copy, s=sync, r=ready" }
    - count: { usage: GAUGE ,description: "Count of relation in this subscription and corresponding state" }


================================================
FILE: config/0700-pg_table.yml
================================================
#==============================================================#
# 0700 pg_table
#==============================================================#
pg_table_18:
  name: pg_table
  desc: PostgreSQL table metrics v18+
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind,
    c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols,
    psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read,
    psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_tup_newpage_upd,psut.n_live_tup,psut.n_dead_tup,
    psut.n_mod_since_analyze,psut.n_ins_since_vacuum,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,psut.last_seq_scan,
    psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count,
    psut.total_vacuum_time AS vacuum_time,psut.total_autovacuum_time AS autovacuum_time,psut.total_analyze_time AS analyze_time,psut.total_autoanalyze_time AS autoanalyze_time,
    psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit
    FROM pg_class c
    JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
    LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid
    LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid
    WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar'
    AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor')
    AND c.relkind = ANY (ARRAY ['r','m','t','p']) ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 180000
  metrics:
    - datname: { usage: LABEL ,description: "Database name of this table" }
    - relname: { usage: LABEL ,description: "Relation name of this table" }
    - relid: { usage: GAUGE ,description: "Relation oid of this table" }
    - kind: { usage: GAUGE ,description: "Relation kind r/table/114,m/mview/109,t/toast/116,p/partitioned/112" }
    - pages: { usage: GAUGE ,description: "Size of the on-disk representation of this table in pages" }
    - tuples: { usage: GAUGE ,description: "Estimated number of rows in this table" }
    - frozenxid: { usage: GAUGE ,description: "All txid before this have been frozen on this table" }
    - age: { usage: GAUGE ,description: "Age of this table in vacuum cycles" }
    - ncols: { usage: GAUGE ,description: "Number of columns in the table" }
    - seq_scan: { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" }
    - seq_tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" }
    - idx_scan: { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" }
    - idx_tup_fetch: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" }
    - tbl_scan: { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" }
    - tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" }
    - n_tup_ins: { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" }
    - n_tup_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del: { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" }
    - n_tup_mod: { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_tup_newpage_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated where the successor version goes onto a new heap page" }
    - n_live_tup: { usage: GAUGE ,description: "Estimated number of live rows" }
    - n_dead_tup: { usage: GAUGE ,description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE ,description: "Estimated number of rows modified since this table was last analyzed" }
    - n_ins_since_vacuum: { usage: GAUGE ,description: "Estimated number of rows inserted since this table was last vacuumed" }
    - last_vacuum: { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum: { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze: { usage: DISCARD ,description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze: { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - last_seq_scan: { usage: DISCARD ,description: "The timestamp of the last seq scan on this table" }
    - vacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - vacuum_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Total time this table has been manually vacuumed, in seconds" }
    - autovacuum_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Total time this table has been vacuumed by the autovacuum daemon, in seconds" }
    - analyze_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Total time this table has been manually analyzed, in seconds" }
    - autoanalyze_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Total time this table has been analyzed by the autovacuum daemon, in seconds" }
    - heap_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" }
    - heap_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" }
    - idx_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" }

pg_table_16:
  name: pg_table
  desc: PostgreSQL table metrics 16-17
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind,
    c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols,
    psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read,
    psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_tup_newpage_upd,psut.n_live_tup,psut.n_dead_tup,
    psut.n_mod_since_analyze,psut.n_ins_since_vacuum,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,psut.last_seq_scan,
    psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count,
    psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit
    FROM pg_class c
    JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
    LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid
    LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid
    WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar'
    AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor')
    AND c.relkind = ANY (ARRAY ['r','m','t','p']) ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 160000
  max_version: 180000
  metrics:
    - datname: { usage: LABEL ,description: "Database name of this table" }
    - relname: { usage: LABEL ,description: "Relation name of this table" }
    - relid: { usage: GAUGE ,description: "Relation oid of this table" }
    - kind: { usage: GAUGE ,description: "Relation kind r/table/114,m/mview/109,t/toast/116,p/partitioned/112" }
    - pages: { usage: GAUGE ,description: "Size of the on-disk representation of this table in pages" }
    - tuples: { usage: GAUGE ,description: "Estimated number of rows in this table" }
    - frozenxid: { usage: GAUGE ,description: "All txid before this have been frozen on this table" }
    - age: { usage: GAUGE ,description: "Age of this table in vacuum cycles" }
    - ncols: { usage: GAUGE ,description: "Number of columns in the table" }
    - seq_scan: { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" }
    - seq_tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" }
    - idx_scan: { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" }
    - idx_tup_fetch: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" }
    - tbl_scan: { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" }
    - tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" }
    - n_tup_ins: { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" }
    - n_tup_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del: { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" }
    - n_tup_mod: { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_tup_newpage_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated where the successor version goes onto a new heap page" }
    - n_live_tup: { usage: GAUGE ,description: "Estimated number of live rows" }
    - n_dead_tup: { usage: GAUGE ,description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE ,description: "Estimated number of rows modified since this table was last analyzed" }
    - n_ins_since_vacuum: { usage: GAUGE ,description: "Estimated number of rows inserted since this table was last vacuumed" }
    - last_vacuum: { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum: { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze: { usage: DISCARD ,description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze: { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - last_seq_scan: { usage: DISCARD ,description: "The timestamp of the last seq scan on this table" }
    - vacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - heap_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" }
    - heap_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" }
    - idx_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" }

pg_table_13:
  name: pg_table
  desc: PostgreSQL table metrics 13-15 (with n_ins_since_vacuum)
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind,
    c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols,
    psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read,
    psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_live_tup,psut.n_dead_tup,
    psut.n_mod_since_analyze,psut.n_ins_since_vacuum,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,
    psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count,
    psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit
    FROM pg_class c
    JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
    LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid
    LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid
    WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar'
    AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor')
    AND c.relkind = ANY (ARRAY ['r','m','t','p']) ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 130000
  max_version: 160000
  metrics:
    - datname: { usage: LABEL ,description: "Database name of this table" }
    - relname: { usage: LABEL ,description: "Relation name of this table" }
    - relid: { usage: GAUGE ,description: "Relation oid of this table" }
    - kind: { usage: GAUGE ,description: "Relation kind r/table/114,m/mview/109,t/toast/116,p/partitioned/112" }
    - pages: { usage: GAUGE ,description: "Size of the on-disk representation of this table in pages" }
    - tuples: { usage: GAUGE ,description: "Estimated number of rows in this table" }
    - frozenxid: { usage: GAUGE ,description: "All txid before this have been frozen on this table" }
    - age: { usage: GAUGE ,description: "Age of this table in vacuum cycles" }
    - ncols: { usage: GAUGE ,description: "Number of columns in the table" }
    - seq_scan: { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" }
    - seq_tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" }
    - idx_scan: { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" }
    - idx_tup_fetch: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" }
    - tbl_scan: { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" }
    - tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" }
    - n_tup_ins: { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" }
    - n_tup_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del: { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" }
    - n_tup_mod: { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_live_tup: { usage: GAUGE ,description: "Estimated number of live rows" }
    - n_dead_tup: { usage: GAUGE ,description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE ,description: "Estimated number of rows modified since this table was last analyzed" }
    - n_ins_since_vacuum: { usage: GAUGE ,description: "Estimated number of rows inserted since this table was last vacuumed" }
    - last_vacuum: { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum: { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze: { usage: DISCARD ,description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze: { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - vacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - heap_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" }
    - heap_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" }
    - idx_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" }

pg_table_10:
  name: pg_table
  desc: PostgreSQL table metrics 9.4-12
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind,
    c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols,
    psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read,
    psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_live_tup,psut.n_dead_tup,
    psut.n_mod_since_analyze,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,
    psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count,
    psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit
    FROM pg_class c
    JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
    LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid
    LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid
    WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar'
    AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor')
    AND c.relkind = ANY (ARRAY ['r','m','t','p']) ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 090400
  max_version: 130000
  metrics:
    - datname: { usage: LABEL ,description: "Database name of this table" }
    - relname: { usage: LABEL ,description: "Relation name of this table" }
    - relid: { usage: GAUGE ,description: "Relation oid of this table" }
    - kind: { usage: GAUGE ,description: "Relation kind r/table/114,m/mview/109,t/toast/116,p/partitioned/112" }
    - pages: { usage: GAUGE ,description: "Size of the on-disk representation of this table in pages" }
    - tuples: { usage: GAUGE ,description: "Estimated number of rows in this table" }
    - frozenxid: { usage: GAUGE ,description: "All txid before this have been frozen on this table" }
    - age: { usage: GAUGE ,description: "Age of this table in vacuum cycles" }
    - ncols: { usage: GAUGE ,description: "Number of columns in the table" }
    - seq_scan: { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" }
    - seq_tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" }
    - idx_scan: { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" }
    - idx_tup_fetch: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" }
    - tbl_scan: { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" }
    - tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" }
    - n_tup_ins: { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" }
    - n_tup_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del: { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" }
    - n_tup_mod: { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_live_tup: { usage: GAUGE ,description: "Estimated number of live rows" }
    - n_dead_tup: { usage: GAUGE ,description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE ,description: "Estimated number of rows modified since this table was last analyzed" }
    - last_vacuum: { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum: { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze: { usage: DISCARD ,description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze: { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - vacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - heap_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" }
    - heap_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" }
    - idx_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" }


================================================
FILE: config/0710-pg_index.yml
================================================
#==============================================================#
# 0710 pg_index
#==============================================================#
pg_index:
  name: pg_index
  desc: PostgreSQL index metrics
  query: |-
    SELECT CURRENT_CATALOG AS datname, schemaname || '.' || indexrelname AS idxname, schemaname || '.' || relname AS relname ,indexrelid AS relid,
    relpages, reltuples, idx_scan, idx_tup_read, idx_tup_fetch, idx_blks_read, idx_blks_hit
    FROM pg_stat_user_indexes psui,
    LATERAL (SELECT idx_blks_read, idx_blks_hit FROM pg_statio_user_indexes psio WHERE psio.indexrelid = psui.indexrelid LIMIT 1) p2,
    LATERAL (SELECT relpages,reltuples FROM pg_class c WHERE c.oid = psui.indexrelid LIMIT 1) p3
    WHERE schemaname !~ '^pg_' AND schemaname !~ '^_' AND schemaname !~ '^timescaledb' AND schemaname !~ '^citus' AND schemaname !~ '^columnar'
    AND schemaname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor')
    ORDER BY idx_tup_read DESC LIMIT 512;
  ttl: 10
  timeout: 1
  min_version: 090400
  metrics:
    - datname: { usage: LABEL ,description: "Database name of this index" }
    - idxname: { usage: LABEL ,description: "Name of this index (full-qualified schema name)" }
    - relname: { usage: LABEL ,description: "Name of the table for this index (full-qualified schema name)" }
    - relid: { usage: LABEL ,description: "Relation oid of this index" }
    - relpages: { usage: GAUGE ,description: "Size of the on-disk representation of this index in pages" }
    - reltuples: { usage: GAUGE ,description: "Estimate relation tuples" }
    - idx_scan: { usage: COUNTER ,description: "Number of index scans initiated on this index" }
    - idx_tup_read: { usage: COUNTER ,description: "Number of index entries returned by scans on this index" }
    - idx_tup_fetch: { usage: COUNTER ,description: "Number of live table rows fetched by simple index scans using this index" }
    - idx_blks_read: { usage: COUNTER ,description: "Number of disk blocks read from this index" }
    - idx_blks_hit: { usage: COUNTER ,description: "Number of buffer hits in this index" }


================================================
FILE: config/0720-pg_func.yml
================================================
#==============================================================#
# 0720 pg_func
#==============================================================#
pg_func:
  desc: PostgreSQL function metrics
  query: SELECT CURRENT_CATALOG AS datname, schemaname || '.' || funcname AS funcname, sum(calls) AS calls, sum(total_time) AS total_time, sum(self_time) AS self_time FROM pg_stat_user_functions GROUP BY 2 ORDER BY 4 DESC LIMIT 128;
  ttl: 10
  min_version: 090400
  metrics:
    - datname: { usage: LABEL ,description: "Name of belonged database" }
    - funcname: { usage: LABEL ,description: "Name of this function, may have multiple override" }
    - calls: { usage: COUNTER ,description: "Number of times this function has been called" }
    - total_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent in this function and all other functions called by it, in seconds" }
    - self_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent in this function itself, not including other functions called by it, in seconds" }


================================================
FILE: config/0730-pg_seq.yml
================================================
#==============================================================#
# 0730 pg_seq
#==============================================================#
pg_seq:
  desc: PostgreSQL sequence metrics
  query: SELECT CURRENT_CATALOG AS datname, schemaname || '.' || sequencename AS seqname, last_value, blks_read, blks_hit FROM pg_sequences s, LATERAL (SELECT relid, blks_read, blks_hit FROM pg_statio_all_sequences sio WHERE s.schemaname = sio.schemaname AND s.sequencename = sio.relname LIMIT 1) d LIMIT 128;
  ttl: 10
  min_version: 100000
  metrics:
    - datname: { usage: LABEL ,description: "Database name of this sequence" }
    - seqname: { usage: LABEL ,description: "Fully schema qualified sequence name" }
    - last_value: { usage: COUNTER ,description: "The last sequence value written to disk" }
    - blks_read: { usage: COUNTER ,description: "Number of disk blocks read from this sequence" }
    - blks_hit: { usage: COUNTER ,description: "Number of buffer hits in this sequence" }


================================================
FILE: config/0740-pg_relkind.yml
================================================
#==============================================================#
# 0740 pg_relkind
#==============================================================#
pg_relkind:
  name: pg_relkind
  desc: Postgres relation count by kind (category, r,i,m,t,...)
  query: SELECT CURRENT_CATALOG AS datname, relkind, count(*) AS count FROM pg_class GROUP BY relkind;
  ttl: 60
  timeout: 1
  min_version: 090400
  metrics:
    - datname: { usage: LABEL ,description: "Name of database" }
    - relkind: { usage: LABEL ,description: "Kind of this relation, could be r,i,S,t,v,m,c,f,p,I" }
    - count: { usage: GAUGE ,description: "Number of relations of corresponding relkind" }


================================================
FILE: config/0750-pg_defpart.yml
================================================
#==============================================================#
# 0750 pg_defpart
#==============================================================#
pg_defpart:
  name: pg_defpart
  desc: PostgreSQL default partition tuples
  query: SELECT CURRENT_CATALOG AS datname, relnamespace::RegNamespace || '.' || relname AS relname, reltuples AS tuples FROM pg_class WHERE relpartbound IS NOT NULL AND pg_catalog.pg_get_expr(relpartbound, oid) = 'DEFAULT' ORDER BY reltuples DESC LIMIT 64;
  ttl: 60
  timeout: 1
  min_version: 110000
  metrics:
    - datname: { usage: LABEL ,description: "Database name of this default partition" }
    - relname: { usage: LABEL ,description: "Schema qualified default partition relation name" }
    - tuples: { usage: GAUGE ,description: "Number of tuples in this default partition" }


================================================
FILE: config/0810-pg_table_size.yml
================================================
#==============================================================#
# 0810 pg_table_size
#==============================================================#
pg_table_size:
  desc: PostgreSQL table size metrics, quite slow
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || rel.relname AS relname,
    pg_total_relation_size(rel.oid) AS bytes, pg_relation_size(rel.oid) AS relsize,
    pg_indexes_size(rel.oid) AS indexsize, pg_total_relation_size(reltoastrelid) AS toastsize
    FROM pg_namespace nsp JOIN pg_class rel ON nsp.oid = rel.relnamespace
    WHERE nspname <> ALL(ARRAY['pg_catalog', 'information_schema']) AND rel.relkind = 'r'
    ORDER BY 3 DESC NULLS LAST LIMIT 256;
  ttl: 300
  timeout: 2
  min_version: 100000
  metrics:
    - datname: { usage: LABEL ,description: "Database name of this table" }
    - relname: { usage: LABEL ,description: "Schema qualified table name" }
    - bytes: { usage: GAUGE ,default: 0 ,description: "Total bytes of this table (including toast, index, toast index)" }
    - relsize: { usage: GAUGE ,default: 0 ,description: "Bytes of this table itself (main, vm, fsm)" }
    - indexsize: { usage: GAUGE ,default: 0 ,description: "Bytes of all related indexes of this table" }
    - toastsize: { usage: GAUGE ,default: 0 ,description: "Bytes of toast tables of this table" }


================================================
FILE: config/0820-pg_table_bloat.yml
================================================ #==============================================================# # 0820 pg_table_bloat #==============================================================# # pg_table_bloat require auxiliary view to work. Disable it or create auxiliary view before use: pg_table_bloat: name: pg_table_bloat desc: PostgreSQL table bloat metrics, require auxiliary view pg_table_bloat to work query: SELECT datname, nspname || '.' || relname AS relname, size, ratio FROM pg_table_bloat ORDER BY size DESC LIMIT 64; ttl: 300 timeout: 2 min_version: 090400 skip: true metrics: - datname: { usage: LABEL ,description: "Database name of this table" } - relname: { usage: LABEL ,description: "Schema qualified name of this table" } - size: { usage: GAUGE ,description: "Total bytes of this table" } - ratio: { usage: GAUGE ,description: "Estimated bloat ratio of this table from 0 to 1" } ================================================ FILE: config/0830-pg_index_bloat.yml ================================================ #==============================================================# # 0830 pg_index_bloat #==============================================================# # pg_index_bloat require auxiliary view to work. Disable it or create auxiliary view before use: pg_index_bloat: name: pg_index_bloat desc: PostgreSQL index bloat metrics (btree only), require pg_index_bloat query: SELECT datname, nspname || '.' 
|| relname AS relname, size, ratio FROM pg_index_bloat ORDER BY size DESC LIMIT 64; ttl: 300 timeout: 2 min_version: 090400 skip: true metrics: - datname: { usage: LABEL ,description: "Database name of this index" } - relname: { usage: LABEL ,description: "Schema qualified index name" } - size: { usage: GAUGE ,description: "Total bytes of this index" } - ratio: { usage: GAUGE ,description: "Estimated bloat ratio of this index, 0~1" } ================================================ FILE: config/0910-pgbouncer_list.yml ================================================ #==============================================================# # 0910 pgbouncer_list #==============================================================# # http://www.pgbouncer.org/usage.html#show-lists pgbouncer_list: name: pgbouncer_list desc: Pgbouncer entry list query: SHOW LISTS; ttl: 10 min_version: 10800 fatal: true tags: [ pgbouncer ] metrics: - list: { usage: LABEL ,description: "Pgbouncer internal list name" } - items: { usage: GAUGE ,description: "Number of corresponding pgbouncer object" } ================================================ FILE: config/0920-pgbouncer_database.yml ================================================ #==============================================================# # 0920 pgbouncer_database #==============================================================# # http://www.pgbouncer.org/usage.html#show-databases pgbouncer_database_124: name: pgbouncer_database desc: Pgbouncer database stats (since 1.24) query: SHOW DATABASES; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - 
pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool_size: { usage: GAUGE ,rename: reserve_pool ,description: "Maximum number of additional connections for this database" } - server_lifetime: { usage: GAUGE ,description: "The maximum lifetime of a server connection for this database" } - pool_mode: { usage: DISCARD } - load_balance_hosts: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - max_client_connections: { usage: GAUGE ,description: "Maximum number of allowed client connections for this pgbouncer instance" } - current_client_connections: { usage: GAUGE ,description: "Current number of client connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_123: name: pgbouncer_database desc: Pgbouncer database stats 1.23 query: SHOW DATABASES; ttl: 10 min_version: 12300 max_version: 12400 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" 
} - server_lifetime: { usage: GAUGE ,description: "The maximum lifetime of a server connection for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_116: name: pgbouncer_database desc: Pgbouncer database stats (1.16-1.22) query: SHOW DATABASES; ttl: 10 min_version: 11600 max_version: 12300 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_108: name: pgbouncer_database desc: Pgbouncer database stats (1.08-1.15) query: SHOW DATABASES; ttl: 10 min_version: 10800 
max_version: 11600 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } ================================================ FILE: config/0930-pgbouncer_stat.yml ================================================ #==============================================================# # 0930 pgbouncer_stat #==============================================================# # http://www.pgbouncer.org/usage.html#show-stats pgbouncer_stat_124: name: pgbouncer_stat desc: Pgbouncer stats per database (since 1.24) query: SHOW STATS; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_server_assignment_count: { usage: COUNTER ,description: "Total times a server was assigned to a client" } - total_received: { 
usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - total_client_parse_count: { usage: COUNTER ,description: "Total number of prepared statements created by clients" } - total_server_parse_count: { usage: COUNTER ,description: "Total number of prepared statements created on a server." } - total_bind_count: { usage: COUNTER ,description: "Total number of prepared statements readied for execution by clients and forwarded to postgres" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_server_assignment_count: { usage: GAUGE ,description: "Average number of times a server as assigned to a client per second in the last stat period." } - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." 
} - avg_client_parse_count: { usage: GAUGE ,description: "Average number of prepared statements created by clients" } - avg_server_parse_count: { usage: GAUGE ,description: "Average number of prepared statements created on a server." } - avg_bind_count: { usage: GAUGE ,description: "Average number of prepared statements readied for execution by clients and forwarded to postgres" } pgbouncer_stat_123: name: pgbouncer_stat desc: Pgbouncer stats per database (1.23) query: SHOW STATS; ttl: 10 min_version: 12300 max_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_server_assignment_count: { usage: COUNTER ,description: "Total times a server was assigned to a client" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_server_assignment_count: { usage: GAUGE ,description: "Average number of times a server as assigned to a client per second in the last stat period." 
} - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } pgbouncer_stat_108: name: pgbouncer_stat desc: Pgbouncer stats per database (1.08 - 1.22) query: SHOW STATS; ttl: 10 min_version: 10800 max_version: 12300 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: 
GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } ================================================ FILE: config/0940-pgbouncer_pool.yml ================================================ #==============================================================# # 0940 pgbouncer_pool #==============================================================# # http://www.pgbouncer.org/usage.html#show-pools pgbouncer_pool_124: name: pgbouncer_pool desc: Pgbouncer pool stats (1.24+) query: SHOW POOLS; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_active_cancel_req: { usage: GAUGE, rename: active_cancel_clients, description: "Client connections that have forwarded query cancellations to the server and are waiting for the server response." } - cl_waiting_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." 
} - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_active_cancel: { usage: GAUGE, rename: active_cancel_servers, description: "Server connections that are currently forwarding a cancel request" } - sv_being_canceled: { usage: GAUGE, rename: cancel_servers, description: "cancel requests have completed that were sent to cancel a query on this server" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } - load_balance_hosts: { usage: LABEL, description: "The load_balance_hosts in use" } pgbouncer_pool_118: name: pgbouncer_pool desc: Pgbouncer pool stats (1.18-1.23) query: SHOW POOLS; ttl: 10 min_version: 11800 max_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_active_cancel_req: { usage: GAUGE, rename: active_cancel_clients, description: "Client connections that have forwarded query cancellations to the server and are waiting for the server response." } - cl_waiting_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." 
} - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_active_cancel: { usage: GAUGE, rename: active_cancel_servers, description: "Server connections that are currently forwarding a cancel request" } - sv_being_canceled: { usage: GAUGE, rename: cancel_servers, description: "cancel requests have completed that were sent to cancel a query on this server" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } pgbouncer_pool_116: name: pgbouncer_pool desc: Pgbouncer pool stats (1.16-1.17) query: SHOW POOLS; ttl: 10 min_version: 11600 max_version: 11800 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." } - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } pgbouncer_pool_108: name: pgbouncer_pool desc: Pgbouncer pool stats (1.08-1.15) query: SHOW POOLS; ttl: 10 min_version: 10800 max_version: 11600 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } ================================================ FILE: config/1000-pg_wait_event.yml ================================================ #==============================================================# # 1000 pg_wait_event #==============================================================# pg_wait_event: name: pg_wait_event desc: PostgreSQL wait event sampling based on pg_wait_sampling extension query: SELECT coalesce(event_type, 'Running') AS etype, coalesce(event, 'Running') AS event, sum(count) AS count FROM pg_wait_sampling_profile GROUP BY 1,2; ttl: 10 min_version: 100000 tags: [ cluster, "extension:pg_wait_sampling" ] metrics: - etype: { usage: "LABEL" ,description: "wait event type" } - event: { usage: "LABEL" ,description: "wait event name" } - count: { usage: "COUNTER" ,description: "Total count of wait events sampled" } pg_wait_event_1s: name: pg_wait_event_1s desc: PostgreSQL wait event sampling based on pg_wait_sampling extension query: SELECT coalesce(event_type, 'Running') AS etype, coalesce(event, 'Running') AS event, count(*) FROM pg_wait_sampling_history WHERE ts BETWEEN now() - '1s'::INTERVAL AND now() GROUP BY 1,2; ttl: 10 min_version: 100000 tags: [ cluster, "extension:pg_wait_sampling" ] metrics: - etype: { usage: "LABEL" ,description: "wait event type" } - event: { usage: "LABEL" ,description: "wait event name" } - count: { usage: "GAUGE" ,description: "Number of wait events in last second" } ================================================ FILE: config/1800-pg_tsdb_hypertable.yml ================================================ #==============================================================# # 1800 pg_tsdb_hypertable #==============================================================# # this collector reqires timescaledb extension to be installed pg_tsdb_hypertable: name: pg_tsdb_hypertable desc: TimescaleDB hypertable overview query: |- SELECT current_database() AS datname, 
format('%I.%I', hypertable_schema, hypertable_name) AS relname, num_dimensions AS dimensions, num_chunks AS chunks, compression_enabled::BOOLEAN::int AS compressed, hypertable_size(format('"%I"."%I"', hypertable_schema, hypertable_name)::RegClass) AS bytes FROM timescaledb_information.hypertables; ttl: 60 timeout: 2 min_version: 100000 skip: true tags: [ "extension:timescaledb", "schema:timescaledb_information" ] metrics: - datname: { usage: LABEL ,description: "database name" } - relname: { usage: LABEL ,description: "Hypertable relation name" } - dimensions: { usage: GAUGE ,description: "Number of partitioning dimensions" } - chunks: { usage: GAUGE ,description: "Total chunks of this hypertable" } - compressed: { usage: GAUGE ,description: "1 if compression enabled" } - bytes: { usage: GAUGE ,description: "Total size of hypertable in bytes" } ================================================ FILE: config/1900-pg_citus.yml ================================================ #==============================================================# # 1900 pg_citus_node #==============================================================# # https://docs.citusdata.com/en/latest/develop/api_metadata.html#worker-node-table pg_citus_node: name: pg_citus_node desc: Citus worker coordinator node inventory query: |- SELECT CONCAT(nodename, ':', nodeport) AS node, current_database() AS datname, nodeid AS id, groupid AS group, hasmetadata::BOOLEAN::INT AS has_meta, isactive::BOOLEAN::INT AS is_active, metadatasynced::BOOLEAN::INT AS meta_synced, shouldhaveshards::BOOLEAN::INT AS have_shards FROM pg_dist_node; ttl: 60 min_version: 100000 tags: [ "extension:citus" ] metrics: - node: { usage: LABEL ,description: "nodename:port of the PostgreSQL instance" } - datname: { usage: LABEL ,description: "database name" } - id: { usage: GAUGE ,description: "auto‑generated node identifier" } - group: { usage: GAUGE ,description: "replication group id (primary + secondaries)" } - has_meta: { usage: GAUGE 
,description: "1 = internal use flag set" } - is_active: { usage: GAUGE ,description: "1 = node currently accepts shards" } - meta_synced: { usage: GAUGE ,description: "1 = metadata fully synced to node" } - have_shards: { usage: GAUGE ,description: "1 = rebalancer may place shards here" } ================================================ FILE: config/2000-pg_heartbeat.yml ================================================ #==============================================================# # 2000 heartbeat #==============================================================# # this is a example of application monitoring and predicate queries pg_heartbeat: name: pg_heartbeat desc: monitoring heartbeat in monitor.heartbeat table predicate_queries: - name: if heartbeat table exists predicate_query: | SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'monitor' AND table_name = 'heartbeat'); query: |- SELECT id AS cluster_name, extract(EPOCH FROM ts) AS ts, lsn, txid FROM monitor.heartbeat; ttl: 10 min_version: 090100 tags: [ "dbname:postgres", "schema:monitor" ] skip: true metrics: - cluster_name: { usage: LABEL ,description: "cluster_name param of this database cluster" } - ts: { usage: GAUGE ,description: "unix timestamp of the heartbeat" } - lsn: { usage: COUNTER ,description: "lsn of the heartbeat" } - txid: { usage: GAUGE ,description: "txid of the heartbeat" } ================================================ FILE: docker/.dockerignore ================================================ * !go.mod !go.sum !main.go !exporter !pg_exporter.yml ================================================ FILE: docker/README.md ================================================ # Docker Build Scripts This directory contains scripts for building Docker images for pg_exporter. ## Scripts ### `build.sh` - Local Development Build Builds a single-architecture Docker image for local development and testing. 
```bash # Build for current architecture ./docker/build.sh # Build for specific architecture ARCH=arm64 ./docker/build.sh # Use custom repository DOCKER_REPO=myrepo/pg_exporter ./docker/build.sh ``` **Features:** - Detects current architecture automatically - Builds only for the current platform - Creates local tags: `:dev`, `:latest-`, `:-` - Does not push to registry - Automatically builds missing Linux binaries if needed ### `release.sh` - Production Multi-Arch Release Builds and pushes multi-architecture Docker images with manifest list support. ```bash # Build and push multi-arch images ./docker/release.sh # Build locally without pushing (for testing) PUSH=false ./docker/release.sh # Custom repository DOCKER_REPO=pgsty/pg_exporter ./docker/release.sh # Custom platforms PLATFORMS=linux/amd64,linux/arm64,linux/arm/v7 ./docker/release.sh ``` **Features:** - Builds for multiple architectures (amd64, arm64 by default) - Creates the manifest list for automatic architecture selection - Pushes to Docker registry - Creates tags: `:`, `:latest` - Requires pre-built Linux binaries (`make release-linux`) ## How Multi-Arch Works The `release.sh` script uses Docker buildx to create a **manifest list** (also called "fat manifest"). 
This allows users to pull images without specifying architecture: ```bash # Users can simply run: docker pull pgsty/pg_exporter:latest # Docker automatically selects the right architecture: # - On AMD64 systems: pulls linux/amd64 image # - On ARM64 systems: pulls linux/arm64 image ``` ## Prerequisites ### For Local Build (`build.sh`) - Docker - Make (for building binaries if needed) ### For Production Release (`release.sh`) - Docker with buildx support - Docker registry authentication (for pushing) - Pre-built Linux binaries: `make release-linux` ## Environment Variables | Variable | Default | Description | |---------------|---------------------------|------------------------------------------| | `VERSION` | From Makefile | Image version tag | | `DOCKER_REPO` | `pgsty/pg_exporter` | Docker repository | | `ARCH` | Auto-detected | Target architecture (build.sh only) | | `PLATFORMS` | `linux/amd64,linux/arm64` | Target platforms (release.sh only) | | `PUSH` | `true` | Whether to push images (release.sh only) | ## Examples ```bash # Local development ./docker/build.sh docker run --rm pgsty/pg_exporter:dev --version # Production release make release-linux # Build binaries first ./docker/release.sh # Test locally without pushing PUSH=false ./docker/release.sh # Custom repository and platforms DOCKER_REPO=mycompany/pg_exporter \ PLATFORMS=linux/amd64,linux/arm64,linux/arm/v7 \ ./docker/release.sh ``` ================================================ FILE: docker/build.sh ================================================ #!/bin/bash #==============================================================# # File : docker/build.sh # Desc : Build single-arch Docker image locally with Go # Mtime : 2025-07-17 # License : Apache-2.0 @ https://github.com/pgsty/pg_exporter # Copyright : 2018-2026 Ruohang Feng / Vonng (rh@vonng.com) #==============================================================# set -euo pipefail # Get current script directory SCRIPT_DIR="$(cd "$(dirname 
"${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")" cd "${PROJECT_ROOT}" # Get version from Makefile or environment VERSION=${VERSION:-$(grep '^VERSION' Makefile | cut -d'=' -f2 | tr -d ' ?')} DOCKER_REPO=${DOCKER_REPO:-vonng/pg_exporter} echo "Building Docker image for pg_exporter ${VERSION} - LOCAL BUILD (Go source)" # Create Dockerfile for local Go build cat > "${SCRIPT_DIR}/Dockerfile.local" << 'EOF' # syntax=docker/dockerfile:1 FROM golang:1.26.2-alpine AS builder-env ARG GOPROXY=https://proxy.golang.org,direct ARG GOSUMDB=sum.golang.org ENV GOPROXY=${GOPROXY} ENV GOSUMDB=${GOSUMDB} WORKDIR /build # Copy dependency files and download deps COPY go.mod go.sum ./ RUN \ --mount=type=cache,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ CGO_ENABLED=0 GOOS=linux go mod download # Copy source code COPY . /build # Build static binary RUN \ --mount=type=cache,target=/go/pkg/mod \ --mount=type=cache,target=/root/.cache/go-build \ CGO_ENABLED=0 GOOS=linux go build -a -ldflags '-extldflags "-static"' -o /pg_exporter . FROM scratch LABEL org.opencontainers.image.authors="Ruohang Feng " \ org.opencontainers.image.url="https://github.com/pgsty/pg_exporter" \ org.opencontainers.image.source="https://github.com/pgsty/pg_exporter" \ org.opencontainers.image.licenses="Apache-2.0" \ org.opencontainers.image.title="pg_exporter" \ org.opencontainers.image.description="PostgreSQL/Pgbouncer metrics exporter for Prometheus" WORKDIR /bin COPY --from=builder-env /pg_exporter /bin/pg_exporter COPY pg_exporter.yml /etc/pg_exporter.yml EXPOSE 9630/tcp ENTRYPOINT ["/bin/pg_exporter"] EOF echo "Building Docker image with Go source..." 
# Build image using Go source (local only, no push) docker build \ -f "${SCRIPT_DIR}/Dockerfile.local" \ --build-arg "GOPROXY=${GOPROXY:-https://proxy.golang.org,direct}" \ --build-arg "GOSUMDB=${GOSUMDB:-sum.golang.org}" \ -t "${DOCKER_REPO}:${VERSION}-dev" \ -t "${DOCKER_REPO}:dev" \ "${PROJECT_ROOT}" echo "Docker image built successfully:" echo " ${DOCKER_REPO}:${VERSION}-dev" echo " ${DOCKER_REPO}:dev" echo "" echo "To test the image:" echo " docker run --rm ${DOCKER_REPO}:dev --version" # Clean up rm -f "${SCRIPT_DIR}/Dockerfile.local" ================================================ FILE: docker/release.sh ================================================ #!/bin/bash #==============================================================# # File : docker/release.sh # Desc : Build and release multi-arch Docker images # Mtime : 2025-07-17 # License : Apache-2.0 @ https://github.com/pgsty/pg_exporter # Copyright : 2018-2026 Ruohang Feng / Vonng (rh@vonng.com) #==============================================================# set -euo pipefail # Get current script directory SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" PROJECT_ROOT="$(dirname "${SCRIPT_DIR}")" cd "${PROJECT_ROOT}" # Get version from Makefile or environment VERSION=${VERSION:-$(grep '^VERSION' Makefile | cut -d'=' -f2 | tr -d ' ?')} DOCKER_REPO=${DOCKER_REPO:-pgsty/pg_exporter} PLATFORMS=${PLATFORMS:-linux/amd64,linux/arm64} PUSH=${PUSH:-true} echo "Building and releasing multi-arch Docker images for pg_exporter ${VERSION}" echo "Platforms: ${PLATFORMS}" echo "Repository: ${DOCKER_REPO}" # Check if both arch binaries exist echo "Checking if release binaries exist..." if [[ ! -d "dist/${VERSION}" ]]; then echo "Error: dist/${VERSION} directory not found. Building Linux binaries first..." make release-linux fi AMD64_TAR="dist/${VERSION}/pg_exporter-${VERSION}.linux-amd64.tar.gz" ARM64_TAR="dist/${VERSION}/pg_exporter-${VERSION}.linux-arm64.tar.gz" if [[ ! -f "${AMD64_TAR}" ]] || [[ ! 
-f "${ARM64_TAR}" ]]; then echo "Error: Required binaries not found. Building them now..." make release-linux if [[ ! -f "${AMD64_TAR}" ]] || [[ ! -f "${ARM64_TAR}" ]]; then echo "Error: Failed to build required binaries" exit 1 fi fi # Create temporary build contexts for each architecture BUILD_DIR=$(mktemp -d) trap "rm -rf ${BUILD_DIR}" EXIT echo "Created temporary build directory: ${BUILD_DIR}" # Extract binaries for both architectures echo "Extracting binaries..." mkdir -p "${BUILD_DIR}/amd64" "${BUILD_DIR}/arm64" tar -xzf "${AMD64_TAR}" -C "${BUILD_DIR}/amd64" --strip-components=1 tar -xzf "${ARM64_TAR}" -C "${BUILD_DIR}/arm64" --strip-components=1 # Create multi-arch Dockerfile that works with buildx cat > "${BUILD_DIR}/Dockerfile" << 'EOF' FROM scratch LABEL org.opencontainers.image.authors="Ruohang Feng " \ org.opencontainers.image.url="https://github.com/pgsty/pg_exporter" \ org.opencontainers.image.source="https://github.com/pgsty/pg_exporter" \ org.opencontainers.image.licenses="Apache-2.0" \ org.opencontainers.image.title="pg_exporter" \ org.opencontainers.image.description="PostgreSQL/Pgbouncer metrics exporter for Prometheus" WORKDIR /bin COPY pg_exporter /bin/pg_exporter COPY pg_exporter.yml /etc/pg_exporter.yml COPY LICENSE /LICENSE EXPOSE 9630/tcp ENTRYPOINT ["/bin/pg_exporter"] EOF # Copy Dockerfile to both arch directories cp "${BUILD_DIR}/Dockerfile" "${BUILD_DIR}/amd64/Dockerfile" cp "${BUILD_DIR}/Dockerfile" "${BUILD_DIR}/arm64/Dockerfile" echo "Setting up Docker buildx..." # Create or use existing buildx builder BUILDER_NAME="pg_exporter_builder" if ! docker buildx ls | grep -q "${BUILDER_NAME}"; then echo "Creating new buildx builder: ${BUILDER_NAME}" docker buildx create --name "${BUILDER_NAME}" --use --bootstrap else echo "Using existing buildx builder: ${BUILDER_NAME}" docker buildx use "${BUILDER_NAME}" fi if [[ "${PUSH}" == "true" ]]; then echo "Building and pushing multi-arch images..." 
echo "This will create a manifest list that automatically selects the right architecture." # Build and push multi-arch images with manifest list # This creates the "fat manifest" that allows automatic architecture selection docker buildx build \ --platform "${PLATFORMS}" \ --file "${BUILD_DIR}/amd64/Dockerfile" \ --tag "${DOCKER_REPO}:${VERSION}" \ --tag "${DOCKER_REPO}:latest" \ --push \ "${BUILD_DIR}/amd64" else echo "Building multi-arch images locally (no push)..." echo "This will create local images for testing." # Build multi-arch images locally without pushing docker buildx build \ --platform "${PLATFORMS}" \ --file "${BUILD_DIR}/amd64/Dockerfile" \ --tag "${DOCKER_REPO}:${VERSION}" \ --tag "${DOCKER_REPO}:latest" \ --load \ "${BUILD_DIR}/amd64" fi echo "" if [[ "${PUSH}" == "true" ]]; then echo "✅ Multi-arch Docker images released successfully!" echo "" echo "Images pushed:" echo " ${DOCKER_REPO}:${VERSION}" echo " ${DOCKER_REPO}:latest" else echo "✅ Multi-arch Docker images built locally!" echo "" echo "Images created:" echo " ${DOCKER_REPO}:${VERSION}" echo " ${DOCKER_REPO}:latest" fi echo "" echo "These images support the following architectures:" echo " - linux/amd64" echo " - linux/arm64" echo "" if [[ "${PUSH}" == "true" ]]; then echo "Users can now pull without specifying architecture:" echo " docker pull ${DOCKER_REPO}:${VERSION}" echo " docker pull ${DOCKER_REPO}:latest" echo "" echo "Docker will automatically select the correct architecture for their platform." # Verify the manifest echo "" echo "Verifying manifest list..." 
docker buildx imagetools inspect "${DOCKER_REPO}:${VERSION}" || echo "Note: manifest inspection requires authentication" else echo "To test the local images:" echo " docker run --rm ${DOCKER_REPO}:${VERSION} --version" echo "" echo "To push later:" echo " docker push ${DOCKER_REPO}:${VERSION}" echo " docker push ${DOCKER_REPO}:latest" fi ================================================ FILE: exporter/arg.go ================================================ package exporter import ( "fmt" "os" "runtime" "github.com/alecthomas/kingpin/v2" "github.com/prometheus/exporter-toolkit/web/kingpinflag" ) var ( // exporter settings pgURL = kingpin.Flag("url", "postgres target url").Short('d').Short('u').String() configPath = kingpin.Flag("config", "path to config dir or file").Short('c').String() webConfig = kingpinflag.AddFlags(kingpin.CommandLine, ":9630") constLabels = kingpin.Flag("label", "constant labels: comma separated list of label=value pair").Short('l').Default("").Envar("PG_EXPORTER_LABEL").String() serverTags = kingpin.Flag("tag", "tags,comma separated list of server tag").Default("").Short('t').Envar("PG_EXPORTER_TAG").String() disableCache = kingpin.Flag("disable-cache", "force not using cache").Default("false").Short('C').Envar("PG_EXPORTER_DISABLE_CACHE").Bool() disableIntro = kingpin.Flag("disable-intro", "disable internal/exporter self metrics (only expose query metrics)").Short('m').Default("false").Envar("PG_EXPORTER_DISABLE_INTRO").Bool() autoDiscovery = kingpin.Flag("auto-discovery", "automatically scrape all database for given server").Short('a').Default("true").Envar("PG_EXPORTER_AUTO_DISCOVERY").Bool() excludeDatabase = kingpin.Flag("exclude-database", "excluded databases when enabling auto-discovery").Short('x').Default("template0,template1,postgres").Envar("PG_EXPORTER_EXCLUDE_DATABASE").String() includeDatabase = kingpin.Flag("include-database", "included databases when enabling 
auto-discovery").Short('i').Default("").Envar("PG_EXPORTER_INCLUDE_DATABASE").String() exporterNamespace = kingpin.Flag("namespace", "prefix of built-in metrics, (pg|pgbouncer) by default").Short('n').Default("").Envar("PG_EXPORTER_NAMESPACE").String() failFast = kingpin.Flag("fail-fast", "fail fast instead of waiting during start-up").Short('f').Envar("PG_EXPORTER_FAIL_FAST").Default("false").Bool() connectTimeout = kingpin.Flag("connect-timeout", "connect timeout in ms, 100 by default").Short('T').Envar("PG_EXPORTER_CONNECT_TIMEOUT").Default("100").Int() // prometheus http // listenAddress = kingpin.Flag("web.listen-address", "prometheus web server listen address").Short('L').Default(":9630").Envar("PG_EXPORTER_LISTEN_ADDRESS").String() metricPath = kingpin.Flag("web.telemetry-path", "URL path under which to expose metrics.").Short('P').Default("/metrics").Envar("PG_EXPORTER_TELEMETRY_PATH").String() // action dryRun = kingpin.Flag("dry-run", "dry run and print raw configs").Default("false").Short('D').Bool() explainOnly = kingpin.Flag("explain", "explain server planned queries").Default("false").Short('E').Bool() // logger setting logLevel = kingpin.Flag("log.level", "log level: debug|info|warn|error").Default("info").String() logFormat = kingpin.Flag("log.format", "log format: logfmt|json").Default("logfmt").String() ) // ParseArgs will parse cli args with kingpin. url and config have special treatment func ParseArgs() { kingpin.Version(fmt.Sprintf("pg_exporter %s (built with %s on %s/%s)\n", Version, runtime.Version(), runtime.GOOS, runtime.GOARCH)) kingpin.HelpFlag.Short('h') // kingpin bool flags don't accept `--flag=false` (only `--no-flag`). // Normalize common `=true/false` forms to avoid confusing "unexpected false" errors. 
_ = kingpin.MustParse(kingpin.CommandLine.Parse(normalizeKingpinBoolEqualsArgs(os.Args[1:], kingpin.CommandLine.Model()))) Logger = configureLogger(*logLevel, *logFormat) logDebugf("init pg_exporter, configPath=%v constLabels=%v disableCache=%v autoDiscovery=%v excludeDatabase=%v includeDatabase=%v connectTimeout=%vms webConfig=%v metricPath=%v", *configPath, *constLabels, *disableCache, *autoDiscovery, *excludeDatabase, *includeDatabase, *connectTimeout, *webConfig.WebListenAddresses, *metricPath) *pgURL = GetPGURL() *configPath = GetConfig() } ================================================ FILE: exporter/args_normalize.go ================================================ package exporter import ( "strconv" "strings" "github.com/alecthomas/kingpin/v2" ) // normalizeKingpinBoolEqualsArgs rewrites boolean flags passed as `--flag=false` // (or `-f=false`) into kingpin-compatible forms (`--no-flag`). // // kingpin bool flags are "presence flags" and don't accept values, which leads // to confusing parse errors like: "unexpected false, try --help". // // This function is intentionally conservative: it only rewrites flags that are // known boolean flags in the provided kingpin model. 
func normalizeKingpinBoolEqualsArgs(args []string, model *kingpin.ApplicationModel) []string {
	// Nothing to rewrite without args or a usable flag model.
	if len(args) == 0 || model == nil || model.FlagGroupModel == nil {
		return args
	}
	// Index the model once: long names of boolean flags, and a short-rune -> long-name map.
	boolFlags := make(map[string]struct{})
	shortToLong := make(map[byte]string)
	for _, f := range model.FlagGroupModel.Flags {
		if f == nil || !f.IsBoolFlag() {
			continue
		}
		boolFlags[f.Name] = struct{}{}
		if f.Short != 0 && f.Short <= 0x7f { // only ASCII shorts are relevant to CLI parsing
			shortToLong[byte(f.Short)] = f.Name
		}
	}
	if len(boolFlags) == 0 {
		return args
	}
	out := make([]string, 0, len(args))
	for _, arg := range args {
		// Long form: --flag=false
		if strings.HasPrefix(arg, "--") {
			name, val, ok := strings.Cut(arg[2:], "=")
			if ok {
				if _, isBool := boolFlags[name]; isBool {
					// Only rewrite when the value parses as a bool; anything else
					// (e.g. --flag=maybe) is passed through for kingpin to reject.
					if b, err := strconv.ParseBool(val); err == nil {
						if b {
							out = append(out, "--"+name)
						} else {
							out = append(out, "--no-"+name)
						}
						continue
					}
				}
			}
			out = append(out, arg)
			continue
		}
		// Short form: -f=false (only single short flag)
		if strings.HasPrefix(arg, "-") && !strings.HasPrefix(arg, "--") {
			name, val, ok := strings.Cut(arg[1:], "=")
			if ok && len(name) == 1 {
				if long, exists := shortToLong[name[0]]; exists {
					if _, isBool := boolFlags[long]; isBool {
						if b, err := strconv.ParseBool(val); err == nil {
							if b {
								out = append(out, "-"+name)
							} else {
								// kingpin has no short negation; fall back to the long --no- form.
								out = append(out, "--no-"+long)
							}
							continue
						}
					}
				}
			}
			out = append(out, arg)
			continue
		}
		// Positional arguments and bare flags pass through unchanged.
		out = append(out, arg)
	}
	return out
}

================================================
FILE: exporter/args_normalize_test.go
================================================
package exporter

import (
	"reflect"
	"testing"

	"github.com/alecthomas/kingpin/v2"
)

func TestNormalizeKingpinBoolEqualsArgs_Long(t *testing.T) {
	app := kingpin.New("test", "")
	app.Flag("auto-discovery", "").Short('a').Default("true").Bool()
	app.Flag("disable-cache", "").Short('C').Default("false").Bool()
	app.Flag("log.level", "").Default("info").String()
	tests := []struct {
		in   []string
		want []string
	}{
{[]string{"--auto-discovery=false"}, []string{"--no-auto-discovery"}}, {[]string{"--auto-discovery=true"}, []string{"--auto-discovery"}}, {[]string{"--disable-cache=false"}, []string{"--no-disable-cache"}}, {[]string{"--disable-cache=true"}, []string{"--disable-cache"}}, {[]string{"--log.level=debug"}, []string{"--log.level=debug"}}, {[]string{"--unknown=false"}, []string{"--unknown=false"}}, } for _, tt := range tests { got := normalizeKingpinBoolEqualsArgs(tt.in, app.Model()) if !reflect.DeepEqual(got, tt.want) { t.Fatalf("normalize(%v) = %v, want %v", tt.in, got, tt.want) } } } func TestNormalizeKingpinBoolEqualsArgs_Short(t *testing.T) { app := kingpin.New("test", "") app.Flag("auto-discovery", "").Short('a').Default("true").Bool() app.Flag("disable-cache", "").Short('C').Default("false").Bool() app.Flag("dry-run", "").Short('D').Default("false").Bool() tests := []struct { in []string want []string }{ {[]string{"-a=false"}, []string{"--no-auto-discovery"}}, {[]string{"-a=true"}, []string{"-a"}}, {[]string{"-C=false"}, []string{"--no-disable-cache"}}, {[]string{"-C=true"}, []string{"-C"}}, {[]string{"-D=false"}, []string{"--no-dry-run"}}, {[]string{"-D=true"}, []string{"-D"}}, {[]string{"-x=false"}, []string{"-x=false"}}, // unknown short } for _, tt := range tests { got := normalizeKingpinBoolEqualsArgs(tt.in, app.Model()) if !reflect.DeepEqual(got, tt.want) { t.Fatalf("normalize(%v) = %v, want %v", tt.in, got, tt.want) } } } ================================================ FILE: exporter/collector.go ================================================ package exporter import ( "context" "database/sql" "errors" "fmt" "strings" "sync" "time" "github.com/prometheus/client_golang/prometheus" ) /* ================ Collector ================ */ type predicateCacheEntry struct { at time.Time pass bool } // Collector holds runtime information of a Query running on a Server // It is deeply coupled with Server. 
// Besides, it can be a collector itself
type Collector struct {
	*Query
	Server *Server // It's a query, but holds a server

	// runtime information
	lock          sync.RWMutex                // scrape lock
	result        []prometheus.Metric         // cached metrics
	descriptors   map[string]*prometheus.Desc // maps column index to descriptor, build on init
	cacheHit      bool                        // indicate last scrape was served from cache or real execution
	predicateSkip string                      // if nonempty, predicate query caused skip of this scrape
	err           error                       // last scrape error, if any

	// predicate cache. Entry i caches PredicateQueries[i] if it has a positive TTL.
	predicateCache []predicateCacheEntry

	// stats
	lastScrape     time.Time     // SERVER's scrape start time (for cache window align)
	scrapeBegin    time.Time     // execution begin time
	scrapeDone     time.Time     // execution complete time
	scrapeDuration time.Duration // last real execution duration
}

// NewCollector will generate query instance from query, Injecting a server object
func NewCollector(q *Query, s *Server) *Collector {
	instance := &Collector{
		Query:  q,
		Server: s,
		result: make([]prometheus.Metric, 0),
	}
	// One cache slot per predicate query; a slot is only consulted when its TTL > 0.
	if len(q.PredicateQueries) > 0 {
		instance.predicateCache = make([]predicateCacheEntry, len(q.PredicateQueries))
	}
	instance.makeDescMap()
	return instance
}

// Describe implement prometheus.Collector
func (q *Collector) Describe(ch chan<- *prometheus.Desc) {
	q.lock.Lock()
	defer q.lock.Unlock()
	q.sendDescriptors(ch)
}

// Collect implement prometheus.Collector
// Executes the query when the cache window has expired (or caching is disabled),
// otherwise re-sends the cached result.
func (q *Collector) Collect(ch chan<- prometheus.Metric) {
	q.lock.Lock()
	defer q.lock.Unlock()
	q.scrapeBegin = time.Now()
	if q.cacheExpired() || q.Server.DisableCache {
		q.execute()
		q.cacheHit = false
		q.scrapeDone = time.Now()
		q.scrapeDuration = q.scrapeDone.Sub(q.scrapeBegin)
		// Use the SERVER's scrape start so cache windows align across collectors.
		q.lastScrape = q.Server.scrapeBegin
	} else { // serve from cache
		q.cacheHit = true
		q.scrapeDone = time.Now()
	}
	q.sendMetrics(ch) // the cache is already reset to zero even execute failed
}

// ResultSize report last scraped metric count
func (q *Collector) ResultSize() int {
	return len(q.result)
}

//
Error wraps query error (including error in predicate query) func (q *Collector) Error() error { return q.err } // PredicateSkip tells the last scrape skip due to predicate query and if so which predicate query caused the skip? func (q *Collector) PredicateSkip() (bool, string) { return q.predicateSkip != "", q.predicateSkip } // Duration returns last scrape duration in float64 seconds func (q *Collector) Duration() float64 { return q.scrapeDone.Sub(q.scrapeBegin).Seconds() } // CacheHit report whether last scrape was serve from cache func (q *Collector) CacheHit() bool { return q.cacheHit } // Run any predicate queries for this query. Return true only if all predicate queries pass. // As a side effect sets predicateSkip to the first predicate query that failed, using // the predicate query name if specified otherwise the index. func (q *Collector) executePredicateQueries(ctx context.Context) bool { for i, predicateQuery := range q.PredicateQueries { predicateQueryName := predicateQuery.Name if predicateQueryName == "" { predicateQueryName = fmt.Sprintf("%d", i) } q.predicateSkip = predicateQueryName msgPrefix := fmt.Sprintf("predicate query [%s] for query [%s] @ server [%s]", predicateQueryName, q.Name, q.Server.Database) // Optional predicate cache (independent of main query cache). if predicateQuery.TTL > 0 && len(q.predicateCache) == len(q.PredicateQueries) { entry := q.predicateCache[i] if !entry.at.IsZero() { ttl := time.Duration(predicateQuery.TTL * float64(time.Second)) if q.scrapeBegin.Sub(entry.at) < ttl { logDebugf("%s served from predicate cache (ttl=%vs, pass=%v)", msgPrefix, predicateQuery.TTL, entry.pass) if entry.pass { continue } // cached skip return false } } } // Execute the predicate query. logDebugf("%s executing predicate query", msgPrefix) rows, err := q.Server.QueryContext(ctx, predicateQuery.SQL) if err != nil { // If a predicate query fails that's treated as a skip, and the err // flag is set so Fatal will be respected if set. 
if errors.Is(err, context.DeadlineExceeded) { // timeout q.err = fmt.Errorf("%s timeout because duration %v exceed limit %v", msgPrefix, time.Since(q.scrapeBegin), q.TimeoutDuration()) } else { q.err = fmt.Errorf("%s failed: %w", msgPrefix, err) } return false } // The predicate passes if it returns exactly one row with one column // that is a boolean true. colTypes, err := rows.ColumnTypes() if err != nil { q.err = fmt.Errorf("%s failed to get column types: %w", msgPrefix, err) _ = rows.Close() return false } if len(colTypes) != 1 { q.err = fmt.Errorf("%s failed because it returned %d columns, expected 1", msgPrefix, len(colTypes)) _ = rows.Close() return false } typeName := strings.ToUpper(colTypes[0].DatabaseTypeName()) if typeName != "BOOL" && typeName != "BOOLEAN" { q.err = fmt.Errorf("%s failed because it returned a column of type %s, expect bool. consider a cast(colname as boolean) or colname::boolean in the query", msgPrefix, colTypes[0].DatabaseTypeName()) _ = rows.Close() return false } firstRow := true predicatePass := sql.NullBool{} for rows.Next() { if !firstRow { q.err = fmt.Errorf("%s failed because it returned more than one row", msgPrefix) _ = rows.Close() return false } firstRow = false err = rows.Scan(&predicatePass) if err != nil { q.err = fmt.Errorf("%s failed scanning in expected 1-row 1-column nullable boolean result: %w", msgPrefix, err) _ = rows.Close() return false } } if err = rows.Err(); err != nil { q.err = fmt.Errorf("%s failed while iterating rows: %w", msgPrefix, err) _ = rows.Close() return false } if err = rows.Close(); err != nil { q.err = fmt.Errorf("%s failed closing rows: %w", msgPrefix, err) return false } pass := predicatePass.Valid && predicatePass.Bool if predicateQuery.TTL > 0 && len(q.predicateCache) == len(q.PredicateQueries) { q.predicateCache[i] = predicateCacheEntry{at: q.scrapeBegin, pass: pass} } if !pass { // successfully executed predicate query requested a skip logDebugf("%s returned false, null or zero rows, 
skipping query", msgPrefix) return false } logDebugf("%s returned true", msgPrefix) } // If we get here, all predicate queries passed. q.predicateSkip = "" return true } // execute will run this query to registered server, result and err are registered func (q *Collector) execute() { q.result = q.result[:0] // reset cache q.err = nil q.predicateSkip = "" var rows *sql.Rows var err error ctx := context.Background() if q.Timeout != 0 { // if timeout is provided, use context logDebugf("query [%s] @ server [%s] executing begin with time limit: %v", q.Name, q.Server.Database, q.TimeoutDuration()) var cancel context.CancelFunc ctx, cancel = context.WithTimeout(context.Background(), q.TimeoutDuration()) defer cancel() } else { logDebugf("query [%s] @ server [%s] executing begin", q.Name, q.Server.Database) } // check predicate queries if any if predicatePass := q.executePredicateQueries(ctx); !predicatePass { // predicateSkip and err if appropriate were set as side effects return } // main query execution rows, err = q.Server.QueryContext(ctx, q.SQL) // error handling: if query failed because of timeout or error, record and return if err != nil { if errors.Is(err, context.DeadlineExceeded) { // timeout q.err = fmt.Errorf("query [%s] timeout because duration %v exceed limit %v", q.Name, time.Since(q.scrapeBegin), q.TimeoutDuration()) } else { q.err = fmt.Errorf("query [%s] failed: %w", q.Name, err) } return } // close rows defer func(rows *sql.Rows) { _ = rows.Close() }(rows) // parsing meta: fetch column metadata for dynamic name lookup columnNames, err := rows.Columns() if err != nil { q.err = fmt.Errorf("query [%s] fail retriving rows meta: %w", q.Name, err) return } columnIndexes := make(map[string]int, len(columnNames)) // column name to index for i, n := range columnNames { columnIndexes[n] = i } nColumn := len(columnNames) colData := make([]interface{}, nColumn) colArgs := make([]interface{}, nColumn) for i := range colData { colArgs[i] = &colData[i] } if 
len(columnNames) != len(q.Columns) { // warn if column count not match logWarnf("query [%s] column count not match, result %d ≠ config %d", q.Name, len(columnNames), len(q.Columns)) } // scan loop: for each row, extract labels from all label columns, then generate a new metric for each metric column for rows.Next() { err = rows.Scan(colArgs...) if err != nil { q.err = fmt.Errorf("fail scanning rows: %w", err) return } // get labels, sequence matters, empty string for null or bad labels labels := make([]string, len(q.LabelNames)) for i, labelName := range q.LabelNames { if dataIndex, found := columnIndexes[labelName]; found { labels[i] = castString(colData[dataIndex]) } else { //if label column is not found in result, we just warn and send a empty string logWarnf("missing label %s.%s", q.Name, labelName) labels[i] = "" } } // get metrics, warn if column not exist for _, metricName := range q.MetricNames { if dataIndex, found := columnIndexes[metricName]; found { // the metric column is found in result q.result = append(q.result, prometheus.MustNewConstMetric( q.descriptors[metricName], // always find desc & column via name q.Columns[metricName].PrometheusValueType(), castFloat64(colData[dataIndex], q.Columns[metricName]), labels..., )) } else { logWarnf("missing metric column %s.%s in result", q.Name, metricName) } } } if err = rows.Err(); err != nil { q.err = fmt.Errorf("query [%s] failed while iterating rows: %w", q.Name, err) return } q.err = nil logDebugf("query [%s] executing complete in %v, metrics count: %d", q.Name, time.Since(q.scrapeBegin), len(q.result)) } /* ================ Collector Auxiliary ================ */ // makeDescMap will generate descriptor map from Query func (q *Collector) makeDescMap() { descriptors := make(map[string]*prometheus.Desc) // rename label name if label column have rename option labelNames := make([]string, len(q.LabelNames)) for i, labelName := range q.LabelNames { labelColumn := q.Columns[labelName] if labelColumn.Rename != 
"" { labelNames[i] = labelColumn.Rename } else { labelNames[i] = labelColumn.Name } } // rename metric if metric column have a rename option for _, metricName := range q.MetricNames { metricColumn := q.Columns[metricName] // always found metricName := fmt.Sprintf("%s_%s", q.Name, metricColumn.Name) if metricColumn.Rename != "" { metricName = fmt.Sprintf("%s_%s", q.Name, metricColumn.Rename) } descriptors[metricColumn.Name] = prometheus.NewDesc( metricName, metricColumn.Desc, labelNames, q.Server.labels, ) } q.descriptors = descriptors } func (q *Collector) sendDescriptors(ch chan<- *prometheus.Desc) { for _, desc := range q.descriptors { ch <- desc } } // cacheExpired report whether this instance needs actual execution // Note you have to use Server.scrapeBegin as "now", and set that timestamp as func (q *Collector) cacheExpired() bool { return q.Server.scrapeBegin.Sub(q.lastScrape) > time.Duration(q.TTL*float64(time.Second)) } func (q *Collector) cacheTTL() float64 { return q.TTL - q.Server.scrapeBegin.Sub(q.lastScrape).Seconds() } // sendMetrics will send cached result to ch func (q *Collector) sendMetrics(ch chan<- prometheus.Metric) { for _, metric := range q.result { ch <- metric } } ================================================ FILE: exporter/column.go ================================================ package exporter import ( "fmt" "strconv" "strings" "github.com/prometheus/client_golang/prometheus" ) /* ================ Column ================ */ const ( DISCARD = "DISCARD" // Ignore this column (when SELECT *) LABEL = "LABEL" // Use this column as a label COUNTER = "COUNTER" // Use this column as a counter GAUGE = "GAUGE" // Use this column as a gauge HISTOGRAM = "HISTOGRAM" // Use this column as a histogram (not implemented yet) ) // ColumnUsage determine how to use query result column var ColumnUsage = map[string]bool{ DISCARD: false, LABEL: false, COUNTER: true, GAUGE: true, } // Column holds the metadata of query result type Column struct { Name 
string `yaml:"name"` Usage string `yaml:"usage,omitempty"` // column usage Rename string `yaml:"rename,omitempty"` // rename column Bucket []float64 `yaml:"bucket,omitempty"` // histogram bucket Scale string `yaml:"scale,omitempty"` // scale factor Default string `yaml:"default,omitempty"` // default value Desc string `yaml:"description,omitempty"` // Parsed numeric options (filled during config parsing). scaleFactor float64 hasScale bool defaultValue float64 hasDefault bool } func (c *Column) parseNumbers() error { if c.Scale != "" { f, err := strconv.ParseFloat(c.Scale, 64) if err != nil { return fmt.Errorf("invalid scale %q: %w", c.Scale, err) } c.scaleFactor = f c.hasScale = true } if c.Default != "" { f, err := strconv.ParseFloat(c.Default, 64) if err != nil { return fmt.Errorf("invalid default %q: %w", c.Default, err) } c.defaultValue = f c.hasDefault = true } return nil } // PrometheusValueType returns column's corresponding prometheus value type func (c *Column) PrometheusValueType() prometheus.ValueType { switch strings.ToUpper(c.Usage) { case GAUGE: return prometheus.GaugeValue case COUNTER: return prometheus.CounterValue default: // it's user's responsibility to make sure this is a value column panic(fmt.Errorf("column %s does not have a valid value type %s", c.Name, c.Usage)) } } // String turns column into a one-line text representation func (c *Column) String() string { return fmt.Sprintf("%-8s %-20s %s", c.Usage, c.Name, c.Desc) } // MetricDesc will generate MetricDesc from column and additional information func (c *Column) MetricDesc(prefix string, labels []string) *MetricDesc { metricName := fmt.Sprintf("%s_%s{%s}", prefix, c.Name, strings.Join(labels, ",")) if c.Rename != "" { metricName = fmt.Sprintf("%s_%s{%s}", prefix, c.Rename, strings.Join(labels, ",")) } return &MetricDesc{ metricName, labels, c, } } // MetricDesc is generated by collector's column definition type MetricDesc struct { Name string Labels []string Column *Column } // Signature 
will print metric signature such as pg_db_age{datname}
func (m *MetricDesc) String() string {
	return fmt.Sprintf("%s %-8s %s", m.Name, m.Column.Usage, m.Column.Desc)
}


================================================
FILE: exporter/concurrency_test.go
================================================
package exporter

import (
	"net/http"
	"net/http/httptest"
	"sync"
	"sync/atomic"
	"testing"
	"time"
)

// newMockExporter builds an Exporter wrapping a stub Server whose UP/Recovery
// flags are pinned to the given values via beforeScrape, so health handlers
// observe deterministic state without touching a real database.
func newMockExporter(up bool, recovery bool) *Exporter {
	s := &Server{
		Database:  "postgres",
		Databases: map[string]bool{"postgres": true},
		UP:        up,
		Recovery:  recovery,
	}
	// Re-assert the pinned state on every scrape.
	// NOTE(review): assumes Server invokes beforeScrape before each scrape — confirm in server.go.
	s.beforeScrape = func(s *Server) error {
		s.UP = up
		s.Recovery = recovery
		return nil
	}
	return &Exporter{server: s}
}

// TestReloadAndHealthHandlersNoDeadlock hammers the health/status HTTP handlers
// while concurrently swapping the global exporter under ReloadLock, asserting
// that no deadlock occurs (bounded by a 10s watchdog) and that every response
// carries one of the expected status codes.
func TestReloadAndHealthHandlersNoDeadlock(t *testing.T) {
	originalExporter := PgExporter
	defer setCurrentExporter(originalExporter)
	originalLogger := Logger
	Logger = configureLogger("error", "logfmt")
	defer func() { Logger = originalLogger }()
	e1 := newMockExporter(true, false)
	e2 := newMockExporter(true, true)
	setCurrentExporter(e1)
	var failed atomic.Int32
	var wg sync.WaitGroup
	// Concurrent simulated reloads.
	wg.Add(1)
	go func() {
		defer wg.Done()
		for i := 0; i < 400; i++ {
			ReloadLock.Lock()
			if i%2 == 0 {
				setCurrentExporter(e2)
			} else {
				setCurrentExporter(e1)
			}
			ReloadLock.Unlock()
			time.Sleep(time.Millisecond)
		}
	}()
	// Concurrent health/status requests.
	for i := 0; i < 8; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			for j := 0; j < 200; j++ {
				req := httptest.NewRequest(http.MethodGet, "/up", nil)
				w1 := httptest.NewRecorder()
				e1.UpCheckFunc(w1, req)
				if w1.Code != http.StatusOK && w1.Code != http.StatusServiceUnavailable {
					failed.Add(1)
				}
				w2 := httptest.NewRecorder()
				e1.PrimaryCheckFunc(w2, req)
				if w2.Code != http.StatusOK && w2.Code != http.StatusNotFound && w2.Code != http.StatusServiceUnavailable {
					failed.Add(1)
				}
				w3 := httptest.NewRecorder()
				e1.ReplicaCheckFunc(w3, req)
				if w3.Code != http.StatusOK && w3.Code != http.StatusNotFound && w3.Code != http.StatusServiceUnavailable {
					failed.Add(1)
				}
				w4 := httptest.NewRecorder()
				e1.StatFunc(w4, req)
				if w4.Code != http.StatusOK && w4.Code != http.StatusServiceUnavailable {
					failed.Add(1)
				}
			}
		}()
	}
	done := make(chan struct{})
	go func() {
		wg.Wait()
		close(done)
	}()
	select {
	case <-done:
	case <-time.After(10 * time.Second):
		t.Fatal("concurrent reload/health requests timed out, possible deadlock")
	}
	if failed.Load() > 0 {
		t.Fatalf("unexpected HTTP status count: %d", failed.Load())
	}
}


================================================
FILE: exporter/config.go
================================================
package exporter

import (
	"fmt"
	"os"
	"path/filepath"
	"strings"

	"gopkg.in/yaml.v3"
)

// GetConfig will try load config from target path
func GetConfig() (res string) {
	// priority: cli-args > env > default settings (check exist)
	if res = *configPath; res != "" {
		logInfof("retrieve config path %s from command line", res)
		return res
	}
	if res = os.Getenv("PG_EXPORTER_CONFIG"); res != "" {
		logInfof("retrieve config path %s from PG_EXPORTER_CONFIG", res)
		return res
	}
	candidate := []string{"pg_exporter.yml", "/etc/pg_exporter.yml", "/etc/pg_exporter"}
	for _, res = range candidate {
		if _, err := os.Stat(res); err == nil { // default1 exist
			logInfof("fallback on default config path: %s", res)
			return res
		}
	}
	// no config found anywhere: caller decides how to proceed
	return ""
}

// ParseConfig turn config content into Query struct
// It validates every branch: non-empty SQL, non-negative TTLs, exactly one
// column per metrics entry, known usage, and unique prometheus label/metric
// names (validated up-front so scrape time cannot panic on bad names).
func ParseConfig(content []byte) (queries map[string]*Query, err error) {
	queries = make(map[string]*Query)
	if err = yaml.Unmarshal(content, &queries); err != nil {
		return nil, fmt.Errorf("malformed config: %w", err)
	}
	// parse additional fields
	for branch, query := range queries {
		if query == nil {
			return nil, fmt.Errorf("query %q is null", branch)
		}
		query.Branch = branch
		if query.Name == "" {
			// branch key doubles as the metric namespace when name is omitted
			query.Name = branch
		}
		if strings.TrimSpace(query.SQL) == "" {
			return nil, fmt.Errorf("query %q has empty SQL", branch)
		}
		if query.TTL < 0 {
			return nil, fmt.Errorf("query %q has negative ttl: %v", branch, query.TTL)
		}
		for i, pq := range query.PredicateQueries {
			if strings.TrimSpace(pq.SQL) == "" {
				return nil, fmt.Errorf("query %q has empty predicate_query at index %d", branch, i)
			}
			if pq.TTL < 0 {
				return nil, fmt.Errorf("query %q has negative predicate_queries[%d].ttl: %v", branch, i, pq.TTL)
			}
		}
		if len(query.Metrics) == 0 {
			return nil, fmt.Errorf("query %q has no metrics definition", branch)
		}
		// parse query column info
		columns := make(map[string]*Column, len(query.Metrics))
		var allColumns, labelColumns, metricColumns []string
		for _, colMap := range query.Metrics {
			if len(colMap) == 0 {
				return nil, fmt.Errorf("query %q has an empty metrics entry", branch)
			}
			if len(colMap) != 1 {
				return nil, fmt.Errorf("query %q has invalid metrics entry with %d columns, expect exactly 1", branch, len(colMap))
			}
			for colName, column := range colMap { // one-entry map
				if column == nil {
					return nil, fmt.Errorf("query %q has null column definition for %q", branch, colName)
				}
				if column.Name == "" {
					column.Name = colName
				}
				// usage comparison is case-insensitive: normalize to upper case
				usage := strings.ToUpper(strings.TrimSpace(column.Usage))
				if usage == "" {
					return nil, fmt.Errorf("query %q column %q has empty usage", branch, colName)
				}
				if _, isValid := ColumnUsage[usage]; !isValid {
					return nil, fmt.Errorf("query %q column %q has unsupported usage: %s", branch, colName, column.Usage)
				}
				column.Usage = usage
				if err := column.parseNumbers(); err != nil {
					return nil, fmt.Errorf("query %q column %q: %w", branch, colName, err)
				}
				switch column.Usage {
				case LABEL:
					labelColumns = append(labelColumns, column.Name)
				case GAUGE, COUNTER:
					metricColumns = append(metricColumns, column.Name)
				}
				allColumns = append(allColumns, column.Name)
				if _, exists := columns[column.Name]; exists {
					return nil, fmt.Errorf("query %q has duplicate column name %q", branch, column.Name)
				}
				columns[column.Name] = column
			}
		}
		if len(metricColumns) == 0 {
			return nil, fmt.Errorf("query %q defines no GAUGE/COUNTER columns", branch)
		}
		query.Columns, query.ColumnNames, query.LabelNames, query.MetricNames = columns, allColumns, labelColumns, metricColumns
		// Validate prometheus label names and metric names. This prevents panics at scrape time.
		seenLabels := make(map[string]bool, len(query.LabelNames))
		for _, labelColName := range query.LabelNames {
			c := query.Columns[labelColName]
			if c == nil {
				return nil, fmt.Errorf("query %q missing label column %q", branch, labelColName)
			}
			lbl := c.Name
			if c.Rename != "" {
				// rename takes precedence over the raw column name
				lbl = c.Rename
			}
			if err := validatePromLabelName(lbl); err != nil {
				return nil, fmt.Errorf("query %q label %q: %w", branch, lbl, err)
			}
			if seenLabels[lbl] {
				return nil, fmt.Errorf("query %q has duplicate label name %q", branch, lbl)
			}
			seenLabels[lbl] = true
		}
		seenMetrics := make(map[string]bool, len(query.MetricNames))
		for _, metricColName := range query.MetricNames {
			c := query.Columns[metricColName]
			if c == nil {
				return nil, fmt.Errorf("query %q missing metric column %q", branch, metricColName)
			}
			suffix := c.Name
			if c.Rename != "" {
				suffix = c.Rename
			}
			// fully-qualified metric name is <query name>_<column/rename>
			metricName := fmt.Sprintf("%s_%s", query.Name, suffix)
			if err := validatePromMetricName(metricName); err != nil {
				return nil, fmt.Errorf("query %q metric %q: %w", branch, metricName, err)
			}
			if seenMetrics[metricName] {
				return nil, fmt.Errorf("query %q has duplicate metric name %q", branch, metricName)
			}
			seenMetrics[metricName] = true
		}
	}
	return
}

// FinalizeQueries stamps each parsed query with its source path and applies
// the timeout defaulting policy (0 → 100ms default, negative → 0 = disabled).
func FinalizeQueries(queries map[string]*Query, source string) error {
	for branch, q := range queries {
		if q == nil {
			return fmt.Errorf("query %q is null", branch)
		}
		q.Path = source
		// If timeout is not set, set to 100ms by default.
		// If timeout is set to a negative number, set to 0 (disabled).
		if q.Timeout == 0 {
			q.Timeout = 0.1
		}
		if q.Timeout < 0 {
			q.Timeout = 0
		}
	}
	return nil
}

// ParseQuery generate a single query from config string
func ParseQuery(config string) (*Query, error) {
	queries, err := ParseConfig([]byte(config))
	if err != nil {
		return nil, err
	}
	if len(queries) == 0 {
		return nil, fmt.Errorf("no query definition found")
	}
	if len(queries) > 1 {
		return nil, fmt.Errorf("multiple query definition found")
	}
	if err := FinalizeQueries(queries, ""); err != nil {
		return nil, err
	}
	for _, q := range queries {
		return q, nil // return the only query instance
	}
	// unreachable: len(queries) == 1 guarantees the loop above returns
	return nil, fmt.Errorf("no query definition found")
}

// LoadConfig will read single conf file or read multiple conf file if a dir is given
// conf file in a dir will be load in alphabetic order, query with same name will overwrite predecessor
func LoadConfig(configPath string) (queries map[string]*Query, err error) {
	stat, err := os.Stat(configPath)
	if err != nil {
		return nil, fmt.Errorf("invalid config path: %s: %w", configPath, err)
	}
	if stat.IsDir() { // iterate conf files (non-recursive) if a dir is given
		files, err := os.ReadDir(configPath)
		if err != nil {
			return nil, fmt.Errorf("fail reading config dir: %s: %w", configPath, err)
		}
		logDebugf("load config from dir: %s", configPath)
		confFiles := make([]string, 0)
		for _, conf := range files {
			if conf.IsDir() {
				continue // skip subdirectories
			}
			if !(strings.HasSuffix(conf.Name(), ".yaml") || strings.HasSuffix(conf.Name(), ".yml")) {
				continue // skip non-yaml files
			}
			confFiles = append(confFiles, filepath.Join(configPath, conf.Name()))
		}
		// make global config map and assign priority according to config file alphabetic orders
		// priority is an integer range from 1 to 999, where 1 - 99 is reserved for user
		queries = make(map[string]*Query)
		var queryCount, configCount int
		var firstErr error
		for _, confPath := range confFiles {
			// recurse into the single-file branch below for each conf file;
			// a bad file is skipped (warned) rather than failing the whole dir
			if singleQueries, err := LoadConfig(confPath); err != nil {
				logWarnf("skip config %s due to error: %s", confPath, err.Error())
				if firstErr == nil {
					firstErr = err
				}
			} else {
				configCount++
				for name, query := range singleQueries {
					queryCount++
					if query.Priority == 0 { // set to config rank if not manually set
						query.Priority = 100 + configCount
					}
					queries[name] = query // so the later one will overwrite former one
				}
			}
		}
		// yaml files existed but none loaded: surface the failure (with cause if known)
		if len(confFiles) > 0 && len(queries) == 0 {
			if firstErr != nil {
				return nil, fmt.Errorf("no valid queries loaded from config dir %s (%d yaml files), first error: %w", configPath, len(confFiles), firstErr)
			}
			return nil, fmt.Errorf("no queries loaded from config dir %s (%d yaml files)", configPath, len(confFiles))
		}
		logDebugf("load %d of %d queries from %d config files", len(queries), queryCount, configCount)
		return queries, nil
	}
	// single file case: recursive exit condition
	content, err := os.ReadFile(configPath)
	if err != nil {
		return nil, fmt.Errorf("fail reading config file %s: %w", configPath, err)
	}
	queries, err = ParseConfig(content)
	if err != nil {
		return nil, err
	}
	if err := FinalizeQueries(queries, stat.Name()); err != nil {
		return nil, err
	}
	logDebugf("load %d queries from %s", len(queries), configPath)
	return queries, nil
}


================================================
FILE: exporter/config_coverage_pg9_test.go
================================================
package exporter

import (
	"os"
	"path/filepath"
	"runtime"
	"testing"
)

// Ensure the legacy config (legacy/config) covers PG9.1..PG9.6 without version
// gaps for collectors that are supposed to work on legacy PG9.x.
func TestConfigCoveragePG9(t *testing.T) {
	_, thisFile, _, ok := runtime.Caller(0)
	if !ok {
		t.Fatal("runtime.Caller failed")
	}
	configDir := filepath.Clean(filepath.Join(filepath.Dir(thisFile), "..", "legacy", "config"))
	if _, err := os.Stat(configDir); err != nil {
		t.Skipf("legacy config dir not found: %s: %v", configDir, err)
	}
	queries, err := LoadConfig(configDir)
	if err != nil {
		t.Fatalf("LoadConfig(%s) failed: %v", configDir, err)
	}
	byName := make(map[string][]*Query)
	for _, q := range queries {
		if q.HasTag("pgbouncer") {
			// PG and pgbouncer versions are in different namespaces.
			continue
		}
		byName[q.Name] = append(byName[q.Name], q)
	}
	versions := []int{90100, 90200, 90300, 90400, 90500, 90600} // PG9.1..PG9.6
	for name, qs := range byName {
		// minMin = smallest MinVersion among all branches of this collector
		minMin := 0
		for i, q := range qs {
			if i == 0 || q.MinVersion < minMin {
				minMin = q.MinVersion
			}
		}
		for _, v := range versions {
			// Collectors introduced after v are allowed to have gaps for older versions.
			if minMin != 0 && v < minMin {
				continue
			}
			// appl = branches whose [MinVersion, MaxVersion) range admits v
			var appl []*Query
			for _, q := range qs {
				if q.MinVersion != 0 && v < q.MinVersion {
					continue
				}
				if q.MaxVersion != 0 && v >= q.MaxVersion {
					continue
				}
				appl = append(appl, q)
			}
			if len(appl) == 0 {
				t.Errorf("collector %q has no branch for server_version_num=%d", name, v)
				continue
			}
			// Multiple branches for the same Name are only acceptable when they are
			// mutually exclusive via tags (e.g. primary vs replica).
			if len(appl) > 1 {
				if name == "pg" && len(appl) == 2 &&
					((appl[0].HasTag("primary") && appl[1].HasTag("replica")) ||
						(appl[0].HasTag("replica") && appl[1].HasTag("primary"))) {
					continue
				}
				t.Errorf("collector %q has %d overlapping branches for server_version_num=%d: %v",
					name, len(appl), v, func() []string {
						out := make([]string, 0, len(appl))
						for _, q := range appl {
							out = append(out, q.Branch)
						}
						return out
					}())
			}
		}
	}
}


================================================
FILE: exporter/config_coverage_test.go
================================================
package exporter

import (
	"os"
	"path/filepath"
	"runtime"
	"testing"
)

// Ensure the repo-bundled config/ covers PG10..PG18 without version gaps for
// collectors that are supposed to work on PG10+. This is a cheap static check
// (no DB required) to catch off-by-one mistakes on min/max_version splits.
func TestConfigCoveragePG10To18(t *testing.T) {
	_, thisFile, _, ok := runtime.Caller(0)
	if !ok {
		t.Fatal("runtime.Caller failed")
	}
	configDir := filepath.Clean(filepath.Join(filepath.Dir(thisFile), "..", "config"))
	if _, err := os.Stat(configDir); err != nil {
		t.Skipf("config dir not found: %s: %v", configDir, err)
	}
	queries, err := LoadConfig(configDir)
	if err != nil {
		t.Fatalf("LoadConfig(%s) failed: %v", configDir, err)
	}
	byName := make(map[string][]*Query)
	for _, q := range queries {
		if q.HasTag("pgbouncer") {
			// PG and pgbouncer versions are in different namespaces.
			continue
		}
		byName[q.Name] = append(byName[q.Name], q)
	}
	for name, qs := range byName {
		// minMin = smallest MinVersion among all branches of this collector
		minMin := 0
		for i, q := range qs {
			if i == 0 || q.MinVersion < minMin {
				minMin = q.MinVersion
			}
		}
		// Collectors introduced after PG10 are allowed to have gaps for PG10-.
		if minMin > 100000 {
			continue
		}
		for v := 100000; v <= 180000; v += 10000 { // PG10..PG18
			var appl []*Query
			for _, q := range qs {
				if q.MinVersion != 0 && v < q.MinVersion {
					continue
				}
				if q.MaxVersion != 0 && v >= q.MaxVersion { // exclude
					continue
				}
				appl = append(appl, q)
			}
			if len(appl) == 0 {
				t.Errorf("collector %q has no branch for server_version_num=%d", name, v)
				continue
			}
			// Multiple branches for the same Name are only acceptable when they are
			// mutually exclusive via tags (e.g. primary vs replica).
			if len(appl) > 1 {
				if name == "pg" && len(appl) == 2 &&
					((appl[0].HasTag("primary") && appl[1].HasTag("replica")) ||
						(appl[0].HasTag("replica") && appl[1].HasTag("primary"))) {
					continue
				}
				t.Errorf("collector %q has %d overlapping branches for server_version_num=%d: %v",
					name, len(appl), v, func() []string {
						out := make([]string, 0, len(appl))
						for _, q := range appl {
							out = append(out, q.Branch)
						}
						return out
					}())
			}
		}
	}
}


================================================
FILE: exporter/config_merged_test.go
================================================
package exporter

import (
	"os"
	"path/filepath"
	"reflect"
	"runtime"
	"slices"
	"testing"
)

// parseConfigDirLikeMerge parses every yaml file in dir in sorted order,
// letting later branches overwrite earlier ones — mirroring how the merged
// single-file config is expected to have been produced.
func parseConfigDirLikeMerge(t *testing.T, dir string) map[string]*Query {
	t.Helper()
	entries, err := os.ReadDir(dir)
	if err != nil {
		t.Fatalf("ReadDir(%s) failed: %v", dir, err)
	}
	names := make([]string, 0, len(entries))
	for _, entry := range entries {
		if entry.IsDir() {
			continue
		}
		ext := filepath.Ext(entry.Name())
		if ext != ".yml" && ext != ".yaml" {
			continue
		}
		names = append(names, entry.Name())
	}
	slices.Sort(names)
	queries := make(map[string]*Query)
	for _, name := range names {
		path := filepath.Join(dir, name)
		content, err := os.ReadFile(path)
		if err != nil {
			t.Fatalf("ReadFile(%s) failed: %v", path, err)
		}
		parsed, err := ParseConfig(content)
		if err != nil {
			t.Fatalf("ParseConfig(%s) failed: %v", path, err)
		}
		for branch, q := range parsed {
			queries[branch] = q
		}
	}
	return queries
}

// TestMergedConfigsMatchSplitDirectories asserts that the merged pg_exporter.yml
// files (current and legacy) are exactly equivalent to their split config dirs.
func TestMergedConfigsMatchSplitDirectories(t *testing.T) {
	_, thisFile, _, ok := runtime.Caller(0)
	if !ok {
		t.Fatal("runtime.Caller failed")
	}
	repoRoot := filepath.Clean(filepath.Join(filepath.Dir(thisFile), ".."))
	cases := []struct {
		name   string
		dir    string
		merged string
	}{
		{
			name:   "current",
			dir:    filepath.Join(repoRoot, "config"),
			merged: filepath.Join(repoRoot, "pg_exporter.yml"),
		},
		{
			name:   "legacy",
			dir:    filepath.Join(repoRoot, "legacy", "config"),
			merged: filepath.Join(repoRoot, "legacy", "pg_exporter.yml"),
		},
	}
	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			splitQueries := parseConfigDirLikeMerge(t, tc.dir)
			mergedContent, err := os.ReadFile(tc.merged)
			if err != nil {
				t.Fatalf("ReadFile(%s) failed: %v", tc.merged, err)
			}
			mergedQueries, err := ParseConfig(mergedContent)
			if err != nil {
				t.Fatalf("ParseConfig(%s) failed: %v", tc.merged, err)
			}
			if len(splitQueries) != len(mergedQueries) {
				t.Fatalf("query count mismatch: split=%d merged=%d", len(splitQueries), len(mergedQueries))
			}
			for branch, splitQuery := range splitQueries {
				mergedQuery, ok := mergedQueries[branch]
				if !ok {
					t.Fatalf("branch %q missing from merged config %s", branch, tc.merged)
				}
				if !reflect.DeepEqual(splitQuery, mergedQuery) {
					t.Fatalf("branch %q differs between split dir %s and merged config %s", branch, tc.dir, tc.merged)
				}
			}
		})
	}
}


================================================
FILE: exporter/config_style_test.go
================================================
package exporter

import (
	"bufio"
	"bytes"
	"os"
	"path/filepath"
	"regexp"
	"runtime"
	"strings"
	"testing"
)

// matches single-line metric entries like `- col: {usage: GAUGE, description: ...}`
// and captures everything after `description:` up to the closing brace
var inlineMetricDescriptionRE = regexp.MustCompile(`^\s*-\s*[^:]+:\s*\{.*\bdescription:\s*(.+)\}\s*$`)

// TestInlineMetricDescriptionsUseDoubleQuotes enforces the config style rule
// that inline metric descriptions are double-quoted in both config dirs.
func TestInlineMetricDescriptionsUseDoubleQuotes(t *testing.T) {
	_, thisFile, _, ok := runtime.Caller(0)
	if !ok {
		t.Fatal("runtime.Caller failed")
	}
	repoRoot := filepath.Clean(filepath.Join(filepath.Dir(thisFile), ".."))
	for _, rel := range []string{"config", filepath.Join("legacy", "config")} {
		dir := filepath.Join(repoRoot, rel)
		t.Run(rel, func(t *testing.T) {
			entries, err := os.ReadDir(dir)
			if err != nil {
				t.Fatalf("ReadDir(%s) failed: %v", dir, err)
			}
			for _, entry := range entries {
				if entry.IsDir() || filepath.Ext(entry.Name()) != ".yml" {
					continue
				}
				path := filepath.Join(dir, entry.Name())
				f, err := os.Open(path)
				if err != nil {
					t.Fatalf("Open(%s) failed: %v", path, err)
				}
				scanner := bufio.NewScanner(f)
				for lineNo := 1; scanner.Scan(); lineNo++ {
					line := scanner.Text()
					m := inlineMetricDescriptionRE.FindStringSubmatch(line)
					if m == nil {
						continue
					}
					desc := strings.TrimSpace(m[1])
					if len(desc) < 2 || desc[0] != '"' || desc[len(desc)-1] != '"' {
						t.Errorf("%s:%d inline metric description must use double quotes: %s", path, lineNo, desc)
					}
				}
				if err := scanner.Err(); err != nil {
					_ = f.Close()
					t.Fatalf("Scan(%s) failed: %v", path, err)
				}
				if err := f.Close(); err != nil {
					t.Fatalf("Close(%s) failed: %v", path, err)
				}
			}
		})
	}
}

// TestLegacySplitConfigsEndWithTwoBlankLines enforces the file-layout rule for
// legacy split configs: exactly three trailing newlines (two blank lines).
func TestLegacySplitConfigsEndWithTwoBlankLines(t *testing.T) {
	_, thisFile, _, ok := runtime.Caller(0)
	if !ok {
		t.Fatal("runtime.Caller failed")
	}
	dir := filepath.Clean(filepath.Join(filepath.Dir(thisFile), "..", "legacy", "config"))
	entries, err := os.ReadDir(dir)
	if err != nil {
		t.Fatalf("ReadDir(%s) failed: %v", dir, err)
	}
	for _, entry := range entries {
		if entry.IsDir() || filepath.Ext(entry.Name()) != ".yml" {
			continue
		}
		path := filepath.Join(dir, entry.Name())
		data, err := os.ReadFile(path)
		if err != nil {
			t.Fatalf("ReadFile(%s) failed: %v", path, err)
		}
		trailingNewlines := len(data) - len(bytes.TrimRight(data, "\n"))
		if trailingNewlines != 3 {
			t.Errorf("%s must end with exactly two blank lines (3 trailing newlines), got %d", path, trailingNewlines)
		}
	}
}


================================================
FILE: exporter/config_test.go
================================================
package exporter

import (
	"os"
	"path/filepath"
	"testing"
)

// NOTE(review): the YAML literals below were re-indented during transcription
// from a collapsed dump — verify against the original file.
func TestParseConfigUsageCaseInsensitive(t *testing.T) {
	config := `
test_query:
  query: SELECT 1 AS metric, 'db' AS datname
  metrics:
    - metric:
        usage: gauge
        description: metric value
    - datname:
        usage: label
        description: database name
`
	queries, err := ParseConfig([]byte(config))
	if err != nil {
		t.Fatalf("ParseConfig returned error: %v", err)
	}
	query, ok := queries["test_query"]
	if !ok {
		t.Fatalf("query test_query not found")
	}
	if got := query.Columns["metric"].Usage; got != GAUGE {
		t.Fatalf("metric usage = %s, want %s", got, GAUGE)
	}
	if got := query.Columns["datname"].Usage; got != LABEL {
		t.Fatalf("datname usage = %s, want %s", got, LABEL)
	}
}

func TestParseConfigInvalidUsage(t *testing.T) {
	config := `
bad_query:
  query: SELECT 1 AS metric
  metrics:
    - metric:
        usage: bad_usage
        description: metric value
`
	if _, err := ParseConfig([]byte(config)); err == nil {
		t.Fatal("ParseConfig should fail on unsupported usage")
	}
}

func TestParseConfigRejectsMultiColumnMetricsEntry(t *testing.T) {
	config := `
bad_query:
  query: SELECT 1 AS a, 2 AS b
  metrics:
    - a:
        usage: gauge
      b:
        usage: gauge
`
	if _, err := ParseConfig([]byte(config)); err == nil {
		t.Fatal("ParseConfig should fail when one metrics entry defines multiple columns")
	}
}

func TestParseQueryErrors(t *testing.T) {
	if _, err := ParseQuery(`{}`); err == nil {
		t.Fatal("ParseQuery should fail when no query is defined")
	}
	multi := `
q1:
  query: SELECT 1 AS metric
  metrics:
    - metric:
        usage: gauge
q2:
  query: SELECT 2 AS metric
  metrics:
    - metric:
        usage: gauge
`
	if _, err := ParseQuery(multi); err == nil {
		t.Fatal("ParseQuery should fail when multiple queries are defined")
	}
}

func TestLoadConfigDirectoryPriorityAndOverride(t *testing.T) {
	dir := t.TempDir()
	f1 := filepath.Join(dir, "0100-a.yml")
	f2 := filepath.Join(dir, "0200-b.yml")
	cfg1 := `
q_common:
  query: SELECT 1 AS metric
  metrics:
    - metric:
        usage: gauge
`
	cfg2 := `
q_common:
  query: SELECT 2 AS metric
  metrics:
    - metric:
        usage: gauge
q_extra:
  query: SELECT 3 AS metric
  metrics:
    - metric:
        usage: gauge
`
	if err := os.WriteFile(f1, []byte(cfg1), 0o644); err != nil {
		t.Fatalf("write config 1 failed: %v", err)
	}
	if err := os.WriteFile(f2, []byte(cfg2), 0o644); err != nil {
		t.Fatalf("write config 2 failed: %v", err)
	}
	queries, err := LoadConfig(dir)
	if err != nil {
		t.Fatalf("LoadConfig dir failed: %v", err)
	}
	if len(queries) != 2 {
		t.Fatalf("LoadConfig query count = %d, want 2", len(queries))
	}
	if queries["q_common"].SQL != "SELECT 2 AS metric" {
		t.Fatalf("q_common should be overridden by later file, got: %s", queries["q_common"].SQL)
	}
	// 2nd config file gets default priority 102.
	if queries["q_common"].Priority != 102 {
		t.Fatalf("q_common priority = %d, want 102", queries["q_common"].Priority)
	}
	if queries["q_extra"].Priority != 102 {
		t.Fatalf("q_extra priority = %d, want 102", queries["q_extra"].Priority)
	}
}

func TestLoadConfigDirectoryAllInvalidReturnsError(t *testing.T) {
	dir := t.TempDir()
	bad := `
q_bad:
  query: SELECT 1 AS metric
  metrics:
    - metric:
        usage: bad_usage
`
	if err := os.WriteFile(filepath.Join(dir, "0100-bad.yml"), []byte(bad), 0o644); err != nil {
		t.Fatalf("write config failed: %v", err)
	}
	if _, err := LoadConfig(dir); err == nil {
		t.Fatal("LoadConfig should fail when no valid queries are loaded from a config directory")
	}
}

func TestGetConfigPrecedence(t *testing.T) {
	originConfigPath := *configPath
	t.Cleanup(func() { *configPath = originConfigPath })
	originEnv := os.Getenv("PG_EXPORTER_CONFIG")
	t.Cleanup(func() { _ = os.Setenv("PG_EXPORTER_CONFIG", originEnv) })
	*configPath = "/tmp/from-cli.yml"
	_ = os.Setenv("PG_EXPORTER_CONFIG", "/tmp/from-env.yml")
	if got := GetConfig(); got != "/tmp/from-cli.yml" {
		t.Fatalf("GetConfig CLI precedence failed: got %s", got)
	}
	*configPath = ""
	if got := GetConfig(); got != "/tmp/from-env.yml" {
		t.Fatalf("GetConfig env fallback failed: got %s", got)
	}
}


================================================
FILE: exporter/exporter.go
================================================
package exporter

import (
	"errors"
	"fmt"
	"io"
	"net/http"
	"sync"
	"sync/atomic"
	"time"

	"github.com/prometheus/client_golang/prometheus"
)

/* ================ Exporter ================ */

// health status codes held in Exporter.healthStatus (atomic snapshot)
const (
	healthStatusUnknown int32 = iota
	healthStatusDown
	healthStatusStarting
	healthStatusPrimary
	healthStatusReplica
)

// Exporter implement prometheus.Collector interface
// exporter contains one or more (auto-discover-database) servers that can scrape metrics with Query
type Exporter struct {
	// config params provided from ExporterOpt
	dsn             string            // primary dsn
	configPath      string            // config file path /directory
	configReader    io.Reader         // reader to a config file, one of configPath or configReader must be set
	disableCache    bool              // always execute query when been scraped
	disableIntro    bool              // disable internal/exporter self metrics (only expose query metrics)
	autoDiscovery   bool              // discovery other database on primary server
	pgbouncerMode   bool              // is primary server a pgbouncer ?
	failFast        bool              // fail fast instead fof waiting during start-up ?
	excludeDatabase map[string]bool   // excluded database for auto discovery
	includeDatabase map[string]bool   // include database for auto discovery
	constLabels     prometheus.Labels // prometheus const k=v labels
	tags            []string          // tags passed to this exporter for scheduling purpose
	namespace       string            // metrics prefix ('pg' or 'pgbouncer' by default)
	connectTimeout  int               // timeout in ms when perform server pre-check

	// internal status
	lock    sync.RWMutex       // export lock
	server  *Server            // primary server
	sLock   sync.RWMutex       // server map lock
	servers map[string]*Server // auto discovered peripheral servers
	queries map[string]*Query  // metrics query definition

	// internal stats
	scrapeBegin time.Time // server level scrape begin
	scrapeDone  time.Time // server last scrape done

	// internal metrics: global, exporter, server, query
	up       prometheus.Gauge // cluster level: primary target server is alive
	version  prometheus.Gauge // cluster level: postgres main server version num
	recovery prometheus.Gauge // cluster level: postgres is in recovery ?

	buildInfo        prometheus.Gauge   // exporter level: build information
	exporterUp       prometheus.Gauge   // exporter level: always set ot 1
	exporterUptime   prometheus.Gauge   // exporter level: primary target server uptime (exporter itself)
	lastScrapeTime   prometheus.Gauge   // exporter level: last scrape timestamp
	scrapeDuration   prometheus.Gauge   // exporter level: seconds spend on scrape
	scrapeTotalCount prometheus.Counter // exporter level: total scrape count of this server
	scrapeErrorCount prometheus.Counter // exporter level: error scrape count

	// Dynamic series (auto-discovered DBs, config reload) are emitted as const
	// metrics on each scrape to avoid GaugeVec Reset() overhead and stale series.
	serverScrapeDurationDesc     *prometheus.Desc // {datname} database level: last scrape duration
	serverScrapeTotalSecondsDesc *prometheus.Desc // {datname} database level: cumulative scrape seconds
	serverScrapeTotalCountDesc   *prometheus.Desc // {datname} database level: total scrape count
	serverScrapeErrorCountDesc   *prometheus.Desc // {datname} database level: cumulative fatal scrape error count

	queryCacheTTLDesc                 *prometheus.Desc // {datname,query} query cache ttl
	queryScrapeTotalCountDesc         *prometheus.Desc // {datname,query} query level: total executions
	queryScrapeErrorCountDesc         *prometheus.Desc // {datname,query} query level: error count
	queryScrapePredicateSkipCountDesc *prometheus.Desc // {datname,query} query level: predicate skip count
	queryScrapeDurationDesc           *prometheus.Desc // {datname,query} query level: execution duration (seconds)
	queryScrapeMetricCountDesc        *prometheus.Desc // {datname,query} query level: returned metric count
	queryScrapeHitCountDesc           *prometheus.Desc // {datname,query} query level: cache hit count

	// lock-free health snapshot for high-frequency probes
	healthUp       atomic.Bool
	healthRecovery atomic.Bool
	healthStatus   atomic.Int32
	healthLoopLock sync.Mutex
	healthLoopStop chan struct{}
	healthLoopDone chan struct{}
}

// Up will delegate aliveness check to primary server
func (e *Exporter) Up() bool {
	return e.healthUp.Load()
}

// Recovery will delegate primary/replica check to primary server
func (e *Exporter) Recovery() bool {
	return e.healthRecovery.Load()
}

// Status will report available status: primary|replica|starting|down|unknown
func (e *Exporter) Status() string {
	switch e.healthStatus.Load() {
	case healthStatusPrimary:
		return `primary`
	case healthStatusReplica:
		return `replica`
	case healthStatusStarting:
		return `starting`
	case healthStatusDown:
		return `down`
	default:
		return `unknown`
	}
}

// updateHealthState stores an up/recovery snapshot (non-starting case).
func (e *Exporter) updateHealthState(up, recovery bool) {
	e.updateHealthStateWithStartup(up, recovery, false)
}

// updateHealthStateWithStartup atomically updates the lock-free health
// snapshot: starting beats everything, then down, then replica, then primary.
func (e *Exporter) updateHealthStateWithStartup(up, recovery, starting bool) {
	e.healthUp.Store(up)
	if starting {
		e.healthRecovery.Store(false)
		e.healthStatus.Store(healthStatusStarting)
		return
	}
	// recovery is only meaningful while the server is up
	e.healthRecovery.Store(up && recovery)
	if !up {
		e.healthStatus.Store(healthStatusDown)
		return
	}
	if recovery {
		e.healthStatus.Store(healthStatusReplica)
		return
	}
	e.healthStatus.Store(healthStatusPrimary)
}

// updateHealthStateFromServer refreshes the health snapshot from the primary
// server's cached UP/Recovery flags (no network probe).
func (e *Exporter) updateHealthStateFromServer() {
	if e.server == nil {
		e.healthUp.Store(false)
		e.healthRecovery.Store(false)
		e.healthStatus.Store(healthStatusUnknown)
		return
	}
	e.server.lock.RLock()
	up := e.server.UP
	recovery := e.server.Recovery
	e.server.lock.RUnlock()
	e.updateHealthState(up, recovery)
}

// probeAndUpdateHealthState performs an active health probe against the primary
// server and folds the result into the atomic health snapshot.
func (e *Exporter) probeAndUpdateHealthState() error {
	if e.server == nil {
		e.healthUp.Store(false)
		e.healthRecovery.Store(false)
		e.healthStatus.Store(healthStatusUnknown)
		return errors.New("primary server is nil")
	}
	up, recovery, starting, err := e.server.ProbeHealth()
	e.updateHealthStateWithStartup(up, recovery, starting)
	return err
}

// startHealthLoop launches the background 1Hz health-probe goroutine.
// Idempotent: a second call while the loop is running is a no-op.
func (e *Exporter) startHealthLoop() {
	e.healthLoopLock.Lock()
	if e.healthLoopStop != nil {
		e.healthLoopLock.Unlock()
		return
	}
	stopCh := make(chan struct{})
	doneCh := make(chan struct{})
	e.healthLoopStop = stopCh
	e.healthLoopDone = doneCh
	e.healthLoopLock.Unlock()
	go func() {
		defer close(doneCh)
		ticker := time.NewTicker(1 * time.Second)
		defer ticker.Stop()
		// Health probing is intentionally decoupled from /metrics scraping so that:
		// - HTTP health handlers never block on network calls
		// - the exporter can recover health status even when scrapes are failing
		//
		// Keep Server.ProbeHealth cheap and log-noise-free: it runs once per second.
		_ = e.probeAndUpdateHealthState()
		for {
			select {
			case <-stopCh:
				return
			case <-ticker.C:
				_ = e.probeAndUpdateHealthState()
			}
		}
	}()
}

// stopHealthLoop signals the health-probe goroutine to stop and waits for it
// to exit. Safe to call when the loop was never started.
func (e *Exporter) stopHealthLoop() {
	e.healthLoopLock.Lock()
	stopCh := e.healthLoopStop
	doneCh := e.healthLoopDone
	e.healthLoopStop = nil
	e.healthLoopDone = nil
	e.healthLoopLock.Unlock()
	if stopCh == nil {
		return
	}
	close(stopCh)
	if doneCh != nil {
		<-doneCh
	}
}

// Describe implement prometheus.Collector
func (e *Exporter) Describe(ch chan<- *prometheus.Desc) {
	// Intentionally leave this exporter "unchecked".
	//
	// Query metrics are dynamic:
	// - config reload can add/remove collectors and metrics
	// - auto-discovery can add/remove databases
	//
	// If we emitted any descriptors here, the Prometheus registry would enforce
	// that Collect() only returns described metrics, which does not hold for a
	// dynamic exporter. Exporter-toolkit and client_golang both support this
	// pattern (Describe emits nothing).
}

// Collect implement prometheus.Collector
func (e *Exporter) Collect(ch chan<- prometheus.Metric) {
	e.lock.Lock()
	defer e.lock.Unlock()
	if !e.disableIntro {
		e.scrapeTotalCount.Add(1)
	}
	e.scrapeBegin = time.Now()
	// scrape primary server
	// NOTE(review): e.server is dereferenced without a nil check here (unlike
	// collectServerMetrics below) — presumably guaranteed non-nil after init; confirm.
	s := e.server
	s.Collect(ch)
	// scrape extra servers if exists
	for _, srv := range e.IterateServer() {
		srv.Collect(ch)
	}
	e.scrapeDone = time.Now()
	if !e.disableIntro {
		e.lastScrapeTime.Set(float64(e.scrapeDone.Unix()))
		e.scrapeDuration.Set(e.scrapeDone.Sub(e.scrapeBegin).Seconds())
	}
	// snapshot primary server state under its read lock
	s.lock.RLock()
	version := s.Version
	up := s.UP
	recovery := s.Recovery
	s.lock.RUnlock()
	e.updateHealthState(up, recovery)
	if !e.disableIntro {
		e.version.Set(float64(version))
		if up {
			e.up.Set(1)
			if recovery {
				e.recovery.Set(1)
			} else {
				e.recovery.Set(0)
			}
		} else {
			e.up.Set(0)
			e.scrapeErrorCount.Add(1)
		}
		e.exporterUptime.Set(e.server.Uptime())
		e.collectServerMetrics(ch)
		e.collectInternalMetrics(ch)
	}
}

// collectServerMetrics emits per-database and per-query statistics as const
// metrics for the primary server and all auto-discovered servers.
func (e *Exporter) collectServerMetrics(ch chan<- prometheus.Metric) {
	servers := e.IterateServer()
	if e.server != nil {
		servers = append(servers, e.server) // append primary server to extra server list
	}
	for _, s := range servers {
		if s == nil {
			continue
		}
		s.lock.RLock()
		datname := s.Database
		scrapeDur := s.scrapeDone.Sub(s.scrapeBegin).Seconds()
		totalSeconds := s.totalTime
		totalCount := s.totalCount
		errorCount := s.errorCount
		// Snapshot query maps (they are replaced as a whole on ResetStats).
		queryCacheTTL := s.queryCacheTTL
		queryScrapeTotalCount := s.queryScrapeTotalCount
		queryScrapeHitCount := s.queryScrapeHitCount
		queryScrapeErrorCount := s.queryScrapeErrorCount
		queryScrapePredicateSkipCount := s.queryScrapePredicateSkipCount
		queryScrapeMetricCount := s.queryScrapeMetricCount
		queryScrapeDuration := s.queryScrapeDuration
		s.lock.RUnlock()
		ch <- prometheus.MustNewConstMetric(e.serverScrapeDurationDesc, prometheus.GaugeValue, scrapeDur, datname)
		ch <- prometheus.MustNewConstMetric(e.serverScrapeTotalSecondsDesc, prometheus.GaugeValue, totalSeconds, datname)
		ch <- prometheus.MustNewConstMetric(e.serverScrapeTotalCountDesc, prometheus.GaugeValue, totalCount, datname)
		ch <- prometheus.MustNewConstMetric(e.serverScrapeErrorCountDesc, prometheus.GaugeValue, errorCount, datname)
		for queryName, v := range queryCacheTTL {
			ch <- prometheus.MustNewConstMetric(e.queryCacheTTLDesc, prometheus.GaugeValue, v, datname, queryName)
		}
		for queryName, v := range queryScrapeTotalCount {
			ch <- prometheus.MustNewConstMetric(e.queryScrapeTotalCountDesc, prometheus.GaugeValue, v, datname, queryName)
		}
		for queryName, v := range queryScrapeHitCount {
			ch <- prometheus.MustNewConstMetric(e.queryScrapeHitCountDesc, prometheus.GaugeValue, v, datname, queryName)
		}
		for queryName, v := range queryScrapeErrorCount {
			ch <- prometheus.MustNewConstMetric(e.queryScrapeErrorCountDesc, prometheus.GaugeValue, v, datname, queryName)
		}
		for queryName, v := range queryScrapePredicateSkipCount {
			ch <- prometheus.MustNewConstMetric(e.queryScrapePredicateSkipCountDesc, prometheus.GaugeValue, v, datname, queryName)
		}
		for queryName, v := range queryScrapeMetricCount {
			ch <- prometheus.MustNewConstMetric(e.queryScrapeMetricCountDesc, prometheus.GaugeValue, v, datname, queryName)
		}
		for queryName, v := range queryScrapeDuration {
			ch <- prometheus.MustNewConstMetric(e.queryScrapeDurationDesc, prometheus.GaugeValue, v, datname, queryName)
		}
	}
}

// Explain is a thin wrapper of server.Explain (plain text).
func (e *Exporter) Explain() string { return e.server.Explain() } // Stat is just yet another wrapper of server.Stat func (e *Exporter) Stat() string { logDebugf("stats invoked") return e.server.Stat() } // Check will perform an immediate server health check func (e *Exporter) Check() { if err := e.probeAndUpdateHealthState(); err != nil { logErrorf("exporter check failure: %s", err.Error()) } else { logDebugf("exporter check ok") } } // Close will close all underlying servers func (e *Exporter) Close() { e.stopHealthLoop() if e.server != nil { if e.server.DB != nil { err := e.server.Close() if err != nil { logErrorf("fail closing server %s: %s", e.server.Name(), err.Error()) } } } // close peripheral servers (we may skip acquire lock here) for _, srv := range e.IterateServer() { if srv != nil { if srv.DB != nil { err := srv.Close() if err != nil { logErrorf("fail closing server %s: %s", srv.Name(), err.Error()) } } } } logInfof("pg exporter closed") } // setupInternalMetrics will init internal metrics func (e *Exporter) setupInternalMetrics() { if e.namespace == "" { if e.pgbouncerMode { e.namespace = "pgbouncer" } else { e.namespace = "pg" } } // major fact e.up = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Name: "up", Help: "last scrape was able to connect to the server: 1 for yes, 0 for no", }) e.version = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Name: "version", Help: "server version number", }) e.recovery = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Name: "in_recovery", Help: "server is in recovery mode? 
1 for yes 0 for no", }) // build info buildInfoLabels := prometheus.Labels{ "version": Version, "revision": Revision, "branch": Branch, "builddate": BuildDate, "goversion": GoVersion, "goos": GOOS, "goarch": GOARCH, } // Merge with user-provided constant labels for k, v := range e.constLabels { buildInfoLabels[k] = v } e.buildInfo = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: e.namespace, Name: "exporter_build_info", Help: "A metric with a constant '1' value labeled with version, revision, branch, goversion, builddate, goos, and goarch from which pg_exporter was built.", ConstLabels: buildInfoLabels, }) // Set the build info value e.buildInfo.Set(1) // exporter level metrics e.exporterUp = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Subsystem: "exporter", Name: "up", Help: "always be 1 if your could retrieve metrics", }) e.exporterUptime = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Subsystem: "exporter", Name: "uptime", Help: "seconds since exporter primary server inited", }) e.scrapeTotalCount = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Subsystem: "exporter", Name: "scrape_total_count", Help: "times exporter was scraped for metrics", }) e.scrapeErrorCount = prometheus.NewCounter(prometheus.CounterOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Subsystem: "exporter", Name: "scrape_error_count", Help: "times exporter was scraped for metrics and failed", }) e.scrapeDuration = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Subsystem: "exporter", Name: "scrape_duration", Help: "seconds exporter spending on scraping", }) e.lastScrapeTime = prometheus.NewGauge(prometheus.GaugeOpts{ Namespace: e.namespace, ConstLabels: e.constLabels, Subsystem: "exporter", Name: "last_scrape_time", Help: "last scrape timestamp", }) // Dynamic per-server/per-query 
series. // These are described via *prometheus.Desc and emitted as const metrics on each scrape. e.serverScrapeDurationDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_server", "scrape_duration"), "seconds exporter server spending on scraping last scrape", []string{"datname"}, e.constLabels, ) e.serverScrapeTotalSecondsDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_server", "scrape_total_seconds"), "cumulative total seconds exporter server spending on scraping", []string{"datname"}, e.constLabels, ) e.serverScrapeTotalCountDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_server", "scrape_total_count"), "times exporter server was scraped for metrics", []string{"datname"}, e.constLabels, ) e.serverScrapeErrorCountDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_server", "scrape_error_count"), "cumulative times exporter server scrape failed (fatal scrape failures only)", []string{"datname"}, e.constLabels, ) e.queryCacheTTLDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_query", "cache_ttl"), "times to live of query cache", []string{"datname", "query"}, e.constLabels, ) e.queryScrapeTotalCountDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_query", "scrape_total_count"), "times exporter server was scraped for metrics", []string{"datname", "query"}, e.constLabels, ) e.queryScrapeErrorCountDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_query", "scrape_error_count"), "times the query failed", []string{"datname", "query"}, e.constLabels, ) e.queryScrapePredicateSkipCountDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_query", "scrape_predicate_skip_count"), "times the query was skipped due to a predicate returning false", []string{"datname", "query"}, e.constLabels, ) e.queryScrapeDurationDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_query", 
"scrape_duration"), "seconds query spending on scraping", []string{"datname", "query"}, e.constLabels, ) e.queryScrapeMetricCountDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_query", "scrape_metric_count"), "numbers of metrics been scraped from this query", []string{"datname", "query"}, e.constLabels, ) e.queryScrapeHitCountDesc = prometheus.NewDesc( prometheus.BuildFQName(e.namespace, "exporter_query", "scrape_hit_count"), "numbers been scraped from this query", []string{"datname", "query"}, e.constLabels, ) e.exporterUp.Set(1) // always be true e.healthStatus.Store(healthStatusUnknown) } func (e *Exporter) collectInternalMetrics(ch chan<- prometheus.Metric) { ch <- e.up ch <- e.version ch <- e.recovery ch <- e.buildInfo ch <- e.exporterUp ch <- e.exporterUptime ch <- e.lastScrapeTime ch <- e.scrapeTotalCount ch <- e.scrapeErrorCount ch <- e.scrapeDuration } /* ================ Exporter Creation ================ */ // NewExporter construct a PG Exporter instance for given dsn func NewExporter(dsn string, opts ...ExporterOpt) (e *Exporter, err error) { e = &Exporter{dsn: dsn} e.servers = make(map[string]*Server) for _, opt := range opts { opt(e) } if len(e.configPath) > 0 && e.configReader != nil { return nil, errors.New("exporter configPath and configReader options are mutually exclusive") } if len(e.configPath) > 0 { if e.queries, err = LoadConfig(e.configPath); err != nil { return nil, fmt.Errorf("fail loading config file %s: %w", e.configPath, err) } } if e.configReader != nil { b, rerr := io.ReadAll(e.configReader) if rerr != nil { return nil, fmt.Errorf("fail reading config file: %w", rerr) } if e.queries, err = ParseConfig(b); err != nil { return nil, fmt.Errorf("fail parsing config file: %w", err) } if err := FinalizeQueries(e.queries, ""); err != nil { return nil, fmt.Errorf("fail finalizing config: %w", err) } } if err := validateConstLabelConflicts(e.constLabels, e.queries, e.disableIntro); err != nil { return nil, 
fmt.Errorf("invalid constant labels: %w", err) } logDebugf("exporter init with %d queries", len(e.queries)) // note here the server is still not connected. it will trigger connecting when being scraped e.server = NewServer( dsn, WithQueries(e.queries), WithConstLabel(e.constLabels), WithCachePolicy(e.disableCache), WithServerTags(e.tags), WithServerConnectTimeout(e.connectTimeout), ) // register db change callback if e.autoDiscovery { logInfof("auto discovery is enabled, excludeDatabase=%v, includeDatabase=%v", e.excludeDatabase, e.includeDatabase) e.server.onDatabaseChange = e.OnDatabaseChange } logDebugf("check primary server connectivity") // Best-effort check: we don't block the exporter startup if the target is down. // The actual scrape path will reconnect and re-plan when the target comes back. if err = e.server.Check(); err != nil { if e.failFast { return nil, fmt.Errorf("fail connecting to primary server: %w", err) } logErrorf("fail connecting to primary server: %s (startup will continue)", err.Error()) // NewExporter has named return values; make sure we don't propagate the // precheck error when failFast is disabled. err = nil } e.pgbouncerMode = e.server.PgbouncerMode e.setupInternalMetrics() e.updateHealthStateFromServer() // Always start the health loop so probes can recover once the target becomes reachable. 
e.startHealthLoop() return } // OnDatabaseChange will spawn new Server when new database is created // and destroy Server if corresponding database is dropped func (e *Exporter) OnDatabaseChange(change map[string]bool) { for dbname, add := range change { verb := "del" if add { verb = "add" } if dbname == e.server.Database { continue // skip primary database change } if _, found := e.excludeDatabase[dbname]; found { logInfof("skip database change: %v %v according to in excluded database list", verb, dbname) continue // skip exclude databases changes } if len(e.includeDatabase) > 0 { if _, found := e.includeDatabase[dbname]; !found { logInfof("skip database change: %v %v according to not in include database list", verb, dbname) continue // skip non-include databases changes } } if add { // spawn new server e.CreateServer(dbname) } else { // close old server e.RemoveServer(dbname) } } } // CreateServer will spawn new database server from a database name combined with existing dsn string // This happens when a database is newly created func (e *Exporter) CreateServer(dbname string) { newDSN := ReplaceDatname(e.dsn, dbname) logInfof("spawn new server for database %s : %s", dbname, ShadowPGURL(newDSN)) newServer := NewServer( newDSN, WithQueries(e.queries), WithConstLabel(e.constLabels), WithCachePolicy(e.disableCache), WithServerTags(e.tags), WithServerConnectTimeout(e.connectTimeout), ) newServer.Forked = true // important! e.sLock.Lock() e.servers[dbname] = newServer logInfof("database %s is installed due to auto-discovery", dbname) defer e.sLock.Unlock() } // RemoveServer will destroy Server from Exporter according to database name // This happens when a database is dropped func (e *Exporter) RemoveServer(dbname string) { e.sLock.Lock() srv, ok := e.servers[dbname] if ok { delete(e.servers, dbname) } e.sLock.Unlock() if ok && srv != nil { if srv.DB != nil { // Close asynchronously to avoid blocking the scrape path. 
go func(dbname string, srv *Server) { if err := srv.Close(); err != nil { logErrorf("fail closing removed database server %s: %s", dbname, err.Error()) } }(dbname, srv) } } logWarnf("database %s is removed due to auto-discovery", dbname) } // IterateServer will get snapshot of extra servers func (e *Exporter) IterateServer() (res []*Server) { e.sLock.RLock() defer e.sLock.RUnlock() if len(e.servers) == 0 { return nil } res = make([]*Server, 0, len(e.servers)) for _, srv := range e.servers { res = append(res, srv) } return } // ExporterOpt configures Exporter type ExporterOpt func(*Exporter) // WithConfig add config path to Exporter func WithConfig(configPath string) ExporterOpt { return func(e *Exporter) { e.configPath = configPath } } // WithConfigReader uses a the provided reader to load a configuration for the Exporter func WithConfigReader(reader io.Reader) ExporterOpt { return func(e *Exporter) { e.configReader = reader } } // WithConstLabels add const label to exporter. 0 length label returns nil func WithConstLabels(s string) ExporterOpt { return func(e *Exporter) { e.constLabels = parseConstLabels(s) } } // WithCacheDisabled set cache param to exporter func WithCacheDisabled(disableCache bool) ExporterOpt { return func(e *Exporter) { e.disableCache = disableCache } } // WithIntroDisabled will pass introspection option to server func WithIntroDisabled(disableIntro bool) ExporterOpt { return func(s *Exporter) { s.disableIntro = disableIntro } } // WithFailFast marks exporter fail instead of waiting during start-up func WithFailFast(failFast bool) ExporterOpt { return func(e *Exporter) { e.failFast = failFast } } // WithNamespace will specify metric namespace, by default is pg or pgbouncer func WithNamespace(namespace string) ExporterOpt { return func(e *Exporter) { e.namespace = namespace } } // WithTags will register given tags to Exporter and all belonged servers func WithTags(tags string) ExporterOpt { return func(e *Exporter) { e.tags = parseCSV(tags) } } 
// WithAutoDiscovery configures exporter with excluded database func WithAutoDiscovery(flag bool) ExporterOpt { return func(e *Exporter) { e.autoDiscovery = flag } } // WithExcludeDatabase configures exporter with excluded database func WithExcludeDatabase(excludeStr string) ExporterOpt { return func(e *Exporter) { exclMap := make(map[string]bool) exclList := parseCSV(excludeStr) for _, item := range exclList { exclMap[item] = true } e.excludeDatabase = exclMap } } // WithIncludeDatabase configures exporter with included database func WithIncludeDatabase(includeStr string) ExporterOpt { return func(e *Exporter) { inclMap := make(map[string]bool) inclList := parseCSV(includeStr) for _, item := range inclList { inclMap[item] = true } e.includeDatabase = inclMap } } // WithConnectTimeout will specify timeout for conn pre-check. // It's useful to increase this value when monitoring a remote instance (cross DC, cross AZ) func WithConnectTimeout(timeout int) ExporterOpt { return func(e *Exporter) { e.connectTimeout = timeout } } /* ================ Exporter RESTAPI ================ */ func currentExporter() *Exporter { if target := currentExporterPt.Load(); target != nil { return target } ReloadLock.RLock() defer ReloadLock.RUnlock() return PgExporter } // ExplainFunc expose explain document func (e *Exporter) ExplainFunc(w http.ResponseWriter, r *http.Request) { // The explain output is plain text. Serving it as text/plain avoids // browsers interpreting config content as HTML. w.Header().Set("Content-Type", "text/plain; charset=utf-8") target := currentExporter() if target == nil { w.WriteHeader(http.StatusServiceUnavailable) _, _ = w.Write([]byte("exporter unavailable")) return } _, _ = w.Write([]byte(target.Explain())) } // StatFunc exposes plain text runtime statistics. 
func (e *Exporter) StatFunc(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain; charset=UTF-8") target := currentExporter() if target == nil { w.WriteHeader(http.StatusServiceUnavailable) _, _ = w.Write([]byte("exporter unavailable")) return } _, _ = w.Write([]byte(target.Stat())) } // UpCheckFunc tells whether target instance is alive, 200 up 503 down func (e *Exporter) UpCheckFunc(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain; charset=utf-8") target := currentExporter() if target == nil { w.WriteHeader(http.StatusServiceUnavailable) _, _ = w.Write([]byte("unknown")) return } // Note: /up reports the latest state from the background health loop. // It does not actively probe the target on each HTTP request. status := target.Status() if target.Up() { w.WriteHeader(200) _, _ = w.Write([]byte(status)) } else { w.WriteHeader(503) _, _ = w.Write([]byte(status)) } } // PrimaryCheckFunc tells whether target instance is a primary, 200 yes 404 no 503 unknown func (e *Exporter) PrimaryCheckFunc(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain; charset=utf-8") target := currentExporter() if target == nil { w.WriteHeader(http.StatusServiceUnavailable) _, _ = w.Write([]byte("unknown")) return } status := target.Status() if target.Up() { if target.Recovery() { w.WriteHeader(404) _, _ = w.Write([]byte(status)) } else { w.WriteHeader(200) _, _ = w.Write([]byte(status)) } } else { w.WriteHeader(503) _, _ = w.Write([]byte(status)) } } // ReplicaCheckFunc tells whether target instance is a replica, 200 yes 404 no 503 unknown func (e *Exporter) ReplicaCheckFunc(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain; charset=utf-8") target := currentExporter() if target == nil { w.WriteHeader(http.StatusServiceUnavailable) _, _ = w.Write([]byte("unknown")) return } status := target.Status() if target.Up() { if target.Recovery() { w.WriteHeader(200) 
_, _ = w.Write([]byte(status)) } else { w.WriteHeader(404) _, _ = w.Write([]byte(status)) } } else { w.WriteHeader(503) _, _ = w.Write([]byte(status)) } } // VersionFunc responding current pg_exporter version func VersionFunc(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain; charset=utf-8") payload := fmt.Sprintf("pg_exporter version %s\nrevision: %s\nbranch: %s\ngo version: %s\nbuild date: %s\ngoos: %s\ngoarch: %s", Version, Revision, Branch, GoVersion, BuildDate, GOOS, GOARCH) _, _ = w.Write([]byte(payload)) } // TitleFunc responding a description message func TitleFunc(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/html; charset=UTF-8") _, _ = w.Write([]byte(`PG Exporter

PG Exporter

Metrics

`)) } // ReloadFunc handles reload request func ReloadFunc(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "text/plain; charset=utf-8") if r.Method != http.MethodPost && r.Method != http.MethodGet { w.Header().Set("Allow", "GET, POST") w.WriteHeader(http.StatusMethodNotAllowed) _, _ = w.Write([]byte("method not allowed")) return } if err := Reload(); err != nil { w.WriteHeader(500) _, _ = w.Write([]byte(fmt.Sprintf("fail to reload: %s", err.Error()))) } else { _, _ = w.Write([]byte(`server reloaded`)) } } ================================================ FILE: exporter/exporter_handlers_opts_test.go ================================================ package exporter import ( "net/http" "net/http/httptest" "strings" "sync/atomic" "testing" ) func TestExporterOptionHelpers(t *testing.T) { e := &Exporter{} WithConfig("/tmp/c.yml")(e) WithConstLabels("k=v,env=prod")(e) WithCacheDisabled(true)(e) WithIntroDisabled(true)(e) WithFailFast(true)(e) WithNamespace("custom")(e) WithTags("a,b")(e) WithAutoDiscovery(true)(e) WithExcludeDatabase("template0,template1")(e) WithIncludeDatabase("app,metrics")(e) WithConnectTimeout(500)(e) if e.configPath != "/tmp/c.yml" { t.Fatalf("configPath = %s", e.configPath) } if e.constLabels["k"] != "v" || e.constLabels["env"] != "prod" { t.Fatalf("constLabels = %v", e.constLabels) } if !e.disableCache || !e.disableIntro || !e.failFast || !e.autoDiscovery { t.Fatalf("boolean options not applied: cache=%v intro=%v failFast=%v auto=%v", e.disableCache, e.disableIntro, e.failFast, e.autoDiscovery) } if e.namespace != "custom" { t.Fatalf("namespace = %s", e.namespace) } if len(e.tags) != 2 || e.tags[0] != "a" || e.tags[1] != "b" { t.Fatalf("tags = %#v", e.tags) } if !e.excludeDatabase["template0"] || !e.excludeDatabase["template1"] { t.Fatalf("excludeDatabase = %v", e.excludeDatabase) } if !e.includeDatabase["app"] || !e.includeDatabase["metrics"] { t.Fatalf("includeDatabase = %v", e.includeDatabase) } if e.connectTimeout != 
500 { t.Fatalf("connectTimeout = %d", e.connectTimeout) } } func TestPublicHandlers(t *testing.T) { req := httptest.NewRequest(http.MethodGet, "/", nil) wTitle := httptest.NewRecorder() TitleFunc(wTitle, req) if wTitle.Code != http.StatusOK || !strings.Contains(wTitle.Body.String(), "PG Exporter") { t.Fatalf("TitleFunc unexpected response: code=%d body=%s", wTitle.Code, wTitle.Body.String()) } wVersion := httptest.NewRecorder() VersionFunc(wVersion, req) if wVersion.Code != http.StatusOK || !strings.Contains(wVersion.Body.String(), "pg_exporter version") { t.Fatalf("VersionFunc unexpected response: code=%d body=%s", wVersion.Code, wVersion.Body.String()) } } func TestExplainAndStatHandlersWhenExporterUnavailable(t *testing.T) { origin := PgExporter setCurrentExporter(nil) defer setCurrentExporter(origin) e := &Exporter{} req := httptest.NewRequest(http.MethodGet, "/explain", nil) wExplain := httptest.NewRecorder() e.ExplainFunc(wExplain, req) if wExplain.Code != http.StatusServiceUnavailable { t.Fatalf("ExplainFunc status = %d, want 503", wExplain.Code) } wStat := httptest.NewRecorder() e.StatFunc(wStat, req) if wStat.Code != http.StatusServiceUnavailable { t.Fatalf("StatFunc status = %d, want 503", wStat.Code) } } func TestHealthHandlersPassiveModeNoActiveProbe(t *testing.T) { var checkCount atomic.Int32 s := &Server{ Database: "postgres", Databases: map[string]bool{"postgres": true}, } s.beforeScrape = func(s *Server) error { checkCount.Add(1) s.UP = false s.Recovery = false return nil } e := &Exporter{server: s} e.updateHealthState(true, false) // cached primary/up state origin := PgExporter setCurrentExporter(e) defer setCurrentExporter(origin) req := httptest.NewRequest(http.MethodGet, "/up", nil) w := httptest.NewRecorder() e.UpCheckFunc(w, req) if w.Code != http.StatusOK { t.Fatalf("passive health check should use cached up status, got %d", w.Code) } if checkCount.Load() != 0 { t.Fatalf("passive health check should not probe DB, count=%d", checkCount.Load()) 
} } ================================================ FILE: exporter/global.go ================================================ package exporter import ( "log/slog" "runtime" "sync" "sync/atomic" ) /* ================ Parameters ================ */ // Version is read by make build procedure var Version = "1.2.2" // Build information. Populated at build-time. var ( Branch = "main" Revision = "HEAD" BuildDate = "20250421212100" // will be overwritten during release GoVersion = runtime.Version() GOOS = runtime.GOOS GOARCH = runtime.GOARCH ) var defaultPGURL = "postgresql:///?sslmode=disable" /* ================ Global Vars ================ */ // PgExporter is the global singleton of Exporter var ( PgExporter *Exporter currentExporterPt atomic.Pointer[Exporter] ReloadLock sync.RWMutex Logger = slog.Default() ) func setCurrentExporter(e *Exporter) { PgExporter = e currentExporterPt.Store(e) } ================================================ FILE: exporter/health_state_test.go ================================================ package exporter import ( "errors" "fmt" "testing" "github.com/lib/pq" ) func TestIsPostgresStartupError(t *testing.T) { if !isPostgresStartupError(&pq.Error{Code: pq.ErrorCode(pgSQLStateCannotConnectNow)}) { t.Fatal("expected SQLSTATE 57P03 to be recognized as startup error") } wrapped := fmt.Errorf("wrapped: %w", &pq.Error{Code: pq.ErrorCode(pgSQLStateCannotConnectNow)}) if !isPostgresStartupError(wrapped) { t.Fatal("expected wrapped SQLSTATE 57P03 to be recognized as startup error") } if isPostgresStartupError(errors.New("plain error")) { t.Fatal("plain error should not be recognized as startup error") } if isPostgresStartupError(&pq.Error{Code: "08006"}) { t.Fatal("non-57P03 postgres error should not be recognized as startup error") } } func TestUpdateHealthStateWithStartup(t *testing.T) { e := &Exporter{} e.updateHealthStateWithStartup(false, false, true) if e.Up() { t.Fatal("startup state should not be considered up") } if e.Recovery() { 
t.Fatal("startup state should not expose recovery=true") } if got := e.Status(); got != "starting" { t.Fatalf("status = %s, want starting", got) } e.updateHealthStateWithStartup(false, false, false) if got := e.Status(); got != "down" { t.Fatalf("status = %s, want down", got) } e.updateHealthStateWithStartup(true, true, false) if got := e.Status(); got != "replica" { t.Fatalf("status = %s, want replica", got) } e.updateHealthStateWithStartup(true, false, false) if got := e.Status(); got != "primary" { t.Fatalf("status = %s, want primary", got) } } ================================================ FILE: exporter/main.go ================================================ package exporter import ( "fmt" "net/http" "os" "os/signal" "sort" "time" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/client_golang/prometheus/promhttp" "github.com/prometheus/exporter-toolkit/web" ) // DryRun will explain all query fetched from configs func DryRun() { configs, err := LoadConfig(*configPath) if err != nil { logErrorf("fail loading config %s, %v", *configPath, err) os.Exit(1) } var queries []*Query for _, query := range configs { queries = append(queries, query) } sort.Slice(queries, func(i, j int) bool { return queries[i].Priority < queries[j].Priority }) for _, query := range queries { fmt.Println(query.Explain()) } fmt.Println() os.Exit(0) } // Reload will launch a new pg exporter instance func Reload() error { ReloadLock.Lock() defer ReloadLock.Unlock() logDebugf("reload request received, reloading configuration") if *configPath == "" { return fmt.Errorf("no valid config path") } queries, err := LoadConfig(*configPath) if err != nil { return fmt.Errorf("fail loading config %s: %w", *configPath, err) } target := PgExporter if target == nil { return fmt.Errorf("exporter unavailable") } if err := validateConstLabelConflicts(target.constLabels, queries, target.disableIntro); err != nil { return fmt.Errorf("invalid configuration with current constant labels: 
%w", err) } // Block scrapes while we swap the query set and invalidate plans. target.lock.Lock() defer target.lock.Unlock() target.queries = queries // Update queries for primary + discovered servers, and force re-plan on next scrape. servers := target.IterateServer() if target.server != nil { servers = append(servers, target.server) } for _, s := range servers { if s == nil { continue } s.lock.Lock() s.queries = queries s.Collectors = nil s.Planned = false s.ResetStats() s.lock.Unlock() } logInfof("server reloaded, %d queries applied", len(queries)) return nil } // Run pg_exporter func Run() { ParseArgs() // Clean up unsupported libpq environment variables that would cause panic // lib/pq driver does not support these PostgreSQL environment variables // and will panic if they are set. We clear them to ensure stable operation. // See: https://github.com/lib/pq/blob/master/conn.go#L2019 unsupportedEnvs := []string{ "PGSYSCONFDIR", // PostgreSQL system configuration directory "PGSERVICEFILE", // PostgreSQL connection service file "PGSERVICE", // PostgreSQL service name "PGLOCALEDIR", // PostgreSQL locale directory "PGREALM", // Kerberos realm } for _, env := range unsupportedEnvs { if val := os.Getenv(env); val != "" { logWarnf("clearing unsupported environment variable %s=%s (lib/pq limitation)", env, val) os.Unsetenv(env) } } // explain config only if *dryRun { DryRun() } if *configPath == "" { Logger.Error("no valid config path, exit") os.Exit(1) } if len(*webConfig.WebListenAddresses) == 0 { Logger.Error("invalid listen address", "addresses", *webConfig.WebListenAddresses) os.Exit(1) } listenAddr := (*webConfig.WebListenAddresses)[0] // Create exporter. It will connect on scrape and keep health probes running in background. 
var err error newExporter, err := NewExporter( *pgURL, WithConfig(*configPath), WithConstLabels(*constLabels), WithCacheDisabled(*disableCache), WithIntroDisabled(*disableIntro), WithFailFast(*failFast), WithNamespace(*exporterNamespace), WithAutoDiscovery(*autoDiscovery), WithExcludeDatabase(*excludeDatabase), WithIncludeDatabase(*includeDatabase), WithTags(*serverTags), WithConnectTimeout(*connectTimeout), ) if err != nil { logErrorf("fail creating pg_exporter: %s", err.Error()) os.Exit(2) } setCurrentExporter(newExporter) // trigger a manual planning before explain if *explainOnly { PgExporter.server.Plan() fmt.Println(PgExporter.Explain()) os.Exit(0) } prometheus.MustRegister(PgExporter) defer PgExporter.Close() // reload conf when receiving configured reload signals (SIGHUP, and SIGUSR1 on non-Windows) sigs := make(chan os.Signal, 1) signal.Notify(sigs, reloadSignals...) go func() { for sig := range sigs { logInfof("%v received, reloading", sig) if err := Reload(); err != nil { logErrorf("reload failed: %s", err.Error()) } } }() /* ================ REST API ================ */ // basic http.HandleFunc("/", TitleFunc) http.HandleFunc("/version", VersionFunc) // reload http.HandleFunc("/reload", ReloadFunc) // explain & stat http.HandleFunc("/stat", PgExporter.StatFunc) http.HandleFunc("/explain", PgExporter.ExplainFunc) // alive http.HandleFunc("/up", PgExporter.UpCheckFunc) http.HandleFunc("/read", PgExporter.UpCheckFunc) http.HandleFunc("/health", PgExporter.UpCheckFunc) http.HandleFunc("/liveness", PgExporter.UpCheckFunc) http.HandleFunc("/readiness", PgExporter.UpCheckFunc) // primary http.HandleFunc("/primary", PgExporter.PrimaryCheckFunc) http.HandleFunc("/leader", PgExporter.PrimaryCheckFunc) http.HandleFunc("/master", PgExporter.PrimaryCheckFunc) http.HandleFunc("/read-write", PgExporter.PrimaryCheckFunc) http.HandleFunc("/rw", PgExporter.PrimaryCheckFunc) // replica http.HandleFunc("/replica", PgExporter.ReplicaCheckFunc) http.HandleFunc("/standby", 
PgExporter.ReplicaCheckFunc) http.HandleFunc("/slave", PgExporter.ReplicaCheckFunc) http.HandleFunc("/read-only", PgExporter.ReplicaCheckFunc) http.HandleFunc("/ro", PgExporter.ReplicaCheckFunc) http.Handle(*metricPath, promhttp.Handler()) logInfof("pg_exporter for %s start, listen on %s%s", ShadowPGURL(*pgURL), listenAddr, *metricPath) srv := &http.Server{ ReadHeaderTimeout: 5 * time.Second, ReadTimeout: 10 * time.Second, WriteTimeout: 30 * time.Second, IdleTimeout: 2 * time.Minute, } if err := web.ListenAndServe(srv, webConfig, Logger); err != nil { logFatalf("http server failed: %s", err.Error()) } } ================================================ FILE: exporter/metrics_lifecycle_test.go ================================================ package exporter import ( "testing" "time" "github.com/prometheus/client_golang/prometheus" ) func makeCachedCollectorForServer(s *Server, name string, val float64) *Collector { q := makeGaugeQuery(name, 1) c := NewCollector(q, s) c.TTL = 3600 c.lastScrape = time.Now() metric := prometheus.MustNewConstMetric(c.descriptors["value"], prometheus.GaugeValue, val, "db") c.result = []prometheus.Metric{metric} c.err = nil return c } func TestExporterCollectAndInternalMetrics(t *testing.T) { primary := NewServer("postgresql://u:p@localhost:5432/postgres") primary.beforeScrape = func(s *Server) error { s.UP = true s.Version = 160000 s.Recovery = false return nil } primary.Planned = true primary.Collectors = []*Collector{makeCachedCollectorForServer(primary, "q_primary", 1)} primary.ResetStats() extra := NewServer("postgresql://u:p@localhost:5432/otherdb") extra.Forked = true extra.beforeScrape = func(s *Server) error { s.UP = true s.Version = 160000 s.Recovery = false return nil } extra.Planned = true extra.Collectors = []*Collector{makeCachedCollectorForServer(extra, "q_extra", 2)} extra.ResetStats() e := &Exporter{ server: primary, servers: map[string]*Server{"otherdb": extra}, } e.setupInternalMetrics() ch := make(chan 
prometheus.Metric, 256) e.Collect(ch) if !e.Up() { t.Fatal("Exporter should be UP after successful collect") } if e.Status() != "primary" { t.Fatalf("Exporter status = %s, want primary", e.Status()) } } func TestExporterDescribeAndCloseNoPanic(t *testing.T) { s := NewServer("postgresql://u:p@localhost:5432/postgres") s.beforeScrape = func(s *Server) error { s.UP = true return nil } s.Planned = true s.Collectors = []*Collector{makeCachedCollectorForServer(s, "q", 1)} s.ResetStats() e := &Exporter{ server: s, servers: map[string]*Server{}, } e.setupInternalMetrics() descCh := make(chan *prometheus.Desc, 32) e.Describe(descCh) if len(descCh) != 0 { t.Fatalf("Describe should not emit descriptors for a dynamic/unchecked exporter, got %d", len(descCh)) } // server DB pointers are nil in this synthetic test; Close should not panic. e.Close() } func TestDisableIntroSuppressesInternalMetrics(t *testing.T) { s := NewServer("postgresql://u:p@localhost:5432/postgres") s.beforeScrape = func(s *Server) error { s.UP = true s.Version = 160000 s.Recovery = false return nil } s.Planned = true s.Collectors = []*Collector{makeCachedCollectorForServer(s, "q", 1)} s.ResetStats() e := &Exporter{ server: s, servers: map[string]*Server{}, disableIntro: true, } e.setupInternalMetrics() r := prometheus.NewRegistry() if err := r.Register(e); err != nil { t.Fatalf("register exporter failed: %v", err) } mfs, err := r.Gather() if err != nil { t.Fatalf("Gather failed: %v", err) } found := map[string]bool{} for _, mf := range mfs { found[mf.GetName()] = true } if !found["q_value"] { t.Fatalf("expected query metric q_value to be present, got: %#v", found) } // Default internal metrics namespace for postgres is "pg". 
if found["pg_up"] || found["pg_version"] || found["pg_in_recovery"] || found["pg_exporter_build_info"] || found["pg_exporter_up"] { t.Fatalf("disable-intro should suppress internal metrics, got: %#v", found) } } func TestServerIntrospectionHelpers(t *testing.T) { s := NewServer("postgresql://u:p@localhost:5432/postgres") c := makeCachedCollectorForServer(s, "q", 1) s.Collectors = []*Collector{c} s.ResetStats() if s.Error() != nil { t.Fatalf("new server Error should be nil, got %v", s.Error()) } if got := s.Duration(); got != 0 { t.Fatalf("new server Duration = %v, want 0", got) } if got := s.Uptime(); got < 0 { t.Fatalf("Uptime should be non-negative, got %v", got) } if got := c.ResultSize(); got != 1 { t.Fatalf("collector ResultSize = %d, want 1", got) } if skip, _ := c.PredicateSkip(); skip { t.Fatal("collector PredicateSkip should be false by default") } if got := c.Duration(); got != 0 { t.Fatalf("collector Duration = %v, want 0", got) } if exp := s.Explain(); exp == "" { t.Fatal("Explain should not be empty") } if html := s.ExplainHTML(); html == "" { t.Fatal("ExplainHTML should not be empty") } } ================================================ FILE: exporter/pgurl.go ================================================ package exporter import ( "net/url" "os" "strings" ) // GetPGURL will retrieve, parse, modify postgres connection string func GetPGURL() string { return ProcessPGURL(RetrievePGURL()) } // RetrievePGURL retrieve pg target url from multiple sources according to precedence // priority: cli-args > env > env file path // 1. Command Line Argument (--url -u -d) // 2. Environment PG_EXPORTER_URL // 3. From file specified via Environment PG_EXPORTER_URL_FILE // 4. Default url // // The default URL intentionally targets local libpq defaults. This is a // local-first behavior for on-host deployments, where pg_exporter usually // runs on the same machine as PostgreSQL/PgBouncer. 
func RetrievePGURL() (res string) {
	// command line args (--url / -u / -d) take the highest precedence
	if *pgURL != "" {
		logInfof("retrieve target url %s from command line", ShadowPGURL(*pgURL))
		return *pgURL
	}
	// env PG_EXPORTER_URL
	if res = os.Getenv("PG_EXPORTER_URL"); res != "" {
		logInfof("retrieve target url %s from PG_EXPORTER_URL", ShadowPGURL(res))
		return res
	}
	// env PGURL: an additional fallback checked between PG_EXPORTER_URL and
	// PG_EXPORTER_URL_FILE (note: not listed in the doc comment's 4-step
	// precedence above — the effective order is CLI > PG_EXPORTER_URL > PGURL > file > default)
	if res = os.Getenv("PGURL"); res != "" {
		logInfof("retrieve target url %s from PGURL", ShadowPGURL(res))
		return res
	}
	// file content from file PG_EXPORTER_URL_FILE
	// An unreadable file is fatal: a file was explicitly specified, so silently
	// falling through to the default would mask a misconfiguration.
	if filename := os.Getenv("PG_EXPORTER_URL_FILE"); filename != "" {
		if fileContents, err := os.ReadFile(filename); err != nil {
			logFatalf("PG_EXPORTER_URL_FILE=%s is specified, fail loading url: %s", filename, err.Error())
		} else {
			res = strings.TrimSpace(string(fileContents))
			logInfof("retrieve target url %s from PG_EXPORTER_URL_FILE", ShadowPGURL(res))
			return res
		}
	}
	// DEFAULT: nothing else matched, fall back to the built-in local-first URL
	logWarnf("fail retrieving target url, fallback on default url: %s", defaultPGURL)
	return defaultPGURL
}

// ProcessPGURL will fix URL with default options.
//
// Design decision:
// If sslmode is omitted, force sslmode=disable. pg_exporter is typically
// deployed as an on-host/local exporter, where TLS on loopback adds overhead
// without meaningful security benefit. Users can always override by passing an
// explicit sslmode in the URL.
//
// Returns "" when pgurl cannot be parsed as a URL.
func ProcessPGURL(pgurl string) string {
	u, err := url.Parse(pgurl)
	if err != nil {
		logErrorf("invalid url format %s", pgurl)
		return ""
	}
	// add sslmode = disable if not exists
	qs := u.Query()
	if sslmode := qs.Get(`sslmode`); sslmode == "" {
		qs.Set(`sslmode`, `disable`)
	}
	// Re-encoding normalizes/percent-encodes the query string; other existing
	// parameters are preserved (their decoded values are re-encoded).
	u.RawQuery = qs.Encode()
	return u.String()
}

// ShadowPGURL will hide password part of dsn
func ShadowPGURL(pgurl string) string {
	parsedURL, err := url.Parse(pgurl)
	// That means we got a bad connection string. Fail early
	if err != nil {
		logFatalf("Could not parse connection string %s", err.Error())
	}
	// We need to handle two cases:
	// 1. The password is in the format postgresql://localhost:5432/postgres?sslmode=disable&user=&password=
	// 2. The password is in the format postgresql://:@localhost:5432/postgres?sslmode=disable
	// Case 1 is handled by masking every query parameter named "password"
	// (case-insensitive) below; case 2 is handled by url.URL.Redacted, which
	// replaces the userinfo password with "xxxxx".
	qs := parsedURL.Query()
	for k, values := range qs {
		if strings.EqualFold(k, "password") {
			for i := range values {
				values[i] = "xxxxx"
			}
			qs[k] = values
		}
	}
	parsedURL.RawQuery = qs.Encode()
	return parsedURL.Redacted()
}

// ParseDatname extract database name part of a pgurl
// The URL path (e.g. /postgres) takes precedence over a dbname=... query
// parameter; returns "" when neither is present or the URL fails to parse.
func ParseDatname(pgurl string) string {
	u, err := url.Parse(pgurl)
	if err != nil {
		return ""
	}
	if datname := strings.TrimLeft(u.Path, "/"); datname != "" {
		return datname
	}
	if datname := strings.TrimSpace(u.Query().Get("dbname")); datname != "" {
		return datname
	}
	return ""
}

// ReplaceDatname will replace pgurl with new database name
// If the URL has no path component but carries a non-empty dbname=... query
// parameter, the parameter is rewritten; otherwise the path is set to /datname.
// Returns "" when pgurl cannot be parsed.
func ReplaceDatname(pgurl, datname string) string {
	u, err := url.Parse(pgurl)
	if err != nil {
		logErrorf("invalid url format %s", pgurl)
		return ""
	}
	if strings.TrimLeft(u.Path, "/") == "" {
		qs := u.Query()
		if qs.Get("dbname") != "" {
			qs.Set("dbname", datname)
			u.RawQuery = qs.Encode()
			return u.String()
		}
	}
	u.Path = "/" + datname
	return u.String()
}


================================================
FILE: exporter/pgurl_test.go
================================================
package exporter

import (
	"net/url"
	"os"
	"path/filepath"
	"testing"
)

// TestProcessPGURLKeepsEncodedQueryValues verifies that re-encoding the query
// string in ProcessPGURL round-trips percent-encoded values (a%26b -> a&b).
func TestProcessPGURLKeepsEncodedQueryValues(t *testing.T) {
	input := "postgresql://user:pass@localhost:5432/postgres?application_name=a%26b&password=p%3Dq"
	output := ProcessPGURL(input)
	if output == "" {
		t.Fatalf("ProcessPGURL returned empty output")
	}
	parsed, err := url.Parse(output)
	if err != nil {
		t.Fatalf("failed to parse output URL: %v", err)
	}
	qs := parsed.Query()
	if got := qs.Get("application_name"); got != "a&b" {
		t.Fatalf("application_name = %q, want %q", got, "a&b")
	}
	if got := qs.Get("password"); got != "p=q" {
		t.Fatalf("password = %q, want %q", got, "p=q")
	}
	if got := qs.Get("sslmode"); got != "disable" {
		t.Fatalf("sslmode = %q, want %q", got, "disable")
	}
}

// TestShadowPGURLRedactsQueryPassword verifies that a password passed as a
// query parameter (not userinfo) is masked with "xxxxx".
func TestShadowPGURLRedactsQueryPassword(t *testing.T) {
	input := "postgresql://user:pass@localhost:5432/postgres?password=p%26q%3D1&application_name=test"
	output := ShadowPGURL(input)
	parsed, err := url.Parse(output)
	if err != nil {
		t.Fatalf("failed to parse redacted URL: %v", err)
	}
	if got := parsed.Query().Get("password"); got != "xxxxx" {
		t.Fatalf("password = %q, want %q", got, "xxxxx")
	}
}

// TestParseDatnameAndReplaceDatname exercises both path-based and
// dbname-query-parameter-based database names, and their replacement.
func TestParseDatnameAndReplaceDatname(t *testing.T) {
	src := "postgresql://user:pass@localhost:5432/postgres?sslmode=disable"
	if got := ParseDatname(src); got != "postgres" {
		t.Fatalf("ParseDatname = %q, want %q", got, "postgres")
	}
	replaced := ReplaceDatname(src, "otherdb")
	if got := ParseDatname(replaced); got != "otherdb" {
		t.Fatalf("ParseDatname(replaced) = %q, want %q", got, "otherdb")
	}
	srcWithDbname := "postgresql://user:pass@localhost:5432?sslmode=disable&dbname=pgbouncer"
	if got := ParseDatname(srcWithDbname); got != "pgbouncer" {
		t.Fatalf("ParseDatname(dbname=) = %q, want %q", got, "pgbouncer")
	}
	replacedDbname := ReplaceDatname(srcWithDbname, "postgres")
	if got := ParseDatname(replacedDbname); got != "postgres" {
		t.Fatalf("ParseDatname(replaced dbname=) = %q, want %q", got, "postgres")
	}
}

// TestRetrievePGURLPriority walks the full source-precedence chain:
// CLI flag > PG_EXPORTER_URL > PGURL > PG_EXPORTER_URL_FILE > default.
// Original CLI/env state is saved up front and restored via t.Cleanup.
func TestRetrievePGURLPriority(t *testing.T) {
	originPGURL := *pgURL
	*pgURL = ""
	t.Cleanup(func() { *pgURL = originPGURL })
	originExporterURL := os.Getenv("PG_EXPORTER_URL")
	originPGURLenv := os.Getenv("PGURL")
	originFile := os.Getenv("PG_EXPORTER_URL_FILE")
	t.Cleanup(func() {
		_ = os.Setenv("PG_EXPORTER_URL", originExporterURL)
		_ = os.Setenv("PGURL", originPGURLenv)
		_ = os.Setenv("PG_EXPORTER_URL_FILE", originFile)
	})
	_ = os.Setenv("PG_EXPORTER_URL", "postgresql://env-user:env-pass@localhost:5432/envdb")
	_ = os.Setenv("PGURL", "postgresql://pgurl-user:pgurl-pass@localhost:5432/pgurldb")
	*pgURL = "postgresql://cli-user:cli-pass@localhost:5432/clidb"
	if got := RetrievePGURL(); got != *pgURL {
		t.Fatalf("RetrievePGURL CLI precedence failed: got %s", got)
	}
	*pgURL = ""
	if got := RetrievePGURL(); got != os.Getenv("PG_EXPORTER_URL") {
		t.Fatalf("RetrievePGURL env precedence failed: got %s", got)
	}
	_ = os.Unsetenv("PG_EXPORTER_URL")
	if got := RetrievePGURL(); got != os.Getenv("PGURL") {
		t.Fatalf("RetrievePGURL PGURL fallback failed: got %s", got)
	}
	_ = os.Unsetenv("PGURL")
	file := filepath.Join(t.TempDir(), "dsn.txt")
	fileURL := "postgresql://file-user:file-pass@localhost:5432/filedb"
	if err := os.WriteFile(file, []byte(fileURL), 0o644); err != nil {
		t.Fatalf("write dsn file failed: %v", err)
	}
	_ = os.Setenv("PG_EXPORTER_URL_FILE", file)
	if got := RetrievePGURL(); got != fileURL {
		t.Fatalf("RetrievePGURL file fallback failed: got %s", got)
	}
	_ = os.Unsetenv("PG_EXPORTER_URL_FILE")
	if got := RetrievePGURL(); got != defaultPGURL {
		t.Fatalf("RetrievePGURL default fallback failed: got %s", got)
	}
}


================================================
FILE: exporter/predicate_cache_test.go
================================================
package exporter

import (
	"context"
	"testing"
	"time"
)

// TestPredicateCacheHitSkipsDBQuery proves that a fresh (TTL-valid) cache
// entry is honored without issuing any SQL: the server's DB handle is nil,
// so any QueryContext call would panic and fail the test.
func TestPredicateCacheHitSkipsDBQuery(t *testing.T) {
	q := &Query{
		Name:   "q",
		Branch: "q",
		PredicateQueries: []PredicateQuery{
			{Name: "p1", SQL: "SELECT true", TTL: 10},
		},
	}
	s := &Server{Database: "postgres"} // DB is nil; any QueryContext call would panic.
	c := NewCollector(q, s)
	now := time.Now()
	c.scrapeBegin = now
	// Cache hit: pass=true should continue and ultimately return true without touching DB.
	c.predicateCache[0] = predicateCacheEntry{at: now.Add(-time.Second), pass: true}
	if ok := c.executePredicateQueries(context.Background()); !ok {
		t.Fatal("expected cached pass=true to allow query execution")
	}
	// Cache hit: pass=false should return false without touching DB.
	c.scrapeBegin = now
	c.predicateCache[0] = predicateCacheEntry{at: now.Add(-time.Second), pass: false}
	if ok := c.executePredicateQueries(context.Background()); ok {
		t.Fatal("expected cached pass=false to skip query execution")
	}
}

// TestPredicateCacheMissTriggersDBQuery proves that without a cache entry the
// predicate query is executed: the nil DB makes the execution path panic,
// which the deferred recover treats as success.
func TestPredicateCacheMissTriggersDBQuery(t *testing.T) {
	q := &Query{
		Name:   "q",
		Branch: "q",
		PredicateQueries: []PredicateQuery{
			{Name: "p1", SQL: "SELECT true", TTL: 10},
		},
	}
	s := &Server{Database: "postgres"} // DB is nil; QueryContext must panic if called.
	c := NewCollector(q, s)
	c.scrapeBegin = time.Now()
	defer func() {
		if r := recover(); r == nil {
			t.Fatal("expected panic due to DB access on predicate cache miss")
		}
	}()
	_ = c.executePredicateQueries(context.Background())
}

// TestPredicateCacheDisabledByTTLZero proves that TTL=0 disables caching:
// even with a fresh cached entry present, the DB must be queried (and thus
// panic on the nil handle).
func TestPredicateCacheDisabledByTTLZero(t *testing.T) {
	q := &Query{
		Name:   "q",
		Branch: "q",
		PredicateQueries: []PredicateQuery{
			{Name: "p1", SQL: "SELECT true", TTL: 0},
		},
	}
	s := &Server{Database: "postgres"} // DB is nil; QueryContext must panic if called.
	c := NewCollector(q, s)
	c.scrapeBegin = time.Now()
	c.predicateCache[0] = predicateCacheEntry{at: time.Now(), pass: true}
	defer func() {
		if r := recover(); r == nil {
			t.Fatal("expected panic due to DB access when predicate TTL is 0 (cache disabled)")
		}
	}()
	_ = c.executePredicateQueries(context.Background())
}


================================================
FILE: exporter/probehealth_pgbouncer_test.go
================================================
package exporter

import (
	"context"
	"database/sql"
	"database/sql/driver"
	"errors"
	"io"
	"testing"
)

// Sentinel error returned by the fake driver's Ping so the test can detect
// any (forbidden) PingContext usage.
var errProbeHealthTestPingCalled = errors.New("ping called")

// probeHealthTestDriver is a tiny database/sql driver used to verify that
// ProbeHealth in pgbouncer mode does not use db.PingContext (lib/pq Ping uses
// a ";" query which PgBouncer rejects). If Ping is called we return a sentinel
// error to fail the test.
type probeHealthTestDriver struct{} func (d probeHealthTestDriver) Open(name string) (driver.Conn, error) { return &probeHealthTestConn{}, nil } type probeHealthTestConn struct{} func (c *probeHealthTestConn) Prepare(query string) (driver.Stmt, error) { return nil, errors.New("prepare not supported") } func (c *probeHealthTestConn) Close() error { return nil } func (c *probeHealthTestConn) Begin() (driver.Tx, error) { return nil, errors.New("tx not supported") } func (c *probeHealthTestConn) Ping(ctx context.Context) error { return errProbeHealthTestPingCalled } func (c *probeHealthTestConn) QueryContext(ctx context.Context, query string, args []driver.NamedValue) (driver.Rows, error) { // Return an empty resultset; database/sql will surface this as sql.ErrNoRows // to QueryRowContext, which ProbeHealth should treat as a successful probe // for PgBouncer (SHOW VERSION may return via NOTICE only). return &probeHealthTestRows{}, nil } type probeHealthTestRows struct{} func (r *probeHealthTestRows) Columns() []string { return []string{"version"} } func (r *probeHealthTestRows) Close() error { return nil } func (r *probeHealthTestRows) Next(dest []driver.Value) error { return io.EOF } func init() { sql.Register("probehealth_test", probeHealthTestDriver{}) } func TestProbeHealthPgbouncerDoesNotPingAndTreatsNoRowsAsUp(t *testing.T) { db, err := sql.Open("probehealth_test", "") if err != nil { t.Fatalf("sql.Open: %v", err) } defer db.Close() s := &Server{ DB: db, PgbouncerMode: true, ConnectTimeout: 500, } up, recovery, starting, err := s.ProbeHealth() if err != nil { t.Fatalf("ProbeHealth error = %v", err) } if !up { t.Fatalf("up = %v, want true", up) } if recovery { t.Fatalf("recovery = %v, want false", recovery) } if starting { t.Fatalf("starting = %v, want false", starting) } } ================================================ FILE: exporter/prom_validate.go ================================================ package exporter import ( "fmt" "strings" 
"github.com/prometheus/common/model" ) func validatePromLabelName(name string) error { if name == "" { return fmt.Errorf("empty label name") } if strings.HasPrefix(name, model.ReservedLabelPrefix) { return fmt.Errorf("label name %q uses reserved prefix %q", name, model.ReservedLabelPrefix) } if !model.LegacyValidation.IsValidLabelName(name) { return fmt.Errorf("invalid label name %q", name) } return nil } func validatePromMetricName(name string) error { if name == "" { return fmt.Errorf("empty metric name") } if !model.LegacyValidation.IsValidMetricName(name) { return fmt.Errorf("invalid metric name %q", name) } return nil } ================================================ FILE: exporter/query.go ================================================ package exporter import ( "bytes" "fmt" htmltmpl "html/template" "slices" texttmpl "text/template" "time" "gopkg.in/yaml.v3" ) /* ================ Query ================ */ // Query hold the information of how to fetch metric and parse them type Query struct { Name string `yaml:"name,omitempty"` // actual query name, used as metric prefix Desc string `yaml:"desc,omitempty"` // description of this metric query SQL string `yaml:"query"` // SQL command to fetch metrics PredicateQueries []PredicateQuery `yaml:"predicate_queries,omitempty"` // SQL command to filter metrics Branch string `yaml:"-"` // branch name, top layer key of config file // control query behaviour Tags []string `yaml:"tags,omitempty"` // tags are used for execution control TTL float64 `yaml:"ttl,omitempty"` // caching ttl in seconds Timeout float64 `yaml:"timeout,omitempty"` // query execution timeout in seconds Priority int `yaml:"priority,omitempty"` // execution priority, from 1 to 999 MinVersion int `yaml:"min_version,omitempty"` // minimal supported version, include MaxVersion int `yaml:"max_version,omitempty"` // maximal supported version, not include Fatal bool `yaml:"fatal,omitempty"` // if query marked fatal fail, entire scrape will fail Skip bool 
`yaml:"skip,omitempty"` // if query marked skip, it will be omit while loading Metrics []map[string]*Column `yaml:"metrics"` // metric definition list // metrics parsing auxiliaries Path string `yaml:"-"` // where am I from ? Columns map[string]*Column `yaml:"-"` // column map ColumnNames []string `yaml:"-"` // column names in origin orders LabelNames []string `yaml:"-"` // column (name) that used as label, sequences matters MetricNames []string `yaml:"-"` // column (name) that used as metric } // A PredicateQuery is a query that returns a 1-column resultset that's used to decide whether // to run the main query. type PredicateQuery struct { Name string `yaml:"name,omitempty"` // predicate query name, only used for logging SQL string `yaml:"predicate_query"` // SQL command to return a predicate TTL float64 `yaml:"ttl,omitempty"` // How long to cache results for } var queryTemplate, _ = texttmpl.New("Query").Parse(`## # SYNOPSIS # {{ .Name }}{{ if ne .Name .Branch }}.{{ .Branch }}{{ end }}_* # # DESCRIPTION # {{ with .Desc }}{{ . }}{{ else }}N/A{{ end }} # # OPTIONS # Tags [{{ range $i, $e := .Tags }}{{ if $i }}, {{ end }}{{ $e }}{{ end }}] # TTL {{ .TTL }} # Priority {{ .Priority }} # Timeout {{ .TimeoutDuration }} # Fatal {{ .Fatal }} # Version {{ if ne .MinVersion 0 }}{{ .MinVersion }}{{ else }}lower{{ end }} ~ {{ if ne .MaxVersion 0 }}{{ .MaxVersion }}{{ else }}higher{{ end }} # Source {{ .Path }} # # METRICS {{- range .ColumnList }} # {{ .Name }} ({{ .Usage }}) # {{ with .Desc }}{{ . }}{{ else }}N/A{{ end }}{{ end }} # {{.MarshalYAML -}} `) var htmlTemplate, _ = htmltmpl.New("Query").Parse(`

{{ .Name }}

{{ .Desc }}

{{ if len .PredicateQueries }}

Predicate queries

{{ range .PredicateQueries }} {{ end }}
Name SQL Cache TTL
{{ .Name }}{{ .SQL }}{{if ne .TTL 0}}{{ .TTL }}s{{else}}not cached{{end}}
{{ end }}

Query

{{ .SQL }}

Attribution

Branch {{ .Branch }}
TTL {{ .TTL }}
Priority {{ .Priority }}
Timeout {{ .TimeoutDuration }}
Fatal {{ .Fatal }}
Version {{if ne .MinVersion 0}}{{ .MinVersion }}{{else}}lower{{end}} ~ {{if ne .MaxVersion 0}}{{ .MaxVersion }}{{else}}higher{{end}}
Tags {{ .Tags }}
Source {{ .Path }}

Columns

{{ range .ColumnList }}{{ end }}
Name Usage Rename Bucket Scale Default Description
{{ .Name }}{{ .Usage }}{{ .Rename }}{{ .Bucket }}{{ .Scale }}{{ .Default }}{{ .Desc }}

Metrics

{{ range .MetricList }}{{ end }}
Name Usage Desc
{{ .Name }}{{ .Column.Usage }}{{ .Column.Desc }}
`) // MarshalYAML will turn query into YAML format func (q *Query) MarshalYAML() string { // buf := new(bytes.Buffer) v := make(map[string]Query, 1) v[q.Branch] = *q buf, err := yaml.Marshal(v) if err != nil { msg := fmt.Sprintf("fail to marshall query yaml: %s", err.Error()) logError(msg) return msg } return string(buf) } // Explain will turn query into text format func (q *Query) Explain() string { buf := new(bytes.Buffer) err := queryTemplate.Execute(buf, q) if err != nil { msg := fmt.Sprintf("fail to explain query: %s", err.Error()) logError(msg) return msg } return buf.String() } // HTML will turn Query into HTML format func (q *Query) HTML() string { buf := new(bytes.Buffer) err := htmlTemplate.Execute(buf, q) if err != nil { msg := fmt.Sprintf("fail to generate query html: %s", err.Error()) logError(msg) return msg } return buf.String() } // HasTag tells whether this query have specific tag // since only few tags is provided, we don't really need a map here func (q *Query) HasTag(tag string) bool { return slices.Contains(q.Tags, tag) } // ColumnList return ordered column list func (q *Query) ColumnList() (res []*Column) { res = make([]*Column, len(q.ColumnNames)) for i, colName := range q.ColumnNames { res[i] = q.Columns[colName] } return } // LabelList returns a list of label column names func (q *Query) LabelList() []string { labelNames := make([]string, len(q.LabelNames)) for i, labelName := range q.LabelNames { labelColumn := q.Columns[labelName] if labelColumn.Rename != "" { labelNames[i] = labelColumn.Rename } else { labelNames[i] = labelColumn.Name } } return labelNames } // MetricList returns a list of MetricDesc generated by this query func (q *Query) MetricList() (res []*MetricDesc) { res = make([]*MetricDesc, len(q.MetricNames)) for i, metricName := range q.MetricNames { column := q.Columns[metricName] res[i] = column.MetricDesc(q.Name, q.LabelList()) } return } // TimeoutDuration will turn timeout settings into time.Duration func (q *Query) 
TimeoutDuration() time.Duration { return time.Duration(float64(time.Second) * q.Timeout) } ================================================ FILE: exporter/query_column_test.go ================================================ package exporter import ( "strings" "testing" "time" "github.com/prometheus/client_golang/prometheus" ) func makeSampleQuery() *Query { return &Query{ Name: "sample", Branch: "sample_branch", Desc: "sample query", Tags: []string{"tag1", "tag2"}, Timeout: 1.5, Columns: map[string]*Column{ "datname": { Name: "datname", Usage: LABEL, Rename: "db", Desc: "database name", }, "value": { Name: "value", Usage: GAUGE, Rename: "val", Desc: "metric value", }, }, ColumnNames: []string{"datname", "value"}, LabelNames: []string{"datname"}, MetricNames: []string{"value"}, Metrics: []map[string]*Column{ {"datname": {Name: "datname", Usage: LABEL, Rename: "db", Desc: "database name"}}, {"value": {Name: "value", Usage: GAUGE, Rename: "val", Desc: "metric value"}}, }, } } func TestColumnPrometheusValueType(t *testing.T) { cGauge := &Column{Name: "g", Usage: GAUGE} if got := cGauge.PrometheusValueType(); got != prometheus.GaugeValue { t.Fatalf("gauge type = %v, want GaugeValue", got) } cCounter := &Column{Name: "c", Usage: COUNTER} if got := cCounter.PrometheusValueType(); got != prometheus.CounterValue { t.Fatalf("counter type = %v, want CounterValue", got) } defer func() { if r := recover(); r == nil { t.Fatal("PrometheusValueType should panic for non-value usage") } }() _ = (&Column{Name: "x", Usage: LABEL}).PrometheusValueType() } func TestColumnAndMetricDescString(t *testing.T) { c := &Column{Name: "value", Usage: GAUGE, Desc: "desc"} if !strings.Contains(c.String(), "value") { t.Fatalf("column string does not contain name: %s", c.String()) } md := c.MetricDesc("sample", []string{"db"}) if !strings.Contains(md.Name, "sample_value") { t.Fatalf("metric desc name = %s", md.Name) } if !strings.Contains(md.Name, "{db}") { t.Fatalf("metric desc should contain 
labels signature, got %s", md.Name) } if !strings.Contains(md.String(), "desc") { t.Fatalf("metric desc string = %s", md.String()) } } func TestQueryHelpersAndRender(t *testing.T) { q := makeSampleQuery() if !q.HasTag("tag1") || q.HasTag("missing") { t.Fatalf("HasTag result unexpected for tags %v", q.Tags) } cols := q.ColumnList() if len(cols) != 2 || cols[0].Name != "datname" || cols[1].Name != "value" { t.Fatalf("ColumnList order unexpected: %#v", cols) } labels := q.LabelList() if len(labels) != 1 || labels[0] != "db" { t.Fatalf("LabelList = %#v, want [db]", labels) } metrics := q.MetricList() if len(metrics) != 1 { t.Fatalf("MetricList len = %d, want 1", len(metrics)) } if !strings.Contains(metrics[0].Name, "sample_val") { t.Fatalf("MetricList name = %s", metrics[0].Name) } if got := q.TimeoutDuration(); got != 1500*time.Millisecond { t.Fatalf("TimeoutDuration = %v, want 1500ms", got) } yaml := q.MarshalYAML() if !strings.Contains(yaml, "sample_branch:") { t.Fatalf("MarshalYAML missing branch key: %s", yaml) } explain := q.Explain() if !strings.Contains(explain, "SYNOPSIS") { t.Fatalf("Explain output unexpected: %s", explain) } html := q.HTML() if !strings.Contains(html, "

sample

") { t.Fatalf("HTML output unexpected: %s", html) } } ================================================ FILE: exporter/reload_signals_unix.go ================================================ //go:build !windows package exporter import ( "os" "syscall" ) var reloadSignals = []os.Signal{syscall.SIGHUP, syscall.SIGUSR1} ================================================ FILE: exporter/reload_signals_windows.go ================================================ //go:build windows package exporter import ( "os" "syscall" ) var reloadSignals = []os.Signal{syscall.SIGHUP} ================================================ FILE: exporter/reload_test.go ================================================ package exporter import ( "os" "path/filepath" "testing" ) func TestReloadUpdatesQueriesInPlace(t *testing.T) { originExporter := PgExporter t.Cleanup(func() { setCurrentExporter(originExporter) }) originConfigPath := *configPath t.Cleanup(func() { *configPath = originConfigPath }) // Seed exporter with an initial query set and a planned server. s := NewServer("postgresql://u:p@localhost:5432/postgres") s.beforeScrape = func(s *Server) error { return nil } s.Planned = true s.queries = map[string]*Query{"old": makeGaugeQuery("old", 1)} s.Collectors = []*Collector{NewCollector(makeGaugeQuery("old", 1), s)} s.ResetStats() e := &Exporter{ server: s, servers: map[string]*Server{}, queries: s.queries, } setCurrentExporter(e) // Write a new config and reload it. 
dir := t.TempDir() cfgPath := filepath.Join(dir, "pg_exporter.yml") cfg := ` q_new: query: SELECT 1 AS value, 'db' AS datname metrics: - datname: usage: label description: db - value: usage: gauge description: value ` if err := os.WriteFile(cfgPath, []byte(cfg), 0o644); err != nil { t.Fatalf("write config failed: %v", err) } *configPath = cfgPath if err := Reload(); err != nil { t.Fatalf("Reload failed: %v", err) } if _, ok := e.queries["q_new"]; !ok { t.Fatalf("expected new query to be loaded, got: %#v", e.queries) } if e.server.queries == nil || e.server.queries["q_new"] == nil { t.Fatalf("server queries not updated, got: %#v", e.server.queries) } if e.server.Planned { t.Fatalf("server should be marked unplanned after reload") } if e.server.Collectors != nil { t.Fatalf("server collectors should be cleared after reload") } } ================================================ FILE: exporter/server.go ================================================ package exporter import ( "bytes" "context" "database/sql" "errors" "fmt" "regexp" "sort" "strconv" "strings" "sync" "time" "github.com/lib/pq" "github.com/prometheus/client_golang/prometheus" ) /* ================ Const ================ */ const connMaxLifeTime = 1 * time.Minute // close connection after 1 minute to avoid conn leak const pgSQLStateCannotConnectNow = "57P03" /* ================ Server ================ */ var semverRe = regexp.MustCompile(`(\d+)\.(\d+)\.(\d+)`) // Server represent a postgres connection, with additional fact, conf, runtime info type Server struct { *sql.DB // database instance (do not close this due to the stupid implementation in database/sql) dsn string // data source name lock sync.RWMutex // server scrape lock err error // last error // notice handling (primarily for pgbouncer, where SHOW VERSION may return via NOTICE) noticeMu sync.Mutex lastNotice string // hooks beforeScrape func(s *Server) error // hook: execute before scrape onDatabaseChange func(change map[string]bool) // hook: 
invoke when database list is changed // postgres fact gather from server UP bool // indicate whether target server is connectable Recovery bool // is server in recovering Version int // pg server version num Database string // database name of current server connection Username string // current username Databases map[string]bool // all available database in target cluster dblistLock sync.Mutex // lock when access Databases map Namespaces map[string]bool // all available schema in target cluster Extensions map[string]bool // all available extension in target cluster Tags []string // server tags set by cli arg --tag PgbouncerMode bool // indicate it is a pgbouncer server DisableCache bool // force executing, ignoring caching policy ExcludeDbnames []string // if ExcludeDbnames is provided, Auto Database Discovery is enabled Forked bool // is this a forked server ? (does not run cluster level query) Planned bool // if false, server will trigger a plan before collect MaxConn int // max connection for this server ConnectTimeout int // connect timeout for this server in ms ConnMaxLifetime int // connection max lifetime for this server in seconds // query Collectors []*Collector // query collector instance (installed query) queries map[string]*Query // queries map, keys are config file top layer key labels prometheus.Labels // constant labels // internal stats serverInit time.Time // server init timestamp scrapeBegin time.Time // server last scrape begin time scrapeDone time.Time // server last scrape done time errorCount float64 // total scrape error count on this server (fatal scrape failures only) totalCount float64 // total scrape count on this server totalTime float64 // total time spend on scraping queryCacheTTL map[string]float64 // internal query metrics: cache time to live queryScrapeTotalCount map[string]float64 // internal query metrics: total executed queryScrapeHitCount map[string]float64 // internal query metrics: times serving from hit cache 
queryScrapeErrorCount map[string]float64 // internal query metrics: times failed queryScrapePredicateSkipCount map[string]float64 // internal query metrics: times skipped due to predicate queryScrapeMetricCount map[string]float64 // internal query metrics: number of metrics scraped queryScrapeDuration map[string]float64 // internal query metrics: time spend on executing } func (s *Server) GetConnectTimeout() time.Duration { if s.ConnectTimeout <= 0 { return 100 * time.Millisecond } return time.Duration(s.ConnectTimeout) * time.Millisecond } // Name is coalesce(s.Database, dsn) func (s *Server) Name() string { if s.Database != "" { return s.Database } return ShadowPGURL(s.dsn) } func (s *Server) Error() error { return s.err } // Check will issue a connection and executing precheck hook function func (s *Server) Check() error { s.lock.Lock() defer s.lock.Unlock() return s.beforeScrape(s) } // ProbeHealth performs a lightweight probe for exporter health checks. // It returns whether the database is reachable, whether it's in recovery, // and whether PostgreSQL is still starting up (SQLSTATE 57P03). func (s *Server) ProbeHealth() (up, recovery, starting bool, err error) { // Snapshot pointers/flags quickly under lock to avoid races with first-time DB init. s.lock.RLock() db := s.DB pgbouncerMode := s.PgbouncerMode timeout := s.GetConnectTimeout() s.lock.RUnlock() if db == nil { return false, false, false, errors.New("database connection is not initialized") } ctx, cancel := context.WithTimeout(context.Background(), timeout) defer cancel() if pgbouncerMode { // IMPORTANT: do NOT use db.PingContext() for PgBouncer. // // lib/pq implements Ping() by sending a simpleQuery(";") (empty command). // PostgreSQL tolerates that, but PgBouncer rejects it with: // invalid command ';', use SHOW HELP; // This becomes a log-spam regression once ProbeHealth is run periodically. // // For health probes we only need a cheap PgBouncer admin command. 
// Some PgBouncer versions return SHOW VERSION via NOTICE (no result rows), // so treat sql.ErrNoRows as success. var dummy string qerr := db.QueryRowContext(ctx, `SHOW VERSION;`).Scan(&dummy) if qerr != nil && !errors.Is(qerr, sql.ErrNoRows) { return false, false, false, qerr } return true, false, false, nil } if err = db.QueryRowContext(ctx, `SELECT pg_catalog.pg_is_in_recovery();`).Scan(&recovery); err != nil { starting = isPostgresStartupError(err) return false, false, starting, err } return true, recovery, false, nil } func isPostgresStartupError(err error) bool { var pgErr *pq.Error return errors.As(err, &pgErr) && string(pgErr.Code) == pgSQLStateCannotConnectNow } // PgbouncerPrecheck checks pgbouncer connection before scrape func PgbouncerPrecheck(s *Server) (err error) { if s.DB == nil { // if db is not initialized, create a new DB with a NOTICE handler base, cerr := pq.NewConnector(s.dsn) if cerr != nil { s.UP = false return cerr } connector := pq.ConnectorWithNoticeHandler(base, func(notice *pq.Error) { s.noticeMu.Lock() s.lastNotice = notice.Message s.noticeMu.Unlock() }) s.DB = sql.OpenDB(connector) s.DB.SetMaxIdleConns(1) s.DB.SetMaxOpenConns(1) s.DB.SetConnMaxLifetime(connMaxLifeTime) } // Clear last notice before issuing SHOW VERSION so we can reliably parse it. s.noticeMu.Lock() s.lastNotice = "" s.noticeMu.Unlock() ctx, cancel := context.WithTimeout(context.Background(), s.GetConnectTimeout()) defer cancel() var versionStr string qerr := s.DB.QueryRowContext(ctx, `SHOW VERSION;`).Scan(&versionStr) if qerr != nil && !errors.Is(qerr, sql.ErrNoRows) { // Connection/auth errors should fail precheck (and thus scrape). s.UP = false return fmt.Errorf("fail fetching pgbouncer version: %w", qerr) } s.UP = true // Version may come from: // 1) a normal 1-row resultset (older versions / some builds), or // 2) a server NOTICE message (pgbouncer 1.12+). 
newVer := 0 if qerr == nil { newVer = ParseSemver(versionStr) } if newVer == 0 { s.noticeMu.Lock() notice := s.lastNotice s.noticeMu.Unlock() if notice != "" { newVer = ParseSemver(notice) } } if newVer != 0 { if s.Version != newVer { logInfof("server [%s] pgbouncer version changed: from [%d] to [%d]", s.Name(), s.Version, newVer) s.Planned = false } s.Version = newVer } else if qerr == nil || errors.Is(qerr, sql.ErrNoRows) { // Connected, but couldn't parse version string. logWarnf("server [%s] connected but failed to parse pgbouncer version (row=%q)", s.Name(), versionStr) } return nil } // ParseSemver will turn semantic version string into integer func ParseSemver(semverStr string) int { semver := semverRe.FindStringSubmatch(semverStr) logDebugf("parse pgbouncer semver string %s", semverStr) if len(semver) != 4 { return 0 } verNum := 0 if major, err := strconv.Atoi(semver[1]); err != nil { return 0 } else { verNum += major * 10000 } if minor, err := strconv.Atoi(semver[2]); err != nil { return 0 } else { verNum += minor * 100 } if release, err := strconv.Atoi(semver[3]); err != nil { return 0 } else { verNum += release } return verNum } // PostgresPrecheck checks postgres connection and gathering facts // if any important fact changed, it will trigger a plan before next scrape func PostgresPrecheck(s *Server) (err error) { if s.DB == nil { // if db is not initialized, create a new DB if s.DB, err = sql.Open("postgres", s.dsn); err != nil { s.UP = false return } if s.Forked { s.MaxConn = 1 s.DB.SetMaxIdleConns(1) s.DB.SetMaxOpenConns(1) s.DB.SetConnMaxLifetime(connMaxLifeTime) } else { s.MaxConn = 3 s.DB.SetMaxIdleConns(3) s.DB.SetMaxOpenConns(3) s.DB.SetConnMaxLifetime(1 * time.Minute) } } // retrieve version info var version int ctx, cancel := context.WithTimeout(context.Background(), s.GetConnectTimeout()) defer cancel() if err = s.DB.QueryRowContext(ctx, `SHOW server_version_num;`).Scan(&version); err != nil { s.UP = false return fmt.Errorf("fail fetching 
server version: %w", err) } s.UP = true // fact change triggers a new planning if s.Version != version { logInfof("server [%s] version changed: from [%d] to [%d]", s.Name(), s.Version, version) s.Planned = false } s.Version = version ctxSet, cancelSet := context.WithTimeout(context.Background(), s.GetConnectTimeout()) defer cancelSet() if _, err = s.DB.ExecContext(ctxSet, `SET application_name = pg_exporter;`); err != nil { s.UP = false return fmt.Errorf("fail setting application name: %w", err) } // get important metadata var recovery bool var datname, username string var databases, namespaces, extensions []string precheckSQL := `SELECT current_catalog, current_user, pg_catalog.pg_is_in_recovery(), (SELECT pg_catalog.array_agg(d.datname)::text[] AS databases FROM pg_catalog.pg_database d WHERE d.datallowconn AND NOT d.datistemplate), (SELECT pg_catalog.array_agg(n.nspname)::text[] AS namespaces FROM pg_catalog.pg_namespace n), (SELECT pg_catalog.array_agg(e.extname)::text[] AS extensions FROM pg_catalog.pg_extension e);` ctx2, cancel2 := context.WithTimeout(context.Background(), s.GetConnectTimeout()) defer cancel2() if err = s.DB.QueryRowContext(ctx2, precheckSQL).Scan(&datname, &username, &recovery, pq.Array(&databases), pq.Array(&namespaces), pq.Array(&extensions)); err != nil { s.UP = false return fmt.Errorf("fail fetching server version: %w", err) } if s.Recovery != recovery { logInfof("server [%s] recovery status changed: from [%v] to [%v]", s.Name(), s.Recovery, recovery) s.Planned = false } s.Recovery = recovery s.Username = username if s.Database != datname { logInfof("server [%s] datname changed: from [%s] to [%s]", s.Name(), s.Database, datname) s.Planned = false } s.Database = datname s.Databases[datname] = true // update schema & extension list s.Namespaces = make(map[string]bool, len(namespaces)) for _, nsname := range namespaces { s.Namespaces[nsname] = true } s.Extensions = make(map[string]bool, len(extensions)) for _, extname := range extensions { 
s.Extensions[extname] = true } // detect db change s.dblistLock.Lock() defer s.dblistLock.Unlock() newDBList := make(map[string]bool, len(databases)) changes := make(map[string]bool) // if new db is not found in old db list, add a change entry [NewDBName:true] for _, dbname := range databases { newDBList[dbname] = true if _, found := s.Databases[dbname]; !found { logDebugf("server [%s] found new database %s", s.Name(), dbname) changes[dbname] = true } } // if old db is not found in new db list, add a change entry [OldDBName:false] for dbname := range s.Databases { if _, found := newDBList[dbname]; !found { logDebugf("server [%s] found vanished database %s", s.Name(), dbname) changes[dbname] = false } } // invoke hook if there are changes on database list if len(changes) > 0 && s.onDatabaseChange != nil { logDebugf("server [%s] auto discovery database list change : %v", s.Name(), changes) s.onDatabaseChange(changes) // if doing something long, launch another goroutine } s.Databases = newDBList return nil } // Plan will install queries that compatible with server fact (version, level, recovery, plugin, tags,...) 
func (s *Server) Plan(queries ...*Query) { // if queries are explicitly given, use it instead of server.queries if len(queries) > 0 { newQueries := make(map[string]*Query) for _, q := range queries { newQueries[q.Name] = q } s.queries = newQueries } // check query compatibility instances := make([]*Collector, 0) var installedNames, discardedNames []string for name, query := range s.queries { if ok, reason := s.Compatible(query); ok { instances = append(instances, NewCollector(query, s)) installedNames = append(installedNames, query.Branch) } else { discardedNames = append(discardedNames, query.Branch) logDebugf("query [%s].%s discarded because of %s", query.Name, name, reason) } } // sort by priority sort.Slice(instances, func(i, j int) bool { return instances[i].Priority < instances[j].Priority }) s.Collectors = instances // reset statistics after planning s.ResetStats() s.Planned = true logInfof("server [%s] planned with %d queries, %d installed, %d discarded, installed: %s , discarded: %s", s.Name(), len(s.queries), len(installedNames), len(discardedNames), strings.Join(installedNames, ", "), strings.Join(discardedNames, ", ")) } // ResetStats will clear all statistic info func (s *Server) ResetStats() { n := len(s.Collectors) s.queryCacheTTL = make(map[string]float64, n) s.queryScrapeTotalCount = make(map[string]float64, n) s.queryScrapeHitCount = make(map[string]float64, n) s.queryScrapeErrorCount = make(map[string]float64, n) s.queryScrapePredicateSkipCount = make(map[string]float64, n) s.queryScrapeMetricCount = make(map[string]float64, n) s.queryScrapeDuration = make(map[string]float64, n) for _, query := range s.Collectors { s.queryCacheTTL[query.Name] = 0 s.queryScrapeTotalCount[query.Name] = 0 s.queryScrapeHitCount[query.Name] = 0 s.queryScrapeErrorCount[query.Name] = 0 if len(query.PredicateQueries) > 0 { s.queryScrapePredicateSkipCount[query.Name] = 0 } s.queryScrapeMetricCount[query.Name] = 0 s.queryScrapeDuration[query.Name] = 0 } } // Compatible 
tells whether a query is compatible with current server func (s *Server) Compatible(query *Query) (res bool, reason string) { // check skip flag if query.Skip { return false, fmt.Sprintf("query %s is marked skip", query.Name) } // check mode if pgbouncerQuery := query.HasTag("pgbouncer"); pgbouncerQuery != s.PgbouncerMode { if s.PgbouncerMode { return false, fmt.Sprintf("pgbouncer server doese not match with normal postgres query %s", query.Name) } return false, fmt.Sprintf("pgbouncer query %s does not match with normal postgres server", query.Name) } // check version if s.Version != 0 { // if version is not determined yet, just let it go if query.MinVersion != 0 && s.Version < query.MinVersion { return false, fmt.Sprintf("server version %v lower than query min version %v", s.Version, query.MinVersion) } if query.MaxVersion != 0 && s.Version >= query.MaxVersion { // exclude return false, fmt.Sprintf("server version %v higher than query max version %v", s.Version, query.MaxVersion) } } // check query side tags for _, tag := range query.Tags { // check extension is installed on target database if strings.HasPrefix(tag, "extension:") { if _, found := s.Extensions[strings.TrimPrefix(tag, "extension:")]; !found { return false, fmt.Sprintf("server [%s] does not have extension %s", s.Name(), tag) } continue } // check schema exist on target database if strings.HasPrefix(tag, "schema:") { if _, found := s.Namespaces[strings.TrimPrefix(tag, "schema:")]; !found { return false, fmt.Sprintf("server [%s] does not have schema %s", s.Name(), tag) } continue } // check if dbname prefix tag match server.Database if strings.HasPrefix(tag, "dbname:") { if s.Database != strings.TrimPrefix(tag, "dbname:") { return false, fmt.Sprintf("server [%s] dbname does %s not match with query tag %s", s.Name(), s.Database, tag) } continue } // check if username prefix tag match server.Username if strings.HasPrefix(tag, "username:") { if s.Username != strings.TrimPrefix(tag, "username:") { return 
false, fmt.Sprintf("server [%s] username [%s] does not match %s", s.Name(), s.Username, tag) } continue } // check server does not have given tag if strings.HasPrefix(tag, "not:") { if negTag := strings.TrimPrefix(tag, "not:"); s.HasTag(negTag) { return false, fmt.Sprintf("server [%s] has tag %s that query %s forbid", s.Name(), negTag, query.Name) } continue } // check 3 default tags: cluster, primary, standby|replica switch tag { case "cluster": if s.Forked { return false, fmt.Sprintf("cluster level query %s will not run on forked server %v", query.Name, s.Name()) } continue case "primary", "master", "leader": if s.Recovery { return false, fmt.Sprintf("primary-only query %s will not run on standby server %v", query.Name, s.Name()) } continue case "standby", "replica", "slave": if !s.Recovery { return false, fmt.Sprintf("standby-only query %s will not run on primary server %v", query.Name, s.Name()) } continue case "pgbouncer": continue default: // if this tag is nether a pre-defined tag nor a prefixed pattern tag, check whether server have that tag if !s.HasTag(tag) { return false, fmt.Sprintf("server [%s] does not have tag %s that query %s require", s.Name(), tag, query.Name) } } } return true, "" } // Explain will print all queries that registered to server func (s *Server) Explain() string { s.lock.RLock() defer s.lock.RUnlock() var res []string for _, i := range s.Collectors { res = append(res, i.Explain()) } return strings.Join(res, "\n") } // Stat will turn Server internal stats into HTML func (s *Server) Stat() string { s.lock.RLock() defer s.lock.RUnlock() buf := new(bytes.Buffer) //err := statsTemplate.Execute(buf, s) //if err != nil { // logErrorf("fail to generate server stats html") // return fmt.Sprintf("fail to generate server stat html, %s", err.Error()) //} buf.WriteString(fmt.Sprintf("%-24s %-10s %-10s %-10s %-10s %-10s %-6s %-10s\n", "name", "total", "hit", "error", "skip", "metric", "ttl/s", "duration/ms")) for _, query := range s.Collectors { 
buf.WriteString(fmt.Sprintf("%-24s %-10d %-10d %-10d %-10d %-10d %-6d %-10f\n", query.Name, int(s.queryScrapeTotalCount[query.Name]), int(s.queryScrapeHitCount[query.Name]), int(s.queryScrapeErrorCount[query.Name]), int(s.queryScrapePredicateSkipCount[query.Name]), int(s.queryScrapeMetricCount[query.Name]), int(s.queryCacheTTL[query.Name]), s.queryScrapeDuration[query.Name]*1000, )) } return buf.String() } // ExplainHTML will print server stats in HTML format func (s *Server) ExplainHTML() string { s.lock.RLock() defer s.lock.RUnlock() var res []string for _, i := range s.Collectors { res = append(res, i.HTML()) } return strings.Join(res, "

") } // Describe implement prometheus.Collector func (s *Server) Describe(ch chan<- *prometheus.Desc) { s.lock.RLock() defer s.lock.RUnlock() for _, instance := range s.Collectors { instance.Describe(ch) } } // Collect implement prometheus.Collector interface func (s *Server) Collect(ch chan<- prometheus.Metric) { s.lock.Lock() defer s.lock.Unlock() s.scrapeBegin = time.Now() // This ts is used for cache expiration check // check server conn, gathering fact if s.err = s.beforeScrape(s); s.err != nil { logDebugf("fail establishing connection to %s: %s", s.Name(), s.err.Error()) goto final } // fact change (including first time) will incur a plan procedure if !s.Planned { s.Plan() } // First pass: execute all queries with Fatal flag if err := s.collectFatalQueries(ch); err != nil { s.err = err goto final } // Second pass: execute remaining non-Fatal queries s.collectNonFatalQueries(ch) final: s.scrapeDone = time.Now() // This ts is used for cache expiration check s.totalTime += s.scrapeDone.Sub(s.scrapeBegin).Seconds() s.totalCount++ if s.err != nil { s.UP = false s.errorCount++ logErrorf("fail scraping server [%s]: %s", s.Name(), s.err.Error()) } else { s.UP = true logDebugf("server [%s] scraped in %v", s.Name(), s.scrapeDone.Sub(s.scrapeBegin).Seconds()) } } // collectFatalQueries executes all queries with Fatal flag and returns on first error func (s *Server) collectFatalQueries(ch chan<- prometheus.Metric) error { for _, query := range s.Collectors { if !query.Fatal { continue } if err := s.executeQuery(query, ch); err != nil { logErrorf("query [%s] error: %s", query.Name, err) return err } } return nil } // collectNonFatalQueries executes all non-Fatal queries and logs errors without stopping func (s *Server) collectNonFatalQueries(ch chan<- prometheus.Metric) { for _, query := range s.Collectors { if query.Fatal { continue } if err := s.executeQuery(query, ch); err != nil { logWarnf("query [%s] error skipped: %s", query.Name, err) } } } // executeQuery runs a 
single query and updates its metrics func (s *Server) executeQuery(query *Collector, ch chan<- prometheus.Metric) error { query.Collect(ch) s.queryCacheTTL[query.Name] = query.cacheTTL() s.queryScrapeTotalCount[query.Name]++ s.queryScrapeMetricCount[query.Name] = float64(query.ResultSize()) s.queryScrapeDuration[query.Name] = query.scrapeDuration.Seconds() if query.Error() != nil { s.queryScrapeErrorCount[query.Name]++ return query.Error() } if query.CacheHit() { s.queryScrapeHitCount[query.Name]++ } // Update predicate skip count if applicable if len(query.PredicateQueries) > 0 { skipped, _ := query.PredicateSkip() if skipped { s.queryScrapePredicateSkipCount[query.Name]++ } } return nil } // HasTag tells whether this server have specific tag func (s *Server) HasTag(tag string) bool { for _, t := range s.Tags { if t == tag { return true } } return false } // Duration returns last scrape duration in float64 seconds func (s *Server) Duration() float64 { s.lock.RLock() defer s.lock.RUnlock() sec := s.scrapeDone.Sub(s.scrapeBegin).Seconds() return sec } // Uptime returns servers's uptime func (s *Server) Uptime() float64 { return time.Since(s.serverInit).Seconds() } /* ================ Server Creation ================ */ // NewServer will check dsn, but not trying to connect func NewServer(dsn string, opts ...ServerOpt) *Server { s := &Server{dsn: dsn} for _, opt := range opts { opt(s) } s.Database = ParseDatname(dsn) if s.Database != "pgbouncer" { s.PgbouncerMode = false s.beforeScrape = PostgresPrecheck } else { logInfof("datname pgbouncer detected, enabling pgbouncer mode") s.PgbouncerMode = true s.beforeScrape = PgbouncerPrecheck } s.MaxConn = 1 s.Databases = make(map[string]bool, 1) s.serverInit = time.Now() return s } // ServerOpt configures Server type ServerOpt func(*Server) // WithConstLabel copy constant label map to server func WithConstLabel(labels prometheus.Labels) ServerOpt { return func(s *Server) { if labels == nil { s.labels = nil } else { s.labels = 
make(prometheus.Labels, len(labels)) for k, v := range labels { s.labels[k] = v } } } } // WithCachePolicy will pass cache option to server func WithCachePolicy(disableCache bool) ServerOpt { return func(s *Server) { s.DisableCache = disableCache } } // WithQueries set server's default query set func WithQueries(queries map[string]*Query) ServerOpt { return func(s *Server) { s.queries = queries } } // WithServerTags will mark server only execute query without cluster tag func WithServerTags(tags []string) ServerOpt { return func(s *Server) { s.Tags = tags } } // WithServerConnectTimeout will set a connect timeout for server precheck queries // otherwise, a default value 100ms will be used. // Increase this value if you are monitoring a remote (cross-DC, cross-AZ) instance func WithServerConnectTimeout(timeout int) ServerOpt { return func(s *Server) { s.ConnectTimeout = timeout } } ================================================ FILE: exporter/server_exporter_test.go ================================================ package exporter import ( "strings" "testing" "time" "github.com/prometheus/client_golang/prometheus" ) func makeGaugeQuery(name string, priority int, tags ...string) *Query { return &Query{ Name: name, Branch: name, SQL: "SELECT 'db' AS datname, 1 AS value", Tags: tags, Priority: priority, Columns: map[string]*Column{ "datname": {Name: "datname", Usage: LABEL, Desc: "database"}, "value": {Name: "value", Usage: GAUGE, Desc: "value"}, }, ColumnNames: []string{"datname", "value"}, LabelNames: []string{"datname"}, MetricNames: []string{"value"}, } } func TestParseSemver(t *testing.T) { if got := ParseSemver("1.2.3"); got != 10203 { t.Fatalf("ParseSemver 1.2.3 = %d, want 10203", got) } if got := ParseSemver("PgBouncer 1.22.1"); got != 12201 { t.Fatalf("ParseSemver pgbouncer string = %d, want 12201", got) } if got := ParseSemver("invalid"); got != 0 { t.Fatalf("ParseSemver invalid = %d, want 0", got) } } func TestNewServerAndBasics(t *testing.T) { labels := 
prometheus.Labels{"k": "v"} q := map[string]*Query{"q": makeGaugeQuery("q", 1)} s := NewServer( "postgresql://user:pass@localhost:5432/postgres?sslmode=disable", WithConstLabel(labels), WithCachePolicy(true), WithQueries(q), WithServerTags([]string{"tag1"}), WithServerConnectTimeout(250), ) if s.PgbouncerMode { t.Fatal("postgres database should not trigger pgbouncer mode") } if !s.DisableCache { t.Fatal("WithCachePolicy(true) not applied") } if s.ConnectTimeout != 250 { t.Fatalf("ConnectTimeout = %d, want 250", s.ConnectTimeout) } if !s.HasTag("tag1") { t.Fatal("WithServerTags not applied") } if got := s.GetConnectTimeout(); got != 250*time.Millisecond { t.Fatalf("GetConnectTimeout = %v, want 250ms", got) } if got := s.Name(); got != "postgres" { t.Fatalf("Name = %s, want postgres", got) } s.Database = "" if got := s.Name(); !strings.Contains(got, "postgresql://") { t.Fatalf("Name with empty database should return redacted dsn, got: %s", got) } s2 := NewServer("postgresql://user:pass@localhost:5432/pgbouncer") if !s2.PgbouncerMode { t.Fatal("pgbouncer database should trigger pgbouncer mode") } } func TestServerCompatible(t *testing.T) { s := NewServer("postgresql://user:pass@localhost:5432/postgres") s.Version = 160000 s.Recovery = false s.Database = "postgres" s.Username = "monitor" s.Extensions = map[string]bool{"pg_stat_statements": true} s.Namespaces = map[string]bool{"public": true} s.Tags = []string{"foo"} tests := []struct { name string q *Query ok bool }{ {name: "skip", q: &Query{Name: "q", Skip: true}, ok: false}, {name: "pgbouncer tag mismatch", q: &Query{Name: "q", Tags: []string{"pgbouncer"}}, ok: false}, {name: "min version", q: &Query{Name: "q", MinVersion: 170000}, ok: false}, {name: "max version", q: &Query{Name: "q", MaxVersion: 160000}, ok: false}, {name: "extension exists", q: &Query{Name: "q", Tags: []string{"extension:pg_stat_statements"}}, ok: true}, {name: "extension missing", q: &Query{Name: "q", Tags: []string{"extension:missing"}}, ok: 
false}, {name: "schema exists", q: &Query{Name: "q", Tags: []string{"schema:public"}}, ok: true}, {name: "schema missing", q: &Query{Name: "q", Tags: []string{"schema:private"}}, ok: false}, {name: "dbname mismatch", q: &Query{Name: "q", Tags: []string{"dbname:other"}}, ok: false}, {name: "username match", q: &Query{Name: "q", Tags: []string{"username:monitor"}}, ok: true}, {name: "username mismatch", q: &Query{Name: "q", Tags: []string{"username:other"}}, ok: false}, {name: "forbidden not tag", q: &Query{Name: "q", Tags: []string{"not:foo"}}, ok: false}, {name: "server tag match", q: &Query{Name: "q", Tags: []string{"foo"}}, ok: true}, } for _, tt := range tests { got, _ := s.Compatible(tt.q) if got != tt.ok { t.Fatalf("%s compatible = %v, want %v", tt.name, got, tt.ok) } } s.Forked = true if ok, _ := s.Compatible(&Query{Name: "q", Tags: []string{"cluster"}}); ok { t.Fatal("cluster query should not run on forked server") } s.Forked = false s.Recovery = true if ok, _ := s.Compatible(&Query{Name: "q", Tags: []string{"primary"}}); ok { t.Fatal("primary query should not run on recovery server") } if ok, _ := s.Compatible(&Query{Name: "q", Tags: []string{"replica"}}); !ok { t.Fatal("replica query should run on recovery server") } } func TestPlanResetAndCollectCached(t *testing.T) { s := NewServer("postgresql://user:pass@localhost:5432/postgres") s.Version = 160000 s.Database = "postgres" s.Username = "monitor" s.Namespaces = map[string]bool{"public": true} s.Extensions = map[string]bool{} q1 := makeGaugeQuery("q1", 20) q2 := makeGaugeQuery("q2", 10) s.queries = map[string]*Query{"q1": q1, "q2": q2} s.Plan() if !s.Planned { t.Fatal("Plan should mark server planned") } if len(s.Collectors) != 2 { t.Fatalf("collector count = %d, want 2", len(s.Collectors)) } if s.Collectors[0].Name != "q2" || s.Collectors[1].Name != "q1" { t.Fatalf("collectors should be sorted by priority, got %s then %s", s.Collectors[0].Name, s.Collectors[1].Name) } // Build one cached metric for q2, so 
Collect path does not need DB. c := s.Collectors[0] metric := prometheus.MustNewConstMetric(c.descriptors["value"], prometheus.GaugeValue, 1, "db") c.result = []prometheus.Metric{metric} c.TTL = 3600 c.lastScrape = time.Now() c.err = nil s.beforeScrape = func(s *Server) error { s.UP = true return nil } s.Collectors = []*Collector{c} s.ResetStats() s.Planned = true ch := make(chan prometheus.Metric, 10) s.Collect(ch) if !s.UP { t.Fatal("Collect should keep server UP for successful cached query") } if s.totalCount != 1 { t.Fatalf("totalCount = %v, want 1", s.totalCount) } if s.queryScrapeTotalCount[c.Name] != 1 { t.Fatalf("queryScrapeTotalCount = %v, want 1", s.queryScrapeTotalCount[c.Name]) } if s.queryScrapeMetricCount[c.Name] != 1 { t.Fatalf("queryScrapeMetricCount = %v, want 1", s.queryScrapeMetricCount[c.Name]) } if s.queryScrapeHitCount[c.Name] != 1 { t.Fatalf("queryScrapeHitCount = %v, want 1", s.queryScrapeHitCount[c.Name]) } } func TestExporterServerLifecycleHelpers(t *testing.T) { e := &Exporter{ dsn: "postgresql://user:pass@localhost:5432/postgres?sslmode=disable", servers: map[string]*Server{}, queries: map[string]*Query{"q": makeGaugeQuery("q", 1)}, } e.CreateServer("db1") if len(e.servers) != 1 { t.Fatalf("CreateServer count = %d, want 1", len(e.servers)) } snapshot := e.IterateServer() if len(snapshot) != 1 || snapshot[0] == nil { t.Fatalf("IterateServer snapshot invalid: %#v", snapshot) } if !snapshot[0].Forked { t.Fatal("CreateServer should mark new server as Forked") } e.RemoveServer("db1") if len(e.servers) != 0 { t.Fatalf("RemoveServer count = %d, want 0", len(e.servers)) } } ================================================ FILE: exporter/testmain_test.go ================================================ package exporter import ( "os" "testing" ) func TestMain(m *testing.M) { Logger = configureLogger("error", "logfmt") os.Exit(m.Run()) } ================================================ FILE: exporter/utils.go 
================================================ package exporter import ( "fmt" "log/slog" "math" "os" "strconv" "strings" "time" "github.com/prometheus/client_golang/prometheus" ) /* ================ Logger ================ */ func configureLogger(levelStr, formatStr string) *slog.Logger { var level slog.Level switch strings.ToLower(levelStr) { case "debug": level = slog.LevelDebug case "info": level = slog.LevelInfo case "warn": level = slog.LevelWarn case "error": level = slog.LevelError default: level = slog.LevelInfo // fallback to default info level } opts := &slog.HandlerOptions{ Level: level, } var handler slog.Handler switch strings.ToLower(formatStr) { case "json": handler = slog.NewJSONHandler(os.Stderr, opts) case "logfmt", "": handler = slog.NewTextHandler(os.Stderr, opts) default: // Be resilient to misconfiguration: fall back to logfmt. handler = slog.NewTextHandler(os.Stderr, opts) } return slog.New(handler) } func loggerOrDefault() *slog.Logger { if Logger != nil { return Logger } return slog.Default() } // logDebugf will log debug message func logDebugf(format string, v ...interface{}) { loggerOrDefault().Debug(fmt.Sprintf(format, v...)) } // logInfof will log info message func logInfof(format string, v ...interface{}) { loggerOrDefault().Info(fmt.Sprintf(format, v...)) } // logWarnf will log warning message func logWarnf(format string, v ...interface{}) { loggerOrDefault().Warn(fmt.Sprintf(format, v...)) } // logErrorf will log error message func logErrorf(format string, v ...interface{}) { loggerOrDefault().Error(fmt.Sprintf(format, v...)) } // logError will print error message directly func logError(msg string) { loggerOrDefault().Error(msg) } // logFatalf will log error message func logFatalf(format string, v ...interface{}) { loggerOrDefault().Error(fmt.Sprintf(format, v...)) os.Exit(1) } /* ================ Auxiliaries ================ */ // castFloat64 will cast datum into float64 with Column scale & default value. 
// Column.Scale/Column.Default are parsed when loading config, so this is hot-path safe. func castFloat64(t interface{}, c *Column) float64 { scale := 1.0 if c != nil && c.hasScale { scale = c.scaleFactor } switch v := t.(type) { case int64: return float64(v) * scale case float64: return v * scale case time.Time: return float64(v.Unix()) case []byte: strV := string(v) result, err := strconv.ParseFloat(strV, 64) if err != nil { logWarnf("fail casting []byte to float64: %v", t) return math.NaN() } return result * scale case string: result, err := strconv.ParseFloat(v, 64) if err != nil { logWarnf("fail casting string to float64: %v", t) return math.NaN() } return result * scale case bool: if v { return 1.0 } return 0.0 case nil: if c != nil && c.hasDefault { return c.defaultValue * scale } return math.NaN() default: logWarnf("fail casting unknown to float64: %v", t) return math.NaN() } } // castString will force interface{} into string func castString(t interface{}) string { switch v := t.(type) { case int64: return fmt.Sprintf("%v", v) case float64: return fmt.Sprintf("%v", v) case time.Time: return fmt.Sprintf("%v", v.Unix()) case nil: return "" case []byte: // Try and convert to string return string(v) case string: return v case bool: if v { return "true" } return "false" default: logWarnf("fail casting unknown to string: %v", t) return "" } } // parseConstLabels turn param string into prometheus.Labels func parseConstLabels(s string) prometheus.Labels { labels := make(prometheus.Labels) s = strings.TrimSpace(s) if len(s) == 0 { return nil } parts := strings.Split(s, ",") for _, p := range parts { keyValue := strings.SplitN(strings.TrimSpace(p), "=", 2) if len(keyValue) != 2 { logErrorf(`malformed labels format %q, should be "key=value"`, p) continue } key := strings.TrimSpace(keyValue[0]) value := strings.TrimSpace(keyValue[1]) if key == "" || value == "" { continue } if err := validatePromLabelName(key); err != nil { logWarnf("skip invalid const label name %q: 
%v", key, err) continue } labels[key] = value } if len(labels) == 0 { return nil } return labels } // parseCSV will turn a comma separated string into a []string func parseCSV(s string) (tags []string) { s = strings.TrimSpace(s) if len(s) == 0 { return nil } parts := strings.Split(s, ",") for _, p := range parts { if tag := strings.TrimSpace(p); len(tag) > 0 { tags = append(tags, tag) } } if len(tags) == 0 { return nil } return } ================================================ FILE: exporter/utils_test.go ================================================ package exporter import ( "math" "reflect" "testing" "time" ) func TestParseCSV(t *testing.T) { if got := parseCSV(""); got != nil { t.Fatalf("parseCSV empty = %v, want nil", got) } got := parseCSV(" a, b,, c , ") want := []string{"a", "b", "c"} if !reflect.DeepEqual(got, want) { t.Fatalf("parseCSV result = %#v, want %#v", got, want) } } func TestParseConstLabels(t *testing.T) { if got := parseConstLabels(""); got != nil { t.Fatalf("parseConstLabels empty = %v, want nil", got) } labels := parseConstLabels("env=prod,region=us-east-1") if labels["env"] != "prod" || labels["region"] != "us-east-1" { t.Fatalf("parseConstLabels valid result = %v", labels) } labels = parseConstLabels("token=a=b=c") if labels["token"] != "a=b=c" { t.Fatalf("parseConstLabels should preserve '=' in value, got %q", labels["token"]) } labels = parseConstLabels("bad,noeq=,=noval,ok=1") if len(labels) != 1 || labels["ok"] != "1" { t.Fatalf("parseConstLabels malformed handling = %v", labels) } } func TestCastFloat64(t *testing.T) { now := time.Unix(1700000000, 0) withScale := func(scale, def string) *Column { c := &Column{Scale: scale, Default: def} if err := c.parseNumbers(); err != nil { t.Fatalf("parseNumbers failed for scale=%q default=%q: %v", scale, def, err) } return c } if got := castFloat64(int64(3), withScale("2", "")); got != 6 { t.Fatalf("int64 scale cast = %v, want 6", got) } if got := castFloat64(float64(1.5), withScale("2", "")); 
got != 3 { t.Fatalf("float64 scale cast = %v, want 3", got) } if got := castFloat64(now, nil); got != float64(now.Unix()) { t.Fatalf("time cast = %v, want %v", got, now.Unix()) } if got := castFloat64([]byte("3.25"), withScale("10", "")); got != 32.5 { t.Fatalf("[]byte cast = %v, want 32.5", got) } if got := castFloat64("2.5", withScale("4", "")); got != 10 { t.Fatalf("string cast = %v, want 10", got) } if got := castFloat64(true, nil); got != 1 { t.Fatalf("bool true cast = %v, want 1", got) } if got := castFloat64(false, nil); got != 0 { t.Fatalf("bool false cast = %v, want 0", got) } if got := castFloat64(nil, withScale("", "2.5")); got != 2.5 { t.Fatalf("nil default cast = %v, want 2.5", got) } if got := castFloat64(nil, withScale("10", "2.5")); got != 25 { t.Fatalf("nil default cast should apply scale: %v, want 25", got) } if got := castFloat64("abc", nil); !math.IsNaN(got) { t.Fatalf("invalid string cast = %v, want NaN", got) } if got := castFloat64(struct{}{}, nil); !math.IsNaN(got) { t.Fatalf("unknown type cast = %v, want NaN", got) } // Invalid numeric options should be rejected at parse time. 
if err := (&Column{Scale: "bad"}).parseNumbers(); err == nil { t.Fatal("parseNumbers should fail for invalid scale") } if err := (&Column{Default: "bad"}).parseNumbers(); err == nil { t.Fatal("parseNumbers should fail for invalid default") } } func TestCastString(t *testing.T) { now := time.Unix(1700000000, 0) if got := castString(int64(3)); got != "3" { t.Fatalf("int64 cast = %q, want 3", got) } if got := castString(float64(1.5)); got != "1.5" { t.Fatalf("float64 cast = %q, want 1.5", got) } if got := castString(now); got != "1700000000" { t.Fatalf("time cast = %q, want 1700000000", got) } if got := castString([]byte("abc")); got != "abc" { t.Fatalf("[]byte cast = %q, want abc", got) } if got := castString(true); got != "true" { t.Fatalf("bool true cast = %q, want true", got) } if got := castString(nil); got != "" { t.Fatalf("nil cast = %q, want empty", got) } } func TestConfigureLogger(t *testing.T) { if l := configureLogger("debug", "json"); l == nil { t.Fatal("configureLogger returned nil for valid json format") } if l := configureLogger("bad-level", "logfmt"); l == nil { t.Fatal("configureLogger returned nil for fallback level") } if l := configureLogger("info", "unknown-format"); l == nil { t.Fatal("configureLogger returned nil for unknown format fallback") } } func TestLogHelpersWithNilLogger(t *testing.T) { origin := Logger Logger = nil t.Cleanup(func() { Logger = origin }) logDebugf("debug %d", 1) logInfof("info %d", 1) logWarnf("warn %d", 1) logErrorf("error %d", 1) logError("plain error") } ================================================ FILE: exporter/validate_labels.go ================================================ package exporter import ( "fmt" "sort" "github.com/prometheus/client_golang/prometheus" ) // validateConstLabelConflicts rejects constant label keys that would cause a // Prometheus panic due to duplicate label names between const and variable labels. 
// // This can happen when a user passes `--label key=value` (or PG_EXPORTER_LABEL) // where `key` equals one of a query's metric label names (after rename). // // When intro metrics are enabled, it also rejects keys that collide with the // exporter internal dynamic metric labels (currently: datname, query). func validateConstLabelConflicts(constLabels prometheus.Labels, queries map[string]*Query, disableIntro bool) error { if len(constLabels) == 0 { return nil } // Exporter internal dynamic metrics use these variable labels. if !disableIntro { for _, reserved := range []string{"datname", "query"} { if _, exists := constLabels[reserved]; exists { return fmt.Errorf("const label %q conflicts with built-in exporter metric label %q", reserved, reserved) } } } if len(queries) == 0 { return nil } // Stable iteration order for deterministic error messages. branches := make([]string, 0, len(queries)) for b := range queries { branches = append(branches, b) } sort.Strings(branches) for _, branch := range branches { q := queries[branch] if q == nil { continue } for _, lbl := range q.LabelList() { if _, exists := constLabels[lbl]; exists { return fmt.Errorf("const label %q conflicts with query %q (name=%q) label %q", lbl, branch, q.Name, lbl) } } } return nil } ================================================ FILE: exporter/validate_labels_test.go ================================================ package exporter import ( "strings" "testing" ) func TestValidateConstLabelConflicts_QueryLabelOverlap(t *testing.T) { cfg := ` q1: query: SELECT 1 AS value, 'x' AS datname metrics: - datname: usage: LABEL rename: db description: database - value: usage: GAUGE description: value ` queries, err := ParseConfig([]byte(cfg)) if err != nil { t.Fatalf("ParseConfig failed: %v", err) } labels := parseConstLabels("db=foo") if err := validateConstLabelConflicts(labels, queries, false); err == nil { t.Fatal("expected const label conflict error, got nil") } } func 
TestValidateConstLabelConflicts_InternalReservedLabels(t *testing.T) { labels := parseConstLabels("datname=foo") if err := validateConstLabelConflicts(labels, nil, false); err == nil { t.Fatal("expected reserved label conflict with intro metrics enabled, got nil") } // When intro metrics are disabled, internal dynamic series are not emitted. if err := validateConstLabelConflicts(labels, nil, true); err != nil { t.Fatalf("expected no error with intro metrics disabled, got %v", err) } } func TestNewExporterRejectsConstLabelConflict(t *testing.T) { cfg := ` q1: query: SELECT 1 AS value, 'x' AS datname metrics: - datname: usage: LABEL rename: db - value: usage: GAUGE ` _, err := NewExporter( "postgresql://u:p@localhost:5432/postgres?sslmode=disable", WithConfigReader(strings.NewReader(cfg)), WithConstLabels("db=foo"), ) if err == nil { t.Fatal("expected NewExporter to fail on const label conflict, got nil") } } ================================================ FILE: go.mod ================================================ module pg_exporter go 1.26.2 require ( github.com/alecthomas/kingpin/v2 v2.4.0 github.com/lib/pq v1.12.3 github.com/prometheus/client_golang v1.23.2 github.com/prometheus/common v0.67.5 github.com/prometheus/exporter-toolkit v0.16.0 gopkg.in/yaml.v3 v3.0.1 ) require ( github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect github.com/coreos/go-systemd/v22 v22.7.0 // indirect github.com/golang-jwt/jwt/v5 v5.3.1 // indirect github.com/google/uuid v1.6.0 // indirect github.com/jpillora/backoff v1.0.0 // indirect github.com/klauspost/compress v1.18.5 // indirect github.com/mdlayher/socket v0.6.0 // indirect github.com/mdlayher/vsock v1.2.1 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f // indirect github.com/prometheus/client_model v0.6.2 // 
indirect github.com/prometheus/procfs v0.20.1 // indirect github.com/rogpeppe/go-internal v1.14.1 // indirect github.com/xhit/go-str2duration/v2 v2.1.0 // indirect go.yaml.in/yaml/v2 v2.4.4 // indirect golang.org/x/crypto v0.50.0 // indirect golang.org/x/net v0.53.0 // indirect golang.org/x/oauth2 v0.36.0 // indirect golang.org/x/sync v0.20.0 // indirect golang.org/x/sys v0.43.0 // indirect golang.org/x/text v0.36.0 // indirect golang.org/x/time v0.15.0 // indirect google.golang.org/protobuf v1.36.11 // indirect ) ================================================ FILE: go.sum ================================================ github.com/alecthomas/kingpin/v2 v2.4.0 h1:f48lwail6p8zpO1bC4TxtqACaGqHYA22qkHjHpqDjYY= github.com/alecthomas/kingpin/v2 v2.4.0/go.mod h1:0gyi0zQnjuFk8xrkNKamJoyUo382HRL7ATRpFZCw6tE= github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b h1:mimo19zliBX/vSQ6PWWSL9lK8qwHozUj03+zLoEB8O0= github.com/alecthomas/units v0.0.0-20240927000941-0f3dac36c52b/go.mod h1:fvzegU4vN3H1qMT+8wDmzjAcDONcgo2/SZ/TyfdUOFs= github.com/beorn7/perks v1.0.1 h1:VlbKKnNfV8bJzeqoa4cOKqO6bYr3WgKZxO8Z16+hsOM= github.com/beorn7/perks v1.0.1/go.mod h1:G2ZrVWU2WbWT9wwq4/hrbKbnv/1ERSJQ0ibhJ6rlkpw= github.com/cespare/xxhash/v2 v2.3.0 h1:UL815xU9SqsFlibzuggzjXhog7bL6oX9BbNZnL2UFvs= github.com/cespare/xxhash/v2 v2.3.0/go.mod h1:VGX0DQ3Q6kWi7AoAeZDth3/j3BFtOZR5XLFGgcrjCOs= github.com/coreos/go-systemd/v22 v22.7.0 h1:LAEzFkke61DFROc7zNLX/WA2i5J8gYqe0rSj9KI28KA= github.com/coreos/go-systemd/v22 v22.7.0/go.mod h1:xNUYtjHu2EDXbsxz1i41wouACIwT7Ybq9o0BQhMwD0w= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/golang-jwt/jwt/v5 v5.3.1 h1:kYf81DTWFe7t+1VvL7eS+jKFVWaUnK9cB1qbwn63YCY= github.com/golang-jwt/jwt/v5 v5.3.1/go.mod 
h1:fxCRLWMO43lRc8nhHWY6LGqRcf+1gQWArsqaEUEa5bE= github.com/google/go-cmp v0.7.0 h1:wk8382ETsv4JYUZwIsn6YpYiWiBsYLSJiTsyBybVuN8= github.com/google/go-cmp v0.7.0/go.mod h1:pXiqmnSA92OHEEa9HXL2W4E7lf9JzCmGVUdgjX3N/iU= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= github.com/google/uuid v1.6.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/jpillora/backoff v1.0.0 h1:uvFg412JmmHBHw7iwprIxkPMI+sGQ4kzOWsMeHnm2EA= github.com/jpillora/backoff v1.0.0/go.mod h1:J/6gKK9jxlEcS3zixgDgUAsiuZ7yrSoa/FX5e0EB2j4= github.com/klauspost/compress v1.18.5 h1:/h1gH5Ce+VWNLSWqPzOVn6XBO+vJbCNGvjoaGBFW2IE= github.com/klauspost/compress v1.18.5/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= github.com/kr/pretty v0.3.1/go.mod h1:hoEshYVHaxMs3cyo3Yncou5ZscifuDolrwPKZanG3xk= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kylelemons/godebug v1.1.0 h1:RPNrshWIDI6G2gRW9EHilWtl7Z6Sb1BR0xunSBf0SNc= github.com/kylelemons/godebug v1.1.0/go.mod h1:9/0rRGxNHcop5bhtWyNeEfOS8JIWk580+fNqagV/RAw= github.com/lib/pq v1.12.0 h1:mC1zeiNamwKBecjHarAr26c/+d8V5w/u4J0I/yASbJo= github.com/lib/pq v1.12.0/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/lib/pq v1.12.3 h1:tTWxr2YLKwIvK90ZXEw8GP7UFHtcbTtty8zsI+YjrfQ= github.com/lib/pq v1.12.3/go.mod h1:/p+8NSbOcwzAEI7wiMXFlgydTwcgTr3OSKMsD2BitpA= github.com/mdlayher/socket v0.5.1 h1:VZaqt6RkGkt2OE9l3GcC6nZkqD3xKeQLyfleW/uBcos= github.com/mdlayher/socket v0.5.1/go.mod h1:TjPLHI1UgwEv5J1B5q0zTZq12A/6H7nKmtTanQE37IQ= github.com/mdlayher/socket v0.6.0 h1:ScZPaAGyO1icQnbFrhPM8mnXyMu9qukC1K4ZoM2IQKU= github.com/mdlayher/socket v0.6.0/go.mod h1:q7vozUAnxSqnjHc12Fik5yUKIzfZ8ITCfMkhOtE9z18= github.com/mdlayher/vsock v1.2.1 h1:pC1mTJTvjo1r9n9fbm7S1j04rCgCzhCOS5DY0zqHlnQ= github.com/mdlayher/vsock v1.2.1/go.mod 
h1:NRfCibel++DgeMD8z/hP+PPTjlNJsdPOmxcnENvE+SE= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f h1:KUppIJq7/+SVif2QVs3tOP0zanoHgBEVAwHxUSIzRqU= github.com/mwitkow/go-conntrack v0.0.0-20190716064945-2f068394615f/go.mod h1:qRWi+5nqEBWmkhHvq77mSJWrCKwh8bxhgT7d/eI7P4U= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= github.com/prometheus/exporter-toolkit v0.15.1 h1:XrGGr/qWl8Gd+pqJqTkNLww9eG8vR/CoRk0FubOKfLE= github.com/prometheus/exporter-toolkit v0.15.1/go.mod h1:P/NR9qFRGbCFgpklyhix9F6v6fFr/VQB/CVsrMDGKo4= github.com/prometheus/exporter-toolkit v0.16.0 h1:xT/j7L2XKF+VJd6B4fpUw6xWabHrSmsUf6mYmFqyu0s= github.com/prometheus/exporter-toolkit v0.16.0/go.mod h1:d1EL8Z9674xQe/iWhwP2wDyCEoBPbXVeqDbqAUsgJWY= github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc= github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0tI/otEQ= github.com/rogpeppe/go-internal v1.14.1/go.mod 
h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/xhit/go-str2duration/v2 v2.1.0 h1:lxklc02Drh6ynqX+DdPyp5pCKLUQpRT8bp8Ydu2Bstc= github.com/xhit/go-str2duration/v2 v2.1.0/go.mod h1:ohY8p+0f07DiV6Em5LKB0s2YpLtXVyJfNt1+BlmyAsU= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= golang.org/x/crypto v0.49.0 h1:+Ng2ULVvLHnJ/ZFEq4KdcDd/cfjrrjjNSXNzxg0Y4U4= golang.org/x/crypto v0.49.0/go.mod h1:ErX4dUh2UM+CFYiXZRTcMpEcN8b/1gxEuv3nODoYtCA= golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= golang.org/x/net 
v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/oauth2 v0.36.0 h1:peZ/1z27fi9hUOFCAZaHyrpWG5lwe0RJEEEeH0ThlIs= golang.org/x/oauth2 v0.36.0/go.mod h1:YDBUJMTkDnJS+A4BP4eZBjCqtokkg1hODuPjwiGPO7Q= golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI= golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= google.golang.org/protobuf v1.36.11 h1:fV6ZwhNocDyBLK0dj+fg8ektcVegBBuEolpbTQyBNVE= google.golang.org/protobuf v1.36.11/go.mod h1:HTf+CrKn2C3g5S8VImy6tdcUvCska2kB7j23XfzDpco= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gopkg.in/yaml.v3 v3.0.1 h1:fxVm/GzAzEWqLHuvctI91KS9hhNmmWOoWu0XTYJS7CA= gopkg.in/yaml.v3 v3.0.1/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= ================================================ FILE: hugo.yaml 
================================================ baseURL: 'https://exp.pgsty.com/' languageCode: 'en' title: 'PG Exporter - Advanced PostgreSQL & pgBouncer Metrics Exporter' enableRobotsTXT: true # Parse Git commit enableGitInfo: true # enableEmoji: false hasCJKLanguage: true services: googleAnalytics: ID: G-Y7HMTRQ6P7 contentDir: docs outputs: home: [HTML] page: [HTML] section: [HTML, RSS] module: imports: - path: github.com/imfing/hextra defaultContentLanguage: en languages: en: languageName: English languageCode: en weight: 1 title: PG Exporter menu: main: - name: Pigsty weight: 1 url: "https://pigsty.io" - name: GitHub weight: 2 url: "https://github.com/pgsty/pg_exporter" params: icon: github - name: Search weight: 3 params: type: search sidebar: - identifier: more name: Links params: type: separator weight: 6 - identifier: pigsty name: "Pigsty ↗" url: "https://pigsty.io" weight: 7 - identifier: pig-cli name: "PIG CLI ↗" url: "https://pigsty.io/docs/pig" weight: 8 - identifier: pgext name: "PG Extensions ↗" url: "https://pgext.cloud" weight: 10 - identifier: github-repo name: "GitHub repo ↗" url: "https://github.com/pgsty/pg_exporter" weight: 11 - identifier: demo name: "Public Demo ↗" url: "https://demo.pigsty.io" weight: 12 - identifier: vonng name: "Author: Vonng ↗" url: "https://vonng.com/en/" weight: 13 markup: highlight: noClasses: false goldmark: renderer: unsafe: true extensions: passthrough: delimiters: block: [['\[', '\]'], ['$$', '$$']] inline: [['\(', '\)']] enable: true params: description: PG Exporter is an advanced PostgreSQL and pgBouncer metrics exporter for Prometheus, providing 600+ metrics with declarative YAML configuration. 
sidebar: autoCollapse: false navbar: displayTitle: true displayLogo: true logo: path: /logo.png dark: /logo.png width: wide page: # full (100%), wide (90rem), normal (80rem) width: full theme: # light | dark | system default: system displayToggle: true footer: enable: false displayCopyright: true displayPoweredBy: true width: normal # Display the last modification date displayUpdatedDate: true dateFormat: "2006-01-02" # Search # flexsearch is enabled by default search: enable: true type: flexsearch flexsearch: # index page by: content | summary | heading | title index: content # full | forward | reverse | strict # https://github.com/nextapps-de/flexsearch/#tokenizer-prefix-search tokenize: forward editURL: enable: true base: "https://github.com/pgsty/pg_exporter/edit/main/content" toc: displayTags: true highlight: copy: enable: true display: hover comments: enable: true type: giscus # https://giscus.app/ giscus: repo: "pgsty/pg_exporter" repoId: "MDEwOlJlcG9zaXRvcnkyMjYyNzkxOTc" category: "Announcements" categoryId: "DIC_kwDODXy_Hc4Ct5Xv" mapping: pathname strict: 0 reactionsEnabled: 1 emitMetadata: 0 inputPosition: bottom lang: en theme: transparent_dark ================================================ FILE: legacy/README.md ================================================ # Legacy Config Bundle (for PG 9.1 - 9.6) This directory contains the **legacy pg_exporter config bundle** for **PostgreSQL 9.1 - 9.6** (EOL). 
- `pg_exporter.yml`: merged legacy config (ready to use) - `config/`: separated collector definitions (source of truth) ## Generate / Update From the repository root: ```bash make conf9 ``` ## Usage Use the merged legacy config file: ```bash PG_EXPORTER_CONFIG=legacy/pg_exporter.yml \ PG_EXPORTER_URL='postgres://user:pass@host:5432/postgres' \ pg_exporter --auto-discovery --exclude-database=template0,template1 ``` Or load separated collectors directly (directory mode): ```bash PG_EXPORTER_CONFIG=legacy/config \ PG_EXPORTER_URL='postgres://user:pass@host:5432/postgres' \ pg_exporter --auto-discovery --exclude-database=template0,template1 ``` ## Notes - PostgreSQL **9.0 is not supported**. - For PostgreSQL **10+**, use the default config in repo root: `pg_exporter.yml`. ================================================ FILE: legacy/config/0000-doc.yml ================================================ #==============================================================# # Desc : pg_exporter metrics collector definition (Legacy) # Ver : PostgreSQL 9.1 ~ 9.6 and pgbouncer 1.9~1.25+ # Ctime : 2019-12-09 # Mtime : 2026-02-07 # Homepage : https://pigsty.io # Author : Ruohang Feng (rh@vonng.com) # License : Apache-2.0 @ https://github.com/pgsty/pg_exporter # Copyright : 2018-2026 Ruohang Feng / Vonng (rh@vonng.com) #==============================================================# #==============================================================# # 1. Config File #==============================================================# # The configuration file for pg_exporter is a YAML file. # Default configurations are retrieved via following precedence: # 1. command line args: --config= # 2. environment variables: PG_EXPORTER_CONFIG= # 3. pg_exporter.yml (Current directory) # 4. /etc/pg_exporter.yml (config file) # 5. /etc/pg_exporter (config dir) #==============================================================# # 2. 
Config Format #==============================================================# # pg_exporter config could be a single YAML file, or a directory containing a series of separated YAML files. # Each YAML config file consists of one or more metrics Collector definition, which are top-level objects. # If a directory is provided, all YAML in that directory will be merged in alphabetic order. #==============================================================# # 3. Version Compatibility #==============================================================# # Each collector has two optional version compatibility parameters: `min_version` and `max_version`. # These two parameters specify the version compatibility of the collector. If target postgres/pgbouncer's # version is less than `min_version`, or higher than `max_version`, the collector will not be installed. # # These two parameters are using PostgreSQL server version number format, which is a 6-digit integer # format as <major><minor><patch>, i.e. major*10000 + minor*100 + patch. # # For example: # - 90100 stands for 9.1 # - 90600 stands for 9.6 # - 100000 stands for 10.0 # # Version compatibility range is left-inclusive right-exclusive: [min, max) ================================================ FILE: legacy/config/0110-pg.yml ================================================ #==============================================================# # 0110 pg #==============================================================# pg_primary_only: name: pg desc: PostgreSQL basic information (on primary) query: |- SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp, extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime, extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time, (('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn, (('x' || lpad(split_part(pg_current_xlog_insert_location()::text, '/', 1), 8, '0'))::bit(32)::bigint
* 4294967296 + ('x' || lpad(split_part(pg_current_xlog_insert_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS insert_lsn, (('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS write_lsn, (('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn, NULL::BIGINT AS receive_lsn, NULL::BIGINT AS replay_lsn, extract(EPOCH FROM pg_conf_load_time()) AS reload_time, extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time, NULL::FLOAT AS last_replay_time, 0::FLOAT AS lag, pg_is_in_recovery() AS is_in_recovery, FALSE AS is_wal_replay_paused; tags: [ cluster, primary ] ttl: 1 min_version: 90100 max_version: 100000 fatal: true skip: false metrics: - timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" } - uptime: { usage: GAUGE ,description: "seconds since postmaster start" } - boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" } - lsn: { usage: COUNTER ,description: "log sequence number, current write location" } - insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" } - write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" } - flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" } - receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" } - replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" } - reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" } - conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" } - last_replay_time: { usage: GAUGE 
,description: "time when last transaction been replayed" } - lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" } - is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" } - is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" } pg_replica_only: name: pg desc: PostgreSQL basic information (on replica, 9.1+) query: |- SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp, extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime, extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time, (('x' || lpad(split_part(pg_last_xlog_replay_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_last_xlog_replay_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn, NULL::BIGINT AS insert_lsn, NULL::BIGINT AS write_lsn, NULL::BIGINT AS flush_lsn, (('x' || lpad(split_part(pg_last_xlog_receive_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_last_xlog_receive_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS receive_lsn, (('x' || lpad(split_part(pg_last_xlog_replay_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_last_xlog_replay_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS replay_lsn, extract(EPOCH FROM pg_conf_load_time()) AS reload_time, extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time, extract(EPOCH FROM pg_last_xact_replay_timestamp()) AS last_replay_time, CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS lag, pg_is_in_recovery() AS is_in_recovery, pg_is_xlog_replay_paused() AS is_wal_replay_paused; tags: [ cluster, replica ] ttl: 1 min_version: 90100 max_version: 100000 fatal: true skip: false metrics: - timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" } - uptime: { usage: GAUGE 
,description: "seconds since postmaster start" } - boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" } - lsn: { usage: COUNTER ,description: "log sequence number, current write location" } - insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" } - write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" } - flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" } - receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" } - replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" } - reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" } - conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" } - last_replay_time: { usage: GAUGE ,description: "time when last transaction been replayed" } - lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" } - is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" } - is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" } ================================================ FILE: legacy/config/0120-pg_meta.yml ================================================ #==============================================================# # 0120 pg_meta #==============================================================# pg_meta_96: name: pg_meta desc: PostgreSQL meta info for pg 9.6 (with pg_control_system) query: | SELECT (SELECT system_identifier FROM pg_control_system()) AS cluster_id, coalesce((SELECT setting FROM pg_settings WHERE name = 'cluster_name'), 'N/A') AS cluster_name, (SELECT setting FROM pg_settings WHERE name = 'port') AS listen_port, (SELECT setting FROM pg_settings WHERE name = 'data_directory') AS data_dir, (SELECT setting FROM pg_settings WHERE name = 'config_file') AS conf_path, (SELECT setting FROM 
pg_settings WHERE name = 'hba_file') AS hba_path, (SELECT setting FROM pg_settings WHERE name = 'wal_level') AS wal_level, (SELECT setting FROM pg_settings WHERE name = 'server_encoding') AS encoding, (SELECT setting FROM pg_settings WHERE name = 'server_version') AS version, (SELECT setting FROM pg_settings WHERE name = 'server_version_num') AS ver_num, version() AS ver_str, (SELECT setting FROM pg_settings WHERE name = 'shared_preload_libraries') AS extensions, coalesce((SELECT setting FROM pg_settings WHERE name = 'primary_conninfo'), 'N/A') AS primary_conninfo, 1 AS info; ttl: 10 min_version: 90600 tags: [ cluster ] metrics: - cluster_id: { usage: LABEL ,description: "cluster system identifier" } - cluster_name: { usage: LABEL ,description: "cluster name" } - listen_port: { usage: LABEL ,description: "listen port" } - data_dir: { usage: LABEL ,description: "path to data directory" } - conf_path: { usage: LABEL ,description: "path to postgresql.conf" } - hba_path: { usage: LABEL ,description: "path to pg_hba.conf" } - wal_level: { usage: LABEL ,description: "wal level" } - encoding: { usage: LABEL ,description: "server encoding" } - version: { usage: LABEL ,description: "server version in human-readable format" } - ver_num: { usage: LABEL ,description: "server version number in machine-readable format" } - ver_str: { usage: LABEL ,description: "complete version string" } - extensions: { usage: LABEL ,description: "server installed preload libraries" } - primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" } - info: { usage: GAUGE ,description: "constant 1" } pg_meta_91: name: pg_meta desc: PostgreSQL meta info for pg 9.1 - 9.5 query: | SELECT 'N/A' AS cluster_id, coalesce((SELECT setting FROM pg_settings WHERE name = 'cluster_name'), 'N/A') AS cluster_name, (SELECT setting FROM pg_settings WHERE name = 'port') AS listen_port, (SELECT setting FROM pg_settings WHERE name = 'data_directory') AS data_dir, (SELECT 
setting FROM pg_settings WHERE name = 'config_file') AS conf_path, (SELECT setting FROM pg_settings WHERE name = 'hba_file') AS hba_path, (SELECT setting FROM pg_settings WHERE name = 'wal_level') AS wal_level, (SELECT setting FROM pg_settings WHERE name = 'server_encoding') AS encoding, (SELECT setting FROM pg_settings WHERE name = 'server_version') AS version, (SELECT setting FROM pg_settings WHERE name = 'server_version_num') AS ver_num, version() AS ver_str, (SELECT setting FROM pg_settings WHERE name = 'shared_preload_libraries') AS extensions, coalesce((SELECT setting FROM pg_settings WHERE name = 'primary_conninfo'), 'N/A') AS primary_conninfo, 1 AS info; ttl: 10 min_version: 90100 max_version: 90600 tags: [ cluster ] metrics: - cluster_id: { usage: LABEL ,description: "cluster system identifier" } - cluster_name: { usage: LABEL ,description: "cluster name" } - listen_port: { usage: LABEL ,description: "listen port" } - data_dir: { usage: LABEL ,description: "path to data directory" } - conf_path: { usage: LABEL ,description: "path to postgresql.conf" } - hba_path: { usage: LABEL ,description: "path to pg_hba.conf" } - wal_level: { usage: LABEL ,description: "wal level" } - encoding: { usage: LABEL ,description: "server encoding" } - version: { usage: LABEL ,description: "server version in human-readable format" } - ver_num: { usage: LABEL ,description: "server version number in machine-readable format" } - ver_str: { usage: LABEL ,description: "complete version string" } - extensions: { usage: LABEL ,description: "server installed preload libraries" } - primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" } - info: { usage: GAUGE ,description: "constant 1" } ================================================ FILE: legacy/config/0130-pg_setting.yml ================================================ #==============================================================# # 0130 pg_setting 
#==============================================================#
# Key PostgreSQL configuration parameters for PostgreSQL 9.1 - 9.6
# Use scalar subquery on pg_settings for "missing_ok" semantics (return NULL if not exist)
pg_setting:
  name: pg_setting
  desc: PostgreSQL shared configuration parameters (legacy 9.1-9.6)
  # One scalar subquery per parameter: a GUC that does not exist on this
  # server version yields NULL instead of failing the whole query.
  # Size-typed settings are normalized to bytes using pg_settings.unit.
  query: |
    SELECT
      (SELECT setting::int FROM pg_settings WHERE name = 'max_connections') AS max_connections,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_prepared_transactions') AS max_prepared_transactions,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_locks_per_transaction') AS max_locks_per_transaction,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_worker_processes') AS max_worker_processes,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_parallel_workers') AS max_parallel_workers,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_parallel_workers_per_gather') AS max_parallel_workers_per_gather,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_parallel_maintenance_workers') AS max_parallel_maintenance_workers,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_replication_slots') AS max_replication_slots,
      (SELECT setting::int FROM pg_settings WHERE name = 'max_wal_senders') AS max_wal_senders,
      (SELECT setting::int FROM pg_settings WHERE name = 'block_size') AS block_size,
      (SELECT setting::int FROM pg_settings WHERE name = 'wal_block_size') AS wal_block_size,
      (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'segment_size') AS segment_size,
      (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'wal_segment_size') AS wal_segment_size,
      (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'data_checksums') AS data_checksums,
      (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'wal_log_hints') AS wal_log_hints,
      (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'fsync') AS fsync,
      (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'full_page_writes') AS full_page_writes,
      (SELECT CASE setting WHEN 'minimal' THEN 1 WHEN 'archive' THEN 2 WHEN 'hot_standby' THEN 3 ELSE 0 END FROM pg_settings WHERE name = 'wal_level') AS wal_level,
      (SELECT setting::int FROM pg_settings WHERE name = 'checkpoint_segments') AS checkpoint_segments,
      (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'min_wal_size') AS min_wal_size,
      (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'max_wal_size') AS max_wal_size,
      (SELECT setting::int FROM pg_settings WHERE name = 'wal_keep_segments') AS wal_keep_segments,
      (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'shared_buffers') AS shared_buffers,
      (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'work_mem') AS work_mem,
      (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'maintenance_work_mem') AS maintenance_work_mem,
      (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'effective_cache_size') AS effective_cache_size,
      (SELECT CASE setting WHEN 'off' THEN 0 WHEN 'on' THEN 1 WHEN 'always' THEN 2 ELSE -1 END FROM pg_settings WHERE name = 'archive_mode') AS archive_mode,
      (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'autovacuum') AS autovacuum,
      (SELECT setting::int FROM pg_settings WHERE name = 'autovacuum_max_workers') AS autovacuum_max_workers,
      (SELECT setting::int FROM pg_settings WHERE name = 'checkpoint_timeout') AS checkpoint_timeout,
      (SELECT setting::float FROM pg_settings WHERE name = 'checkpoint_completion_target') AS checkpoint_completion_target,
      (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'hot_standby') AS hot_standby,
      (SELECT CASE setting WHEN 'off' THEN 0 WHEN 'local' THEN 1 WHEN 'remote_write' THEN 2 WHEN 'on' THEN 3 WHEN 'remote_apply' THEN 4 ELSE -1 END FROM pg_settings WHERE name = 'synchronous_commit') AS synchronous_commit;
  ttl: 10
  min_version: 90100
  tags: [ cluster ]
  metrics:
    - max_connections: { usage: GAUGE, description: "maximum number of concurrent connections to the database server" }
    - max_prepared_transactions: { usage: GAUGE, description: "maximum number of transactions that can be in the prepared state simultaneously" }
    - max_locks_per_transaction: { usage: GAUGE, description: "maximum number of locks per transaction" }
    - max_worker_processes: { usage: GAUGE, description: "maximum number of background processes (9.4+)" }
    # FIX: max_parallel_workers was added in PostgreSQL 10, not 9.6; it is always NULL on 9.x
    - max_parallel_workers: { usage: GAUGE, description: "maximum number of parallel workers that can be active at one time (10+, NULL on 9.x)" }
    - max_parallel_workers_per_gather: { usage: GAUGE, description: "maximum number of parallel workers per Gather node (9.6+)" }
    - max_parallel_maintenance_workers: { usage: GAUGE, description: "maximum number of parallel maintenance workers (NULL on 9.x)" }
    - max_replication_slots: { usage: GAUGE, description: "maximum number of replication slots (9.4+)" }
    - max_wal_senders: { usage: GAUGE, description: "maximum number of concurrent WAL sender connections" }
    - block_size: { usage: GAUGE, description: "database block size in bytes (default 8192)" }
    - wal_block_size: { usage: GAUGE, description: "WAL block size in bytes" }
    - segment_size: { usage: GAUGE, description: "database file segment size in bytes" }
    - wal_segment_size: { usage: GAUGE, description: "WAL segment size in bytes" }
    - data_checksums: { usage: GAUGE, description: "data checksums enabled, 1=on 0=off (9.3+)" }
    - wal_log_hints: { usage: GAUGE, description: "WAL log hints enabled, 1=on 0=off (9.4+)" }
    - fsync: { usage: GAUGE, description: "fsync enabled (CRITICAL for data safety), 1=on 0=off" }
    - full_page_writes: { usage: GAUGE, description: "full page writes enabled, 1=on 0=off" }
    - wal_level: { usage: GAUGE, description: "WAL level, 1=minimal 2=archive 3=hot_standby" }
    - checkpoint_segments: { usage: GAUGE, description: "number of checkpoint segments (pre-9.5)" }
    - min_wal_size: { usage: GAUGE, description: "minimum WAL size in bytes (9.5+)" }
    - max_wal_size: { usage: GAUGE, description: "maximum WAL size in bytes (9.5+)" }
    - wal_keep_segments: { usage: GAUGE, description: "WAL segments kept for standby replication (pg_basebackup/streaming)" }
    - shared_buffers: { usage: GAUGE, description: "shared buffer size in bytes" }
    - work_mem: { usage: GAUGE, description: "work memory size in bytes" }
    - maintenance_work_mem: { usage: GAUGE, description: "maintenance work memory size in bytes" }
    - effective_cache_size: { usage: GAUGE, description: "planner's assumption about effective OS cache size in bytes" }
    - archive_mode: { usage: GAUGE, description: "archive mode, 0=off 1=on 2=always" }
    - autovacuum: { usage: GAUGE, description: "autovacuum enabled, 1=on 0=off" }
    - autovacuum_max_workers: { usage: GAUGE, description: "maximum number of autovacuum worker processes" }
    - checkpoint_timeout: { usage: GAUGE, description: "checkpoint timeout in seconds" }
    - checkpoint_completion_target: { usage: GAUGE, description: "checkpoint completion target (0.0-1.0)" }
    - hot_standby: { usage: GAUGE, description: "hot standby mode enabled, 1=on 0=off" }
    -
synchronous_commit: { usage: GAUGE ,description: "synchronous commit level, 0=off 1=local 2=remote_write 3=on 4=remote_apply" }
================================================
FILE: legacy/config/0210-pg_repl.yml
================================================
#==============================================================#
# 0210 pg_repl
#==============================================================#
pg_repl_94:
  name: pg_repl
  desc: PostgreSQL replication stat metrics 9.4 - 9.6 (with backend_xmin)
  # LSN text values (X/Y) are decoded to absolute byte positions:
  # high word * 2^32 + low word, each parsed as a 32-bit hex number.
  query: |-
    SELECT appname, usename, address, pid, client_port, state, sync_state, sync_priority, backend_xmin,
           lsn,
           lsn - sent_lsn   AS sent_diff,
           lsn - write_lsn  AS write_diff,
           lsn - flush_lsn  AS flush_diff,
           lsn - replay_lsn AS replay_diff,
           sent_lsn, write_lsn, flush_lsn, replay_lsn,
           0::FLOAT AS write_lag, 0::FLOAT AS flush_lag, 0::FLOAT AS replay_lag,
           extract(EPOCH FROM current_timestamp) AS "time",
           extract(EPOCH FROM backend_start) AS launch_time
    FROM (
      SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port,
             CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
             CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state,
             sync_priority,
             backend_xmin::TEXT::BIGINT AS backend_xmin,
             (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn,
             (('x' || lpad(split_part(sent_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(sent_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS sent_lsn,
             (('x' || lpad(split_part(write_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(write_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS write_lsn,
             (('x' || lpad(split_part(flush_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(flush_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn,
             (('x' || lpad(split_part(replay_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(replay_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS replay_lsn,
             backend_start
      FROM pg_stat_replication,
           (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90400
  max_version: 100000
  tags: [ cluster ]
  metrics:
    - appname: { usage: LABEL, description: "Name of the application that is connected to this WAL sender" }
    - usename: { usage: LABEL, description: "Name of the user logged into this WAL sender process" }
    - address: { usage: LABEL, description: "IP address of the client connected to this WAL sender, localhost for unix socket" }
    - pid: { usage: LABEL, description: "Process ID of the WAL sender process" }
    - client_port: { usage: GAUGE, description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" }
    - state: { usage: GAUGE, description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" }
    - sync_state: { usage: GAUGE, description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" }
    - sync_priority: { usage: GAUGE, description: "Priority of this standby server for being chosen as the synchronous standby" }
    - backend_xmin: { usage: COUNTER, description: "This standby's xmin horizon reported by hot_standby_feedback." }
    - lsn: { usage: COUNTER, description: "Current log position on this server" }
    - sent_diff: { usage: GAUGE, description: "Last log position sent to this standby server diff with current lsn" }
    - write_diff: { usage: GAUGE, description: "Last log position written to disk by this standby server diff with current lsn" }
    - flush_diff: { usage: GAUGE, description: "Last log position flushed to disk by this standby server diff with current lsn" }
    - replay_diff: { usage: GAUGE, description: "Last log position replayed into the database on this standby server diff with current lsn" }
    - sent_lsn: { usage: COUNTER, description: "Last write-ahead log location sent on this connection" }
    - write_lsn: { usage: COUNTER, description: "Last write-ahead log location written to disk by this standby server" }
    - flush_lsn: { usage: COUNTER, description: "Last write-ahead log location flushed to disk by this standby server" }
    - replay_lsn: { usage: COUNTER, description: "Last write-ahead log location replayed into the database on this standby server" }
    - write_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (N/A on 9.x)" }
    - flush_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (N/A on 9.x)" }
    - replay_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it (N/A on 9.x)" }
    - time: { usage: COUNTER, description: "Current timestamp in unix epoch" }
    - launch_time: { usage: COUNTER, description: "Time when this process was started, i.e., when the client connected to this WAL sender" }

pg_repl_92:
  name: pg_repl
  desc: PostgreSQL replication stat metrics 9.2 - 9.3
  # Same shape as pg_repl_94; backend_xmin is not exposed before 9.4 so it is
  # hard-wired to 0 to keep the metric set identical across branches.
  query: |-
    SELECT appname, usename, address, pid, client_port, state, sync_state, sync_priority, backend_xmin,
           lsn,
           lsn - sent_lsn   AS sent_diff,
           lsn - write_lsn  AS write_diff,
           lsn - flush_lsn  AS flush_diff,
           lsn - replay_lsn AS replay_diff,
           sent_lsn, write_lsn, flush_lsn, replay_lsn,
           0::FLOAT AS write_lag, 0::FLOAT AS flush_lag, 0::FLOAT AS replay_lag,
           extract(EPOCH FROM current_timestamp) AS "time",
           extract(EPOCH FROM backend_start) AS launch_time
    FROM (
      SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port,
             CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
             CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state,
             sync_priority,
             0::BIGINT AS backend_xmin,
             (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn,
             (('x' || lpad(split_part(sent_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(sent_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS sent_lsn,
             (('x' || lpad(split_part(write_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(write_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS write_lsn,
             (('x' || lpad(split_part(flush_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(flush_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn,
             (('x' || lpad(split_part(replay_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(replay_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS replay_lsn,
             backend_start
      FROM pg_stat_replication,
           (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90200
  max_version: 90400
  tags: [ cluster ]
  metrics:
    - appname: { usage: LABEL, description: "Name of the application that is connected to this WAL sender" }
    - usename: { usage: LABEL, description: "Name of the user logged into this WAL sender process" }
    - address: { usage: LABEL, description: "IP address of the client connected to this WAL sender, localhost for unix socket" }
    - pid: { usage: LABEL, description: "Process ID of the WAL sender process" }
    - client_port: { usage: GAUGE, description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" }
    - state: { usage: GAUGE, description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" }
    - sync_state: { usage: GAUGE, description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" }
    - sync_priority: { usage: GAUGE, description: "Priority of this standby server for being chosen as the synchronous standby" }
    - backend_xmin: { usage: COUNTER, description: "This standby's xmin horizon reported by hot_standby_feedback (N/A before 9.4)" }
    - lsn: { usage: COUNTER, description: "Current log position on this server" }
    - sent_diff: { usage: GAUGE, description: "Last log position sent to this standby server diff with current lsn" }
    - write_diff: { usage: GAUGE, description: "Last log position written to disk by this standby server diff with current lsn" }
    - flush_diff: { usage: GAUGE, description: "Last log position flushed to disk by this standby server diff with current lsn" }
    - replay_diff: { usage: GAUGE, description: "Last log position replayed into the database on this standby server diff with current lsn" }
    - sent_lsn: { usage: COUNTER, description: "Last write-ahead log location sent on this connection" }
    - write_lsn: { usage: COUNTER, description: "Last write-ahead log location written to disk by this standby server" }
    - flush_lsn: { usage: COUNTER, description: "Last write-ahead log location flushed to disk by this standby server" }
    - replay_lsn: { usage: COUNTER, description: "Last write-ahead log location replayed into the database on this standby server" }
    - write_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (N/A on 9.x)" }
    - flush_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (N/A on 9.x)" }
    - replay_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it (N/A on 9.x)" }
    - time: { usage: COUNTER, description: "Current timestamp in unix epoch" }
    - launch_time: { usage: COUNTER, description: "Time when this process was started, i.e., when the client connected to this WAL sender" }

pg_repl_91:
  name: pg_repl
  desc: PostgreSQL replication stat metrics 9.1 (procpid, no state/sync columns)
  # 9.1 names the backend pid column "procpid"; state/sync columns are
  # stubbed with zeroes here to keep the metric set identical across branches.
  query: |-
    SELECT appname, usename, address, pid, client_port, state, sync_state, sync_priority, backend_xmin,
           lsn,
           lsn - sent_lsn   AS sent_diff,
           lsn - write_lsn  AS write_diff,
           lsn - flush_lsn  AS flush_diff,
           lsn - replay_lsn AS replay_diff,
           sent_lsn, write_lsn, flush_lsn, replay_lsn,
           0::FLOAT AS write_lag, 0::FLOAT AS flush_lag, 0::FLOAT AS replay_lag,
           extract(EPOCH FROM current_timestamp) AS "time",
           extract(EPOCH FROM backend_start) AS launch_time
    FROM (
      SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, procpid::TEXT AS pid, client_port,
             0::INT AS state,
             0::INT AS sync_state,
             0::INT AS sync_priority,
             0::BIGINT AS backend_xmin,
             (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn,
             (('x' || lpad(split_part(sent_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(sent_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS sent_lsn,
             (('x' || lpad(split_part(write_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(write_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS write_lsn,
             (('x' || lpad(split_part(flush_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(flush_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn,
             (('x' || lpad(split_part(replay_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(replay_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS replay_lsn,
             backend_start
      FROM pg_stat_replication,
           (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90100
  max_version: 90200
  tags: [ cluster ]
  metrics:
    - appname: { usage: LABEL, description: "Name of the application that is connected to this WAL sender" }
    - usename: { usage: LABEL, description: "Name of the user logged into this WAL sender process" }
    - address: { usage: LABEL, description: "IP address of the client connected to this WAL sender, localhost for unix socket" }
    - pid: { usage: LABEL, description: "Process ID of the WAL sender process" }
    - client_port: { usage: GAUGE, description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" }
    - state: { usage: GAUGE, description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" }
    - sync_state: { usage: GAUGE, description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" }
    - sync_priority: { usage: GAUGE, description: "Priority of this standby server for being chosen as the synchronous standby" }
    - backend_xmin: { usage: COUNTER, description: "This standby's xmin horizon reported by hot_standby_feedback (N/A before 9.4)" }
    - lsn: { usage: COUNTER, description: "Current log position on this server" }
    - sent_diff: { usage: GAUGE, description: "Last log position sent to this standby server diff with current lsn" }
    - write_diff: { usage: GAUGE, description: "Last log position written to disk by this standby server diff with current lsn" }
    - flush_diff: { usage: GAUGE, description: "Last log position flushed to disk by this standby server diff with current lsn" }
    - replay_diff: { usage: GAUGE, description: "Last log position replayed into the database on this standby server diff with current lsn" }
    - sent_lsn: { usage: COUNTER, description: "Last write-ahead log location sent on this connection" }
    - write_lsn: { usage: COUNTER, description: "Last write-ahead log location written to disk by this standby server" }
    - flush_lsn: { usage: COUNTER, description: "Last write-ahead log location flushed to disk by this standby server" }
    - replay_lsn: { usage: COUNTER, description: "Last write-ahead log location replayed into the database on this standby server" }
    - write_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (N/A on 9.x)" }
    - flush_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (N/A on 9.x)" }
    - replay_lag: { usage: GAUGE, description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it (N/A on 9.x)" }
    - time: { usage: COUNTER, description: "Current timestamp in unix epoch" }
    - launch_time: { usage: COUNTER, description: "Time when this process was started, i.e., when the client connected to this WAL sender" }

================================================
FILE: legacy/config/0220-pg_sync_standby.yml
================================================
#==============================================================#
# 0220 pg_sync_standby
#==============================================================#
pg_sync_standby:
  name: pg_sync_standby
  desc: PostgreSQL synchronous standby status and names
  # Surfaces synchronous_standby_names as a label, plus a 0/1 flag telling
  # whether synchronous replication is configured at all.
  query: |-
    SELECT CASE WHEN names <> '' THEN names ELSE '' END AS names,
           CASE WHEN names <> '' THEN 1 ELSE 0 END AS enabled
    FROM (SELECT current_setting('synchronous_standby_names') AS names) n;
  ttl: 10
  min_version: 90100
  tags: [ cluster ]
  metrics:
    - names: { usage: LABEL, description: "List of standby servers that can support synchronous replication" }
    - enabled: { usage: GAUGE, description: "Synchronous commit enabled, 1 if enabled, 0 if disabled" }

================================================
FILE: legacy/config/0230-pg_downstream.yml
================================================
#==============================================================#
# 0230 pg_downstream
#==============================================================#
pg_downstream:
  name: pg_downstream
  desc: PostgreSQL replication client count (no state column on 9.1)
  # 9.1 branch: report every replication client under a single fixed state.
  query: |-
    SELECT 'connected' AS state, count(*) AS count FROM pg_stat_replication;
  ttl: 10
  min_version: 90100
  max_version: 90200
  tags: [ cluster ]
  metrics:
    - state: { usage: LABEL, description: "Replication client state" }
    - count: { usage: GAUGE, description: "Count of replication clients by state" }

pg_downstream_92:
  name: pg_downstream
  desc: PostgreSQL replication client count (group by state)
  # 9.2+ branch: break the client count down by actual WAL sender state.
  query: |-
    SELECT state, count(*) AS count FROM pg_stat_replication GROUP BY state;
  ttl: 10
  min_version: 90200
  tags: [ cluster ]
  metrics:
    - state: { usage: LABEL, description: "Replication client state" }
    - count: { usage: GAUGE, description: "Count of replication clients by state" }

================================================
FILE: legacy/config/0240-pg_slot.yml
================================================
#==============================================================#
# 0240 pg_slot
#==============================================================#
pg_slot_96:
  name: pg_slot
  desc: PostgreSQL replication slot metrics 9.6 (with active_pid, confirmed_flush_lsn)
  # LSN text values are decoded to absolute byte positions so that
  # retained bytes can be computed as current_lsn - restart_lsn.
  query: |-
    SELECT slot_name, slot_type, plugin, database AS datname, datoid, active_pid, active, FALSE AS temporary,
           xmin::TEXT::BIGINT AS xmin,
           catalog_xmin::TEXT::BIGINT AS catalog_xmin,
           restart_lsn, confirm_lsn,
           current_lsn - restart_lsn AS retained_bytes
    FROM (
      SELECT slot_name, slot_type, plugin, database, datoid, active_pid, active, xmin, catalog_xmin,
             (('x' || lpad(split_part(restart_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(restart_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS restart_lsn,
             (('x' || lpad(split_part(confirmed_flush_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(confirmed_flush_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS confirm_lsn,
             (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS current_lsn
      FROM pg_replication_slots,
           (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90600
  max_version: 100000
  tags: [ cluster, primary ]
  metrics:
    - slot_name: { usage: LABEL, description: "A unique, cluster-wide identifier for the replication slot" }
    - slot_type: { usage: LABEL, description: "The slot type, physical or logical" }
    - plugin: { usage: LABEL, description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
    - datname: { usage: LABEL, description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
    - datoid: { usage: GAUGE, description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
    - active_pid: { usage: GAUGE, description: "The process ID of the session streaming data for this slot. NULL if inactive." }
    - active: { usage: GAUGE, description: "True(1) if this slot is currently actively being used" }
    - temporary: { usage: GAUGE, description: "True(1) if this is a temporary replication slot (N/A on 9.x, always 0)" }
    - xmin: { usage: COUNTER, description: "The oldest transaction that this slot needs the database to retain." }
    - catalog_xmin: { usage: COUNTER, description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
    - restart_lsn: { usage: COUNTER, description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
    - confirm_lsn: { usage: COUNTER, description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
    - retained_bytes: { usage: GAUGE, description: "Size of bytes that retained for this slot" }

pg_slot_95:
  name: pg_slot
  desc: PostgreSQL replication slot metrics 9.5 (no confirmed_flush_lsn)
  # confirmed_flush_lsn only exists from 9.6 on, so confirm_lsn is stubbed to 0.
  query: |-
    SELECT slot_name, slot_type, plugin, database AS datname, datoid, active_pid, active, FALSE AS temporary,
           xmin::TEXT::BIGINT AS xmin,
           catalog_xmin::TEXT::BIGINT AS catalog_xmin,
           restart_lsn, confirm_lsn,
           current_lsn - restart_lsn AS retained_bytes
    FROM (
      SELECT slot_name, slot_type, plugin, database, datoid, active_pid, active, xmin, catalog_xmin,
             (('x' || lpad(split_part(restart_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(restart_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS restart_lsn,
             0::BIGINT AS confirm_lsn,
             (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS current_lsn
      FROM pg_replication_slots,
           (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90500
  max_version: 90600
  tags: [ cluster, primary ]
  metrics:
    - slot_name: { usage: LABEL, description: "A unique, cluster-wide identifier for the replication slot" }
    - slot_type: { usage: LABEL, description: "The slot type, physical or logical" }
    - plugin: { usage: LABEL, description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
    - datname: { usage: LABEL, description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
    - datoid: { usage: GAUGE, description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
    - active_pid: { usage: GAUGE, description: "The process ID of the session streaming data for this slot. NULL if inactive." }
    - active: { usage: GAUGE, description: "True(1) if this slot is currently actively being used" }
    - temporary: { usage: GAUGE, description: "True(1) if this is a temporary replication slot (N/A on 9.x, always 0)" }
    - xmin: { usage: COUNTER, description: "The oldest transaction that this slot needs the database to retain." }
    - catalog_xmin: { usage: COUNTER, description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
    - restart_lsn: { usage: COUNTER, description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
    - confirm_lsn: { usage: COUNTER, description: "Confirmed flush lsn (N/A before 9.6, always 0)" }
    - retained_bytes: { usage: GAUGE, description: "Size of bytes that retained for this slot" }

pg_slot_94:
  name: pg_slot
  desc: PostgreSQL replication slot metrics 9.4 (no active_pid, confirmed_flush_lsn)
  # 9.4 lacks both active_pid (added 9.5) and confirmed_flush_lsn (added 9.6).
  query: |-
    SELECT slot_name, slot_type, plugin, database AS datname, datoid, active_pid, active, FALSE AS temporary,
           xmin::TEXT::BIGINT AS xmin,
           catalog_xmin::TEXT::BIGINT AS catalog_xmin,
           restart_lsn, confirm_lsn,
           current_lsn - restart_lsn AS retained_bytes
    FROM (
      SELECT slot_name, slot_type, plugin, database, datoid, NULL::INT AS active_pid, active, xmin, catalog_xmin,
             (('x' || lpad(split_part(restart_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(restart_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS restart_lsn,
             0::BIGINT AS confirm_lsn,
             (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS current_lsn
      FROM pg_replication_slots,
           (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90400
  max_version: 90500
  tags: [ cluster, primary ]
  metrics:
    - slot_name: { usage: LABEL, description: "A unique, cluster-wide identifier for the replication slot" }
    - slot_type: { usage: LABEL, description: "The slot type, physical or logical" }
    - plugin: { usage: LABEL, description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
    - datname: { usage: LABEL, description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
    - datoid: { usage: GAUGE, description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
    - active_pid: { usage: GAUGE, description: "Process ID is not available before 9.5 (NULL)" }
    - active: { usage: GAUGE, description: "True(1) if this slot is currently actively being used" }
    - temporary: { usage: GAUGE, description: "True(1) if this is a temporary replication slot (N/A on 9.x, always 0)" }
    - xmin: { usage: COUNTER, description: "The oldest transaction that this slot needs the database to retain." }
    - catalog_xmin: { usage: COUNTER, description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain."
} - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "Confirmed flush lsn (N/A before 9.6, always 0)" } - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" } ================================================ FILE: legacy/config/0250-pg_recv.yml ================================================ #==============================================================# # 0250 pg_recv #==============================================================# pg_recv_96: name: pg_recv desc: PostgreSQL walreceiver metrics (9.6 - 12) query: |- SELECT (regexp_match(conninfo, '.*host=(\S+).*'))[1] AS sender_host, (regexp_match(conninfo, '.*port=(\S+).*'))[1] AS sender_port, coalesce(slot_name, 'NULL') AS slot_name, pid, CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, (('x' || lpad(split_part(receive_start_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(receive_start_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS init_lsn, receive_start_tli AS init_tli, (('x' || lpad(split_part(received_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(received_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn, received_tli AS flush_tli, (('x' || lpad(split_part(latest_end_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(latest_end_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS reported_lsn, last_msg_send_time AS msg_send_time, last_msg_receipt_time AS msg_recv_time, latest_end_time AS reported_time, now() AS time FROM pg_stat_wal_receiver; ttl: 10 min_version: 90600 max_version: 130000 tags: [ cluster, replica ] metrics: - sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is 
connected to" } - sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." } - slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" } - pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" } - state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" } - init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" } - init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" } - flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } - time: { usage: GAUGE ,description: "Time of current snapshot" } ================================================ FILE: legacy/config/0270-pg_origin.yml ================================================ #==============================================================# # 0270 pg_origin #==============================================================# # skip by default, require additional privilege setup # GRANT SELECT ON pg_replication_origin, pg_replication_origin_status TO pg_monitor; pg_origin: name: pg_origin desc: PostgreSQL replay state (approximate) for a certain origin query: |- SELECT roname, (('x' || 
lpad(split_part(remote_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(remote_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS remote_lsn, (('x' || lpad(split_part(local_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(local_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS local_lsn FROM pg_replication_origin o LEFT JOIN pg_replication_origin_status os ON o.roident = os.local_id; ttl: 10 min_version: 90500 skip: true tags: [ cluster ] metrics: - roname: { usage: LABEL ,description: "The external, user defined, name of a replication origin." } - remote_lsn: { usage: COUNTER ,description: "The origin node's LSN up to which data has been replicated." } - local_lsn: { usage: COUNTER ,description: "This node's LSN at which remote_lsn has been replicated." } ================================================ FILE: legacy/config/0310-pg_size.yml ================================================ #==============================================================# # 0310 pg_size #==============================================================# pg_size: name: pg_size desc: PostgreSQL database size (legacy 9.1-9.6) query: |- SELECT datname, pg_database_size(oid) AS bytes FROM pg_database; ttl: 60 timeout: 1 min_version: 90100 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Database name" } - bytes: { usage: GAUGE ,description: "Database size in bytes" } ================================================ FILE: legacy/config/0320-pg_archiver.yml ================================================ #==============================================================# # 0320 pg_archiver #==============================================================# pg_archiver: name: pg_archiver desc: PostgreSQL archiver process statistics query: |- SELECT archived_count AS finish_count,failed_count, extract(epoch FROM last_archived_time) AS finish_time, extract(epoch FROM last_failed_time) AS failed_time, extract(epoch 
FROM stats_reset) AS reset_time FROM pg_stat_archiver; ttl: 60 min_version: 90400 tags: [ cluster ] metrics: - finish_count: { usage: COUNTER ,description: "Number of WAL files that have been successfully archived" } - failed_count: { usage: COUNTER ,description: "Number of failed attempts for archiving WAL files" } - finish_time: { usage: GAUGE ,description: "Time of the last successful archive operation" } - failed_time: { usage: GAUGE ,description: "Time of the last failed archival operation" } - reset_time: { usage: GAUGE ,description: "Time at which archive statistics were last reset" } ================================================ FILE: legacy/config/0330-pg_bgwriter.yml ================================================ #==============================================================# # 0330 pg_bgwriter #==============================================================# # https://pgpedia.info/p/pg_stat_bgwriter.html pg_bgwriter_94: name: pg_bgwriter desc: "PostgreSQL background writer metrics (PG 9.4-16)" query: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, buffers_clean, buffers_backend, maxwritten_clean, buffers_backend_fsync, buffers_alloc, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 90400 max_version: 170000 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - checkpoint_sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - 
buffers_checkpoint: { usage: COUNTER ,description: "Number of buffers written during checkpoints" } - buffers_clean: { usage: COUNTER ,description: "Number of buffers written by the background writer" } - buffers_backend: { usage: COUNTER ,description: "Number of buffers written directly by a backend" } - maxwritten_clean: { usage: COUNTER ,description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" } - buffers_backend_fsync: { usage: COUNTER ,description: "Number of times a backend had to execute its own fsync call" } - buffers_alloc: { usage: COUNTER ,description: "Number of buffers allocated" } - reset_time: { usage: GAUGE ,description: "Time at which bgwriter statistics were last reset" } pg_bgwriter_91: name: pg_bgwriter desc: "PostgreSQL background writer metrics (PG 9.1-9.3)" query: SELECT checkpoints_timed, checkpoints_req, 0::BIGINT AS checkpoint_write_time, 0::BIGINT AS checkpoint_sync_time, buffers_checkpoint, buffers_clean, buffers_backend, maxwritten_clean, buffers_backend_fsync, buffers_alloc, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 90100 max_version: 90400 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time spent writing checkpoint files, in seconds (N/A on 9.1-9.3, always 0)" } - checkpoint_sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time spent syncing checkpoint files, in seconds (N/A on 9.1-9.3, always 0)" } - buffers_checkpoint: { usage: COUNTER ,description: "Number of buffers written during checkpoints" } - buffers_clean: { usage: COUNTER ,description: "Number of buffers written by the background writer" } - buffers_backend: { 
usage: COUNTER ,description: "Number of buffers written directly by a backend" } - maxwritten_clean: { usage: COUNTER ,description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" } - buffers_backend_fsync: { usage: COUNTER ,description: "Number of times a backend had to execute its own fsync call" } - buffers_alloc: { usage: COUNTER ,description: "Number of buffers allocated" } - reset_time: { usage: GAUGE ,description: "Time at which bgwriter statistics were last reset" } ================================================ FILE: legacy/config/0331-pg_checkpointer.yml ================================================ #==============================================================# # 0331 pg_checkpointer #==============================================================# pg_checkpointer_94: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 9.4-16" query: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 90400 max_version: 170000 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,rename: req ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,rename: write_time ,scale: 1e-3 ,description: "Total amount of time that has been spent writing checkpoint files, in seconds" } - checkpoint_sync_time: { usage: COUNTER ,rename: sync_time ,scale: 1e-3 ,description: "Total amount of time that has been spent synchronizing checkpoint files to disk, in seconds" } - buffers_checkpoint: { usage: COUNTER ,rename: buffers_written ,description: "Number of buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which 
checkpointer statistics were last reset" } pg_checkpointer_91: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 9.1-9.3" query: SELECT checkpoints_timed, checkpoints_req, 0::BIGINT AS checkpoint_write_time, 0::BIGINT AS checkpoint_sync_time, buffers_checkpoint, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 90100 max_version: 90400 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,rename: req ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,rename: write_time ,scale: 1e-3 ,description: "Total amount of time that has been spent writing checkpoint files, in seconds (N/A on 9.1-9.3, always 0)" } - checkpoint_sync_time: { usage: COUNTER ,rename: sync_time ,scale: 1e-3 ,description: "Total amount of time that has been spent synchronizing checkpoint files to disk, in seconds (N/A on 9.1-9.3, always 0)" } - buffers_checkpoint: { usage: COUNTER ,rename: buffers_written ,description: "Number of buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } ================================================ FILE: legacy/config/0340-pg_ssl.yml ================================================ #==============================================================# # 0340 pg_ssl #==============================================================# pg_ssl: name: pg_ssl desc: PostgreSQL SSL client connection count query: | SELECT count(*) FILTER (WHERE ssl) AS enabled, count(*) FILTER ( WHERE NOT ssl) AS disabled FROM pg_stat_ssl; ttl: 10 min_version: 90500 tags: [ cluster ] metrics: - enabled: { usage: GAUGE ,description: "Number of client connection that use ssl" } - disabled: { usage: GAUGE ,description: "Number of client 
connection that does not use ssl" } ================================================ FILE: legacy/config/0350-pg_checkpoint.yml ================================================ #==============================================================# # 0350 pg_checkpoint #==============================================================# pg_checkpoint: name: pg_checkpoint desc: checkpoint information from pg_control_checkpoint (9.6) query: |- SELECT (('x' || lpad(split_part(checkpoint_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(checkpoint_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS checkpoint_lsn, (('x' || lpad(split_part(redo_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(redo_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS redo_lsn, timeline_id AS tli, prev_timeline_id AS prev_tli, full_page_writes, split_part(next_xid, ':', 1) AS next_xid_epoch, split_part(next_xid, ':', 2) AS next_xid, next_oid::BIGINT, next_multixact_id::text::BIGINT, next_multi_offset::text::BIGINT, oldest_xid::text::BIGINT, oldest_xid_dbid::text::BIGINT, oldest_active_xid::text::BIGINT, oldest_multi_xid::text::BIGINT, oldest_multi_dbid::BIGINT, oldest_commit_ts_xid::text::BIGINT, newest_commit_ts_xid::text::BIGINT, checkpoint_time AS time, extract(epoch from now() - checkpoint_time) AS elapse FROM pg_control_checkpoint(); ttl: 60 min_version: 90600 tags: [ cluster ] metrics: - checkpoint_lsn: { usage: COUNTER ,description: "Latest checkpoint location" } - redo_lsn: { usage: COUNTER ,description: "Latest checkpoint's REDO location" } - tli: { usage: COUNTER ,description: "Latest checkpoint's TimeLineID" } - prev_tli: { usage: COUNTER ,description: "Latest checkpoint's PrevTimeLineID" } - full_page_writes: { usage: GAUGE ,description: "Latest checkpoint's full_page_writes enabled" } - next_xid_epoch: { usage: COUNTER ,description: "Latest checkpoint's NextXID epoch" } - next_xid: { usage: COUNTER 
,description: "Latest checkpoint's NextXID xid" } - next_oid: { usage: COUNTER ,description: "Latest checkpoint's NextOID" } - next_multixact_id: { usage: COUNTER ,description: "Latest checkpoint's NextMultiXactId" } - next_multi_offset: { usage: COUNTER ,description: "Latest checkpoint's NextMultiOffset" } - oldest_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestXID" } - oldest_xid_dbid: { usage: GAUGE ,description: "Latest checkpoint's oldestXID's DB OID" } - oldest_active_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestActiveXID" } - oldest_multi_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestMultiXid" } - oldest_multi_dbid: { usage: GAUGE ,description: "Latest checkpoint's oldestMulti's DB OID" } - oldest_commit_ts_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestCommitTsXid" } - newest_commit_ts_xid: { usage: COUNTER ,description: "Latest checkpoint's newestCommitTsXid" } - time: { usage: COUNTER ,description: "Time of latest checkpoint" } - elapse: { usage: GAUGE ,description: "Seconds elapsed since latest checkpoint in seconds" } ================================================ FILE: legacy/config/0355-pg_timeline.yml ================================================ #==============================================================# # 0355 pg_timeline #==============================================================# pg_timeline: name: pg_timeline desc: Current timeline ID from primary or replica query: | SELECT COALESCE( (SELECT received_tli FROM pg_stat_wal_receiver), (SELECT timeline_id FROM pg_control_checkpoint()) ) AS id; ttl: 10 min_version: 90600 tags: [ cluster ] metrics: - id: { usage: GAUGE ,description: "Current timeline ID" } ================================================ FILE: legacy/config/0360-pg_recovery.yml ================================================ #==============================================================# # 0360 pg_recovery 
#==============================================================#
pg_recovery:
  name: pg_recovery
  desc: PostgreSQL control recovery metrics (9.6)
  query: |
    SELECT min_recovery_end_timeline AS min_timeline,
           (('x' || lpad(split_part(min_recovery_end_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(min_recovery_end_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS min_lsn,
           (('x' || lpad(split_part(backup_start_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(backup_start_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS backup_start_lsn,
           (('x' || lpad(split_part(backup_end_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(backup_end_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS backup_end_lsn,
           end_of_backup_record_required AS require_record
    FROM pg_control_recovery();
  ttl: 10
  min_version: 90600
  tags: [ cluster, replica ]
  metrics:
    - min_timeline: { usage: COUNTER ,description: "Min recovery ending loc's timeline" }
    - min_lsn: { usage: COUNTER ,description: "Minimum recovery ending location" }
    - backup_start_lsn: { usage: COUNTER ,description: "Backup start location" }
    - backup_end_lsn: { usage: COUNTER ,description: "Backup end location" }
    - require_record: { usage: GAUGE ,description: "End-of-backup record required" }


================================================
FILE: legacy/config/0410-pg_activity.yml
================================================

#==============================================================#
# 0410 pg_activity
#==============================================================#
pg_activity_92:
  name: pg_activity
  desc: PostgreSQL backend activity group by database and state (9.2+)
  query: |-
    SELECT datname, state,
           coalesce(count, 0) AS count,
           coalesce(max_duration, 0) AS max_duration,
           coalesce(max_tx_duration, 0) AS max_tx_duration,
           coalesce(max_conn_duration, 0) AS max_conn_duration
    FROM (SELECT d.datname, a.state FROM pg_database d, unnest(ARRAY ['active','idle','idle in transaction','idle in transaction (aborted)','fastpath function call','disabled']) a(state) WHERE d.datallowconn AND NOT d.datistemplate) base
    LEFT JOIN (SELECT datname, state, count(*) AS count,
                      max(extract(epoch from now() - state_change)) AS max_duration,
                      max(extract(epoch from now() - xact_start)) AS max_tx_duration,
                      max(extract(epoch from now() - backend_start)) AS max_conn_duration
               FROM pg_stat_activity WHERE pid <> pg_backend_pid() GROUP BY 1,2) data USING (datname,state);
  ttl: 10
  min_version: 90200
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" }
    - state: { usage: LABEL ,description: "Current overall state of this backend." }
    - count: { usage: GAUGE ,description: "Count of connection among (datname,state)" }
    - max_duration: { usage: GAUGE ,description: "Max duration since last state change among (datname, state)" }
    - max_tx_duration: { usage: GAUGE ,description: "Max transaction duration since state change among (datname, state)" }
    - max_conn_duration: { usage: GAUGE ,description: "Max backend session duration since state change among (datname, state)" }

pg_activity_91:
  name: pg_activity
  desc: PostgreSQL backend activity group by database (9.1)
  # 9.1 has neither a state column nor pid (it uses procpid)
  query: |
    SELECT datname, 'active' AS state, count(*) AS count,
           max(extract(epoch from now() - query_start)) AS max_duration,
           max(extract(epoch from now() - xact_start)) AS max_tx_duration,
           max(extract(epoch from now() - backend_start)) AS max_conn_duration
    FROM pg_stat_activity WHERE procpid <> pg_backend_pid() AND datname IS NOT NULL GROUP BY datname;
  ttl: 10
  min_version: 90100
  max_version: 90200
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" }
    - state: { usage: LABEL ,description: "Current overall state of this backend (always active on 9.1)" }
    - count: { usage: GAUGE ,description: "Count of connection among (datname,state)" }
    - max_duration: { usage: GAUGE ,description: "Max duration since query start among (datname)" }
    - max_tx_duration: { usage: GAUGE ,description: "Max transaction duration among (datname)" }
    - max_conn_duration: { usage: GAUGE ,description: "Max backend session duration among (datname)" }


================================================
FILE: legacy/config/0420-pg_wait.yml
================================================

#==============================================================#
# 0420 pg_wait
#==============================================================#
pg_wait_96:
  name: pg_wait
  desc: PostgreSQL backend client count group by wait event type (9.6)
  query: |
    SELECT coalesce(datname, '_system') AS datname, coalesce(wait_event_type, 'Running') AS event, count(*) AS count
    FROM pg_stat_activity GROUP BY 1, 2;
  ttl: 10
  min_version: 90600
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database, _system for global process" }
    - event: { usage: LABEL ,description: "Wait event type" }
    - count: { usage: GAUGE ,description: "Count of WaitEvent on target database" }

pg_wait_91:
  name: pg_wait
  desc: PostgreSQL backend client count group by waiting flag (9.1-9.5)
  query: |
    SELECT coalesce(datname, '_system') AS datname, CASE WHEN waiting THEN 'Waiting' ELSE 'Running' END AS event, count(*) AS count
    FROM pg_stat_activity GROUP BY 1, 2;
  ttl: 10
  min_version: 90100
  max_version: 90600
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database, _system for global process" }
    - event: { usage: LABEL ,description: "Waiting flag, Waiting or Running" }
    - count: { usage: GAUGE ,description: "Backend count group by waiting flag" }


================================================
FILE: legacy/config/0440-pg_xact.yml
================================================

#==============================================================#
# 0440 pg_xact
#==============================================================#
pg_xact:
  name: pg_xact
  desc: PostgreSQL transaction identifier metrics
  query: |
    WITH snap(v) AS (SELECT txid_current_snapshot()),
         xset(v) AS (SELECT txid_snapshot_xip(v) FROM snap),
         xnum(v) AS (SELECT count(*) FROM xset),
         xmin(v) AS (SELECT txid_snapshot_xmin(v) FROM snap),
         xmax(v) AS (SELECT txid_snapshot_xmax(v) FROM snap)
    SELECT xmin.v AS xmin, xmax.v AS xmax, xnum.v AS xnum FROM xmin, xmax, xnum;
  ttl: 10
  min_version: 90100
  tags: [ cluster ]
  metrics:
    - xmin: { usage: COUNTER ,description: "Earliest txid that is still active" }
    - xmax: { usage: COUNTER ,description: "First as-yet-unassigned txid" }
    - xnum: { usage: GAUGE ,description: "Current active transaction count" }


================================================
FILE: legacy/config/0450-pg_lock.yml
================================================

#==============================================================#
# 0450 pg_lock
#==============================================================#
pg_lock:
  name: pg_lock
  desc: PostgreSQL lock distribution by mode and database
  query: |
    SELECT datname, mode, coalesce(count, 0) AS count
    FROM (SELECT d.oid AS database, d.datname, l.mode FROM pg_database d,
          unnest(ARRAY ['AccessShareLock','RowShareLock','RowExclusiveLock','ShareUpdateExclusiveLock',
              'ShareLock','ShareRowExclusiveLock','ExclusiveLock','AccessExclusiveLock']) l(mode)
          WHERE d.datallowconn AND NOT d.datistemplate) base
    LEFT JOIN (SELECT database, mode, count(*) AS count FROM pg_locks
               WHERE database IS NOT NULL GROUP BY 1, 2) cnt USING (database, mode);
  ttl: 10
  min_version: 90100
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" }
    - mode: { usage: LABEL ,description: "Name of the lock mode held or desired by this process" }
    - count: { usage: GAUGE ,description: "Number of locks of corresponding mode and database" }


================================================
FILE: legacy/config/0460-pg_query.yml
================================================
#==============================================================#
# 0460 pg_query
#==============================================================#
pg_query_94:
  name: pg_query
  desc: PostgreSQL query statement metrics, require pg_stat_statements installed, 9.4 - 12
  query: |-
    SELECT datname, queryid AS query, sum(calls) AS calls, sum(rows) AS rows,
           sum(total_time) AS exec_time, sum(blk_read_time) + sum(blk_write_time) AS io_time,
           sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read,
           sum(shared_blks_dirtied) AS sblk_dirtied, sum(shared_blks_written) AS sblk_written
    FROM pg_stat_statements(false) s JOIN pg_database d ON s.dbid = d.oid
    WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128;
  ttl: 10
  timeout: 2
  min_version: 90400
  max_version: 130000
  tags: [ cluster, "extension:pg_stat_statements" ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of database" }
    - query: { usage: LABEL ,description: "QueryID generated from internal hash code, computed from the statement's parse tree" }
    - calls: { usage: COUNTER ,description: "Number of times the statement was executed" }
    - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" }
    - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" }
    - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds" }
    - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" }
    - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" }
    - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement" }
    - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" }

pg_query_91:
  name: pg_query
  desc: PostgreSQL query statement metrics, require pg_stat_statements installed, 9.1 - 9.3 (no queryid)
  query: |-
    SELECT datname, md5(query) AS query, sum(calls) AS calls, sum(rows) AS rows,
           sum(total_time) AS exec_time, 0::FLOAT AS io_time,
           sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read,
           0::BIGINT AS sblk_dirtied, sum(shared_blks_written) AS sblk_written
    FROM pg_stat_statements s JOIN pg_database d ON s.dbid = d.oid
    WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128;
  ttl: 10
  timeout: 2
  min_version: 90100
  max_version: 90400
  tags: [ cluster, "extension:pg_stat_statements" ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of database" }
    - query: { usage: LABEL ,description: "MD5 hash of query text (no queryid before 9.4)" }
    - calls: { usage: COUNTER ,description: "Number of times the statement was executed" }
    - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" }
    - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" }
    - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds (N/A before 9.4, always 0)" }
    - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" }
    - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" }
    - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement (N/A before 9.4, always 0)" }
    - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" }


================================================
FILE: legacy/config/0610-pg_db.yml
================================================

#==============================================================#
# 0610 pg_db
#==============================================================#
pg_db_92:
  name: pg_db
  desc: PostgreSQL database stats from pg_stat_database (9.2 - 9.6)
  query: |-
    SELECT d.datname, datid,age(datfrozenxid) AS age,
           datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit,
           datfrozenxid::TEXT::BIGINT as frozen_xid,
           numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,
           blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified,
           conflicts,temp_files,temp_bytes,deadlocks,blk_read_time,blk_write_time,
           extract(EPOCH FROM stats_reset) AS reset_time
    FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid;
  ttl: 10
  min_version: 90200
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database" }
    - datid: { usage: GAUGE ,description: "OID of the database" }
    - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" }
    - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" }
    - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." }
    - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." }
    - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" }
    - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" }
    - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" }
    - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" }
    - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" }
    - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" }
    - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" }
    - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" }
    - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" }
    - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" }
    - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" }
    - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" }
    - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" }
    - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" }
    - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" }
    - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database" }
    - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." }
    - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database" }
    - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" }
    - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" }
    - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" }

pg_db_91:
  name: pg_db
  desc: PostgreSQL database stats from pg_stat_database (9.1, fewer columns)
  # temp/deadlock/blk timing columns do not exist on 9.1, so they are zero-filled
  query: |-
    SELECT d.datname, datid,age(datfrozenxid) AS age,
           datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit,
           datfrozenxid::TEXT::BIGINT as frozen_xid,
           numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,
           blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified,
           conflicts, 0::BIGINT AS temp_files, 0::BIGINT AS temp_bytes, 0::BIGINT AS deadlocks, 0::BIGINT AS blk_read_time, 0::BIGINT AS blk_write_time,
           extract(EPOCH FROM stats_reset) AS reset_time
    FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid;
  ttl: 10
  min_version: 90100
  max_version: 90200
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database" }
    - datid: { usage: GAUGE ,description: "OID of the database" }
    - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" }
    - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" }
    - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." }
    - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." }
    - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" }
    - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" }
    - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" }
    - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" }
    - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" }
    - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" }
    - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" }
    - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" }
    - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" }
    - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" }
    - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" }
    - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" }
    - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" }
    - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" }
    - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" }
    - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database (N/A on 9.1, always 0)" }
    - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database (N/A on 9.1, always 0)" }
    - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database (N/A on 9.1, always 0)" }
    - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds (N/A on 9.1, always 0)" }
    - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds (N/A on 9.1, always 0)" }
    - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" }


================================================
FILE: legacy/config/0620-pg_db_confl.yml
================================================

#==============================================================#
# 0620 pg_db_confl
#==============================================================#
# https://pgpedia.info/p/pg_stat_database_conflicts.html
pg_db_confl:
  name: pg_db_confl
  desc: PostgreSQL database conflicts metrics for pg 9.1 - 9.6
  query: SELECT datid,datname,confl_tablespace,confl_lock,confl_snapshot,confl_bufferpin,confl_deadlock FROM pg_stat_database_conflicts;
  ttl: 10
  min_version: 90100
  tags: [ cluster, replica ]
  metrics:
    - datid: { usage: DISCARD }
    - datname: { usage: LABEL ,description: "Name of this database" }
    - confl_tablespace: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to dropped tablespaces" }
    - confl_lock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to lock timeouts" }
    - confl_snapshot: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to old snapshots" }
    - confl_bufferpin: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to pinned buffers" }
    - confl_deadlock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to deadlocks" }


================================================
FILE: legacy/config/0700-pg_table.yml
================================================

#==============================================================#
# 0700 pg_table #==============================================================# pg_table_94: name: pg_table desc: PostgreSQL table metrics 9.4-9.6 query: |- SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind, c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols, psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read, psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_live_tup,psut.n_dead_tup, psut.n_mod_since_analyze,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze, psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count, psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit FROM pg_class c JOIN pg_namespace nsp ON c.relnamespace = nsp.oid LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar' AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor') AND c.relkind = ANY (ARRAY ['r','m','t','p']) ORDER BY c.relpages DESC LIMIT 256; ttl: 10 timeout: 2 min_version: 90400 metrics: - datname: { usage: LABEL ,description: "Database name of this table" } - relname: { usage: LABEL ,description: "Relation name of this table" } - relid: { usage: GAUGE ,description: "Relation oid of this table" } - kind: { usage: GAUGE ,description: "Relation kind r/table/114,m/mview/109,t/toast/116" } - pages: { usage: GAUGE ,description: "Size of the on-disk representation of this 
table in pages" } - tuples: { usage: GAUGE ,description: "Estimated number of rows in this table" } - frozenxid: { usage: GAUGE ,description: "All txid before this have been frozen on this table" } - age: { usage: GAUGE ,description: "Age of this table in vacuum cycles" } - ncols: { usage: GAUGE ,description: "Number of columns in the table" } - seq_scan: { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" } - seq_tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" } - idx_scan: { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" } - idx_tup_fetch: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" } - tbl_scan: { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" } - tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" } - n_tup_ins: { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" } - n_tup_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" } - n_tup_del: { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" } - n_tup_mod: { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" } - n_tup_hot_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" } - n_live_tup: { usage: GAUGE ,description: "Estimated number of live rows" } - n_dead_tup: { usage: GAUGE ,description: "Estimated number of dead rows" } - n_mod_since_analyze: { usage: GAUGE ,description: "Estimated number of rows modified since this table was last analyzed" } - last_vacuum: { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" } - last_autovacuum: { usage: DISCARD ,description: "Last time at which this table was 
vacuumed by the autovacuum daemon" } - last_analyze: { usage: DISCARD ,description: "Last time at which this table was manually analyzed" } - last_autoanalyze: { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" } - vacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" } - autovacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" } - analyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" } - autoanalyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" } - heap_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" } - heap_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" } - idx_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" } - idx_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" } - toast_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" } - toast_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" } - tidx_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" } - tidx_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" } pg_table_91: name: pg_table desc: PostgreSQL table metrics 9.1-9.3 (no n_mod_since_analyze) query: |- SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' 
|| c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind, c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols, psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read, psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_live_tup,psut.n_dead_tup, NULL::BIGINT AS n_mod_since_analyze, psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze, psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count, psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit FROM pg_class c JOIN pg_namespace nsp ON c.relnamespace = nsp.oid LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar' AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor') AND c.relkind = ANY (ARRAY ['r','m','t','p']) ORDER BY c.relpages DESC LIMIT 256; ttl: 10 timeout: 2 min_version: 90100 max_version: 90400 metrics: - datname: { usage: LABEL ,description: "Database name of this table" } - relname: { usage: LABEL ,description: "Relation name of this table" } - relid: { usage: GAUGE ,description: "Relation oid of this table" } - kind: { usage: GAUGE ,description: "Relation kind r/table/114,t/toast/116" } - pages: { usage: GAUGE ,description: "Size of the on-disk representation of this table in pages" } - tuples: { usage: GAUGE ,description: "Estimated number of rows in this table" } - frozenxid: { usage: GAUGE ,description: "All txid before this have been frozen on this table" 
} - age: { usage: GAUGE ,description: "Age of this table in vacuum cycles" } - ncols: { usage: GAUGE ,description: "Number of columns in the table" } - seq_scan: { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" } - seq_tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" } - idx_scan: { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" } - idx_tup_fetch: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" } - tbl_scan: { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" } - tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" } - n_tup_ins: { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" } - n_tup_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" } - n_tup_del: { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" } - n_tup_mod: { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" } - n_tup_hot_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" } - n_live_tup: { usage: GAUGE ,description: "Estimated number of live rows" } - n_dead_tup: { usage: GAUGE ,description: "Estimated number of dead rows" } - n_mod_since_analyze: { usage: GAUGE ,description: "Estimated number of rows modified since this table was last analyzed (N/A on 9.1-9.3, NULL)" } - last_vacuum: { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" } - last_autovacuum: { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" } - last_analyze: { usage: DISCARD ,description: "Last time at which this table was manually analyzed" } - last_autoanalyze: { usage: DISCARD 
,description: "Last time at which this table was analyzed by the autovacuum daemon" } - vacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" } - autovacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" } - analyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" } - autoanalyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" } - heap_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" } - heap_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" } - idx_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" } - idx_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" } - toast_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" } - toast_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" } - tidx_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" } - tidx_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" } ================================================ FILE: legacy/config/0710-pg_index.yml ================================================ #==============================================================# # 0710 pg_index #==============================================================# pg_index: name: pg_index desc: PostgreSQL index metrics (legacy 9.1-9.6) query: |- SELECT CURRENT_CATALOG AS datname, 
psui.schemaname || '.' || psui.indexrelname AS idxname, psui.schemaname || '.' || psui.relname AS relname, psui.indexrelid AS relid, c.relpages, c.reltuples, psui.idx_scan, psui.idx_tup_read, psui.idx_tup_fetch, psio.idx_blks_read, psio.idx_blks_hit FROM pg_stat_user_indexes psui JOIN pg_statio_user_indexes psio ON psio.indexrelid = psui.indexrelid JOIN pg_class c ON c.oid = psui.indexrelid WHERE psui.schemaname !~ '^pg_' AND psui.schemaname !~ '^_' AND psui.schemaname !~ '^timescaledb' AND psui.schemaname !~ '^citus' AND psui.schemaname !~ '^columnar' AND psui.schemaname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor') ORDER BY psui.idx_tup_read DESC LIMIT 512; ttl: 10 timeout: 1 min_version: 90100 metrics: - datname: { usage: LABEL ,description: "Database name of this index" } - idxname: { usage: LABEL ,description: "Name of this index (full-qualified schema name)" } - relname: { usage: LABEL ,description: "Name of the table for this index (full-qualified schema name)" } - relid: { usage: LABEL ,description: "Relation oid of this index" } - relpages: { usage: GAUGE ,description: "Size of the on-disk representation of this index in pages" } - reltuples: { usage: GAUGE ,description: "Estimate relation tuples" } - idx_scan: { usage: COUNTER ,description: "Number of index scans initiated on this index" } - idx_tup_read: { usage: COUNTER ,description: "Number of index entries returned by scans on this index" } - idx_tup_fetch: { usage: COUNTER ,description: "Number of live table rows fetched by simple index scans using this index" } - idx_blks_read: { usage: COUNTER ,description: "Number of disk blocks read from this index" } - idx_blks_hit: { usage: COUNTER ,description: "Number of buffer hits in this index" } ================================================ FILE: legacy/config/0720-pg_func.yml ================================================ #==============================================================# # 0720 pg_func 
#==============================================================# pg_func: desc: PostgreSQL function metrics query: SELECT CURRENT_CATALOG AS datname, schemaname || '.' || funcname AS funcname, sum(calls) AS calls, sum(total_time) AS total_time, sum(self_time) AS self_time FROM pg_stat_user_functions GROUP BY 2 ORDER BY 4 DESC LIMIT 128; ttl: 10 min_version: 90100 metrics: - datname: { usage: LABEL ,description: "Name of belonged database" } - funcname: { usage: LABEL ,description: "Name of this function, may have multiple override" } - calls: { usage: COUNTER ,description: "Number of times this function has been called" } - total_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent in this function and all other functions called by it, in seconds" } - self_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent in this function itself, not including other functions called by it, in seconds" } ================================================ FILE: legacy/config/0740-pg_relkind.yml ================================================ #==============================================================# # 0740 pg_relkind #==============================================================# pg_relkind: name: pg_relkind desc: Postgres relation count by kind query: | SELECT CURRENT_CATALOG AS datname, relkind, count(*) AS count FROM pg_class GROUP BY relkind; ttl: 60 timeout: 1 min_version: 90100 metrics: - datname: { usage: LABEL ,description: "Database name" } - relkind: { usage: LABEL ,description: "Relation kind (r,i,S,t,v,c,...)" } - count: { usage: GAUGE ,description: "Number of relations" } ================================================ FILE: legacy/config/0810-pg_table_size.yml ================================================ #==============================================================# # 0810 pg_table_size #==============================================================# pg_table_size: desc: PostgreSQL table size metrics, quite slow query: |- 
SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || rel.relname AS relname, pg_total_relation_size(rel.oid) AS bytes, pg_relation_size(rel.oid) AS relsize, pg_indexes_size(rel.oid) AS indexsize, pg_total_relation_size(reltoastrelid) AS toastsize FROM pg_namespace nsp JOIN pg_class rel ON nsp.oid = rel.relnamespace WHERE nspname <> ALL(ARRAY['pg_catalog', 'information_schema']) AND rel.relkind = 'r' ORDER BY 3 DESC NULLS LAST LIMIT 256; ttl: 300 timeout: 2 min_version: 90100 metrics: - datname: { usage: LABEL ,description: "Database name of this table" } - relname: { usage: LABEL ,description: "Schema qualified table name" } - bytes: { usage: GAUGE ,default: 0 ,description: "Total bytes of this table (including toast, index, toast index)" } - relsize: { usage: GAUGE ,default: 0 ,description: "Bytes of this table itself (main, vm, fsm)" } - indexsize: { usage: GAUGE ,default: 0 ,description: "Bytes of all related indexes of this table" } - toastsize: { usage: GAUGE ,default: 0 ,description: "Bytes of toast tables of this table" } ================================================ FILE: legacy/config/0820-pg_table_bloat.yml ================================================ #==============================================================# # 0820 pg_table_bloat #==============================================================# # pg_table_bloat require auxiliary view to work. Disable it or create auxiliary view before use: pg_table_bloat: name: pg_table_bloat desc: PostgreSQL table bloat metrics, require auxiliary view pg_table_bloat to work query: SELECT datname, nspname || '.' 
|| relname AS relname, size, ratio FROM pg_table_bloat ORDER BY size DESC LIMIT 64; ttl: 300 timeout: 2 min_version: 90400 skip: true metrics: - datname: { usage: LABEL ,description: "Database name of this table" } - relname: { usage: LABEL ,description: "Schema qualified name of this table" } - size: { usage: GAUGE ,description: "Total bytes of this table" } - ratio: { usage: GAUGE ,description: "Estimated bloat ratio of this table from 0 to 1" } ================================================ FILE: legacy/config/0830-pg_index_bloat.yml ================================================ #==============================================================# # 0830 pg_index_bloat #==============================================================# # pg_index_bloat require auxiliary view to work. Disable it or create auxiliary view before use: pg_index_bloat: name: pg_index_bloat desc: PostgreSQL index bloat metrics, require auxiliary view pg_index_bloat to work query: SELECT datname, nspname || '.' || idxname AS idxname, size, ratio FROM pg_index_bloat ORDER BY size DESC LIMIT 64; ttl: 300 timeout: 2 min_version: 90400 skip: true metrics: - datname: { usage: LABEL ,description: "Database name of this index" } - idxname: { usage: LABEL ,description: "Schema qualified name of this index" } - size: { usage: GAUGE ,description: "Total bytes of this index" } - ratio: { usage: GAUGE ,description: "Estimated bloat ratio of this index from 0 to 1" } ================================================ FILE: legacy/config/0910-pgbouncer_list.yml ================================================ #==============================================================# # 0910 pgbouncer_list #==============================================================# # http://www.pgbouncer.org/usage.html#show-lists pgbouncer_list: name: pgbouncer_list desc: Pgbouncer entry list query: SHOW LISTS; ttl: 10 min_version: 10800 fatal: true tags: [ pgbouncer ] metrics: - list: { usage: LABEL ,description: "Pgbouncer 
internal list name" } - items: { usage: GAUGE ,description: "Number of corresponding pgbouncer object" } ================================================ FILE: legacy/config/0920-pgbouncer_database.yml ================================================ #==============================================================# # 0920 pgbouncer_database #==============================================================# # http://www.pgbouncer.org/usage.html#show-databases pgbouncer_database_124: name: pgbouncer_database desc: Pgbouncer database stats (since 1.24) query: SHOW DATABASES; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool_size: { usage: GAUGE ,rename: reserve_pool ,description: "Maximum number of additional connections for this database" } - server_lifetime: { usage: GAUGE ,description: "The maximum lifetime of a server connection for this database" } - pool_mode: { usage: DISCARD } - load_balance_hosts: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - max_client_connections: { usage: GAUGE ,description: "Maximum number of allowed client connections for this pgbouncer instance" } - current_client_connections: { usage: GAUGE ,description: "Current number of client connections for this database" } - paused: { usage:
GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_123: name: pgbouncer_database desc: Pgbouncer database stats 1.23 query: SHOW DATABASES; ttl: 10 min_version: 12300 max_version: 12400 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - server_lifetime: { usage: GAUGE ,description: "The maximum lifetime of a server connection for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_116: name: pgbouncer_database desc: Pgbouncer database stats (1.16-1.22) query: SHOW DATABASES; ttl: 10 min_version: 11600 max_version: 12300 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer connects to" } - port: { usage: LABEL ,description: "Port that
pgbouncer connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_108: name: pgbouncer_database desc: Pgbouncer database stats (1.08-1.15) query: SHOW DATABASES; ttl: 10 min_version: 10800 max_version: 11600 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused,
else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } ================================================ FILE: legacy/config/0930-pgbouncer_stat.yml ================================================ #==============================================================# # 0930 pgbouncer_stat #==============================================================# # http://www.pgbouncer.org/usage.html#show-stats pgbouncer_stat_124: name: pgbouncer_stat desc: Pgbouncer stats per database (since 1.24) query: SHOW STATS; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_server_assignment_count: { usage: COUNTER ,description: "Total times a server was assigned to a client" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - total_client_parse_count: { usage: COUNTER ,description: "Total number of prepared statements created by clients" } - total_server_parse_count: { usage: COUNTER ,description: "Total number of prepared statements created on a server." 
} - total_bind_count: { usage: COUNTER ,description: "Total number of prepared statements readied for execution by clients and forwarded to postgres" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_server_assignment_count: { usage: GAUGE ,description: "Average number of times a server was assigned to a client per second in the last stat period." } - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } - avg_client_parse_count: { usage: GAUGE ,description: "Average number of prepared statements created by clients" } - avg_server_parse_count: { usage: GAUGE ,description: "Average number of prepared statements created on a server."
} - avg_bind_count: { usage: GAUGE ,description: "Average number of prepared statements readied for execution by clients and forwarded to postgres" } pgbouncer_stat_123: name: pgbouncer_stat desc: Pgbouncer stats per database (1.23) query: SHOW STATS; ttl: 10 min_version: 12300 max_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_server_assignment_count: { usage: COUNTER ,description: "Total times a server was assigned to a client" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_server_assignment_count: { usage: GAUGE ,description: "Average number of times a server as assigned to a client per second in the last stat period." 
} - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } pgbouncer_stat_108: name: pgbouncer_stat desc: Pgbouncer stats per database (1.08 - 1.22) query: SHOW STATS; ttl: 10 min_version: 10800 max_version: 12300 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: 
GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } ================================================ FILE: legacy/config/0940-pgbouncer_pool.yml ================================================ #==============================================================# # 0940 pgbouncer_pool #==============================================================# # http://www.pgbouncer.org/usage.html#show-pools pgbouncer_pool_124: name: pgbouncer_pool desc: Pgbouncer pool stats (1.24+) query: SHOW POOLS; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_active_cancel_req: { usage: GAUGE, rename: active_cancel_clients, description: "Client connections that have forwarded query cancellations to the server and are waiting for the server response." } - cl_waiting_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." 
} - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_active_cancel: { usage: GAUGE, rename: active_cancel_servers, description: "Server connections that are currently forwarding a cancel request" } - sv_being_canceled: { usage: GAUGE, rename: cancel_servers, description: "cancel requests have completed that were sent to cancel a query on this server" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } - load_balance_hosts: { usage: LABEL, description: "The load_balance_hosts in use" } pgbouncer_pool_118: name: pgbouncer_pool desc: Pgbouncer pool stats (1.18-1.23) query: SHOW POOLS; ttl: 10 min_version: 11800 max_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_active_cancel_req: { usage: GAUGE, rename: active_cancel_clients, description: "Client connections that have forwarded query cancellations to the server and are waiting for the server response." } - cl_waiting_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." 
} - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_active_cancel: { usage: GAUGE, rename: active_cancel_servers, description: "Server connections that are currently forwarding a cancel request" } - sv_being_canceled: { usage: GAUGE, rename: cancel_servers, description: "cancel requests have completed that were sent to cancel a query on this server" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } pgbouncer_pool_116: name: pgbouncer_pool desc: Pgbouncer pool stats (1.16-1.17) query: SHOW POOLS; ttl: 10 min_version: 11600 max_version: 11800 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." } - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } pgbouncer_pool_108: name: pgbouncer_pool desc: Pgbouncer pool stats (1.08-1.15) query: SHOW POOLS; ttl: 10 min_version: 10800 max_version: 11600 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } ================================================ FILE: legacy/config/1800-pg_tsdb_hypertable.yml ================================================ #==============================================================# # 1800 pg_tsdb_hypertable #==============================================================# # this collector reqires timescaledb extension to be installed pg_tsdb_hypertable: name: pg_tsdb_hypertable desc: TimescaleDB hypertable overview query: |- SELECT current_database() AS datname, format('%I.%I', hypertable_schema, hypertable_name) AS relname, num_dimensions AS dimensions, num_chunks AS chunks, compression_enabled::BOOLEAN::int AS compressed, hypertable_size(format('"%I"."%I"', hypertable_schema, hypertable_name)::RegClass) AS bytes FROM timescaledb_information.hypertables; ttl: 60 timeout: 2 min_version: 90600 skip: true tags: [ "extension:timescaledb", "schema:timescaledb_information" ] metrics: - datname: { usage: LABEL ,description: "database name" } - relname: { usage: LABEL ,description: "Hypertable relation name" } - dimensions: { usage: GAUGE ,description: "Number of partitioning dimensions" } - chunks: { usage: GAUGE ,description: "Total chunks of this hypertable" } - compressed: { usage: GAUGE ,description: "1 if compression enabled" } - bytes: { usage: GAUGE ,description: "Total size of hypertable in bytes" } ================================================ FILE: legacy/config/1900-pg_citus.yml ================================================ #==============================================================# # 1900 pg_citus_node #==============================================================# # https://docs.citusdata.com/en/latest/develop/api_metadata.html#worker-node-table pg_citus_node: name: pg_citus_node desc: Citus worker coordinator node inventory query: |- SELECT CONCAT(nodename, ':', nodeport) AS node, current_database() AS datname, nodeid AS id, groupid AS group, 
hasmetadata::BOOLEAN::INT AS has_meta, isactive::BOOLEAN::INT AS is_active, metadatasynced::BOOLEAN::INT AS meta_synced, shouldhaveshards::BOOLEAN::INT AS have_shards FROM pg_dist_node; ttl: 60 min_version: 90600 tags: [ "extension:citus" ] metrics: - node: { usage: LABEL ,description: "nodename:port of the PostgreSQL instance" } - datname: { usage: LABEL ,description: "database name" } - id: { usage: GAUGE ,description: "auto‑generated node identifier" } - group: { usage: GAUGE ,description: "replication group id (primary + secondaries)" } - has_meta: { usage: GAUGE ,description: "1 = internal use flag set" } - is_active: { usage: GAUGE ,description: "1 = node currently accepts shards" } - meta_synced: { usage: GAUGE ,description: "1 = metadata fully synced to node" } - have_shards: { usage: GAUGE ,description: "1 = rebalancer may place shards here" } ================================================ FILE: legacy/config/2000-pg_heartbeat.yml ================================================ #==============================================================# # 2000 heartbeat #==============================================================# # this is a example of application monitoring and predicate queries pg_heartbeat: name: pg_heartbeat desc: monitoring heartbeat in monitor.heartbeat table predicate_queries: - name: if heartbeat table exists predicate_query: | SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'monitor' AND table_name = 'heartbeat'); query: |- SELECT id AS cluster_name, extract(EPOCH FROM ts) AS ts, lsn, txid FROM monitor.heartbeat; ttl: 10 min_version: 90100 tags: [ "dbname:postgres", "schema:monitor" ] skip: true metrics: - cluster_name: { usage: LABEL ,description: "cluster_name param of this database cluster" } - ts: { usage: GAUGE ,description: "unix timestamp of the heartbeat" } - lsn: { usage: COUNTER ,description: "lsn of the heartbeat" } - txid: { usage: GAUGE ,description: "txid of the heartbeat" } 
================================================ FILE: legacy/pg_exporter.yml ================================================ #==============================================================# # Desc : pg_exporter metrics collector definition (Legacy) # Ver : PostgreSQL 9.1 ~ 9.6 and pgbouncer 1.9~1.25+ # Ctime : 2019-12-09 # Mtime : 2026-02-07 # Homepage : https://pigsty.io # Author : Ruohang Feng (rh@vonng.com) # License : Apache-2.0 @ https://github.com/pgsty/pg_exporter # Copyright : 2018-2026 Ruohang Feng / Vonng (rh@vonng.com) #==============================================================# #==============================================================# # 1. Config File #==============================================================# # The configuration file for pg_exporter is a YAML file. # Default configurations are retrieved via following precedence: # 1. command line args: --config= # 2. environment variables: PG_EXPORTER_CONFIG= # 3. pg_exporter.yml (Current directory) # 4. /etc/pg_exporter.yml (config file) # 5. /etc/pg_exporter (config dir) #==============================================================# # 2. Config Format #==============================================================# # pg_exporter config could be a single YAML file, or a directory containing a series of separated YAML files. # Each YAML config file consists of one or more metrics Collector definition, which are top-level objects. # If a directory is provided, all YAML in that directory will be merged in alphabetic order. #==============================================================# # 3. Version Compatibility #==============================================================# # Each collector has two optional version compatibility parameters: `min_version` and `max_version`. # These two parameters specify the version compatibility of the collector. If target postgres/pgbouncer's # version is less than `min_version`, or higher than `max_version`, the collector will not be installed. 
# # These two parameters are using PostgreSQL server version number format, which is a 6-digit integer # format as :. # # For example: # - 90100 stands for 9.1 # - 90600 stands for 9.6 # - 100000 stands for 10.0 # # Version compatibility range is left-inclusive right-exclusive: [min, max) #==============================================================# # 0110 pg #==============================================================# pg_primary_only: name: pg desc: PostgreSQL basic information (on primary) query: |- SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp, extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime, extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time, (('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn, (('x' || lpad(split_part(pg_current_xlog_insert_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_current_xlog_insert_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS insert_lsn, (('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS write_lsn, (('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_current_xlog_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn, NULL::BIGINT AS receive_lsn, NULL::BIGINT AS replay_lsn, extract(EPOCH FROM pg_conf_load_time()) AS reload_time, extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time, NULL::FLOAT AS last_replay_time, 0::FLOAT AS lag, pg_is_in_recovery() AS is_in_recovery, FALSE AS is_wal_replay_paused; tags: [ cluster, primary ] ttl: 1 min_version: 90100 max_version: 100000 fatal: true skip: false metrics: - timestamp: { 
usage: GAUGE ,description: "current database timestamp in unix epoch" } - uptime: { usage: GAUGE ,description: "seconds since postmaster start" } - boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" } - lsn: { usage: COUNTER ,description: "log sequence number, current write location" } - insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" } - write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" } - flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" } - receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" } - replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" } - reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" } - conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" } - last_replay_time: { usage: GAUGE ,description: "time when last transaction been replayed" } - lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" } - is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" } - is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" } pg_replica_only: name: pg desc: PostgreSQL basic information (on replica, 9.1+) query: |- SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp, extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime, extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time, (('x' || lpad(split_part(pg_last_xlog_replay_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_last_xlog_replay_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn, NULL::BIGINT AS insert_lsn, NULL::BIGINT AS write_lsn, NULL::BIGINT AS flush_lsn, (('x' || lpad(split_part(pg_last_xlog_receive_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || 
lpad(split_part(pg_last_xlog_receive_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS receive_lsn, (('x' || lpad(split_part(pg_last_xlog_replay_location()::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(pg_last_xlog_replay_location()::text, '/', 2), 8, '0'))::bit(32)::bigint) AS replay_lsn, extract(EPOCH FROM pg_conf_load_time()) AS reload_time, extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time, extract(EPOCH FROM pg_last_xact_replay_timestamp()) AS last_replay_time, CASE WHEN pg_last_xlog_receive_location() = pg_last_xlog_replay_location() THEN 0 ELSE EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS lag, pg_is_in_recovery() AS is_in_recovery, pg_is_xlog_replay_paused() AS is_wal_replay_paused; tags: [ cluster, replica ] ttl: 1 min_version: 90100 max_version: 100000 fatal: true skip: false metrics: - timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" } - uptime: { usage: GAUGE ,description: "seconds since postmaster start" } - boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" } - lsn: { usage: COUNTER ,description: "log sequence number, current write location" } - insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" } - write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" } - flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" } - receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" } - replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" } - reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" } - conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" } - last_replay_time: { usage: GAUGE ,description: "time when last transaction been replayed" } - lag: { usage: GAUGE ,description: 
"replica only, replication lag in seconds" } - is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" } - is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" } #==============================================================# # 0120 pg_meta #==============================================================# pg_meta_96: name: pg_meta desc: PostgreSQL meta info for pg 9.6 (with pg_control_system) query: | SELECT (SELECT system_identifier FROM pg_control_system()) AS cluster_id, coalesce((SELECT setting FROM pg_settings WHERE name = 'cluster_name'), 'N/A') AS cluster_name, (SELECT setting FROM pg_settings WHERE name = 'port') AS listen_port, (SELECT setting FROM pg_settings WHERE name = 'data_directory') AS data_dir, (SELECT setting FROM pg_settings WHERE name = 'config_file') AS conf_path, (SELECT setting FROM pg_settings WHERE name = 'hba_file') AS hba_path, (SELECT setting FROM pg_settings WHERE name = 'wal_level') AS wal_level, (SELECT setting FROM pg_settings WHERE name = 'server_encoding') AS encoding, (SELECT setting FROM pg_settings WHERE name = 'server_version') AS version, (SELECT setting FROM pg_settings WHERE name = 'server_version_num') AS ver_num, version() AS ver_str, (SELECT setting FROM pg_settings WHERE name = 'shared_preload_libraries') AS extensions, coalesce((SELECT setting FROM pg_settings WHERE name = 'primary_conninfo'), 'N/A') AS primary_conninfo, 1 AS info; ttl: 10 min_version: 90600 tags: [ cluster ] metrics: - cluster_id: { usage: LABEL ,description: "cluster system identifier" } - cluster_name: { usage: LABEL ,description: "cluster name" } - listen_port: { usage: LABEL ,description: "listen port" } - data_dir: { usage: LABEL ,description: "path to data directory" } - conf_path: { usage: LABEL ,description: "path to postgresql.conf" } - hba_path: { usage: LABEL ,description: "path to pg_hba.conf" } - wal_level: { usage: LABEL ,description: "wal level" } - encoding: { usage: LABEL ,description: "server 
encoding" } - version: { usage: LABEL ,description: "server version in human-readable format" } - ver_num: { usage: LABEL ,description: "server version number in machine-readable format" } - ver_str: { usage: LABEL ,description: "complete version string" } - extensions: { usage: LABEL ,description: "server installed preload libraries" } - primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" } - info: { usage: GAUGE ,description: "constant 1" } pg_meta_91: name: pg_meta desc: PostgreSQL meta info for pg 9.1 - 9.5 query: | SELECT 'N/A' AS cluster_id, coalesce((SELECT setting FROM pg_settings WHERE name = 'cluster_name'), 'N/A') AS cluster_name, (SELECT setting FROM pg_settings WHERE name = 'port') AS listen_port, (SELECT setting FROM pg_settings WHERE name = 'data_directory') AS data_dir, (SELECT setting FROM pg_settings WHERE name = 'config_file') AS conf_path, (SELECT setting FROM pg_settings WHERE name = 'hba_file') AS hba_path, (SELECT setting FROM pg_settings WHERE name = 'wal_level') AS wal_level, (SELECT setting FROM pg_settings WHERE name = 'server_encoding') AS encoding, (SELECT setting FROM pg_settings WHERE name = 'server_version') AS version, (SELECT setting FROM pg_settings WHERE name = 'server_version_num') AS ver_num, version() AS ver_str, (SELECT setting FROM pg_settings WHERE name = 'shared_preload_libraries') AS extensions, coalesce((SELECT setting FROM pg_settings WHERE name = 'primary_conninfo'), 'N/A') AS primary_conninfo, 1 AS info; ttl: 10 min_version: 90100 max_version: 90600 tags: [ cluster ] metrics: - cluster_id: { usage: LABEL ,description: "cluster system identifier" } - cluster_name: { usage: LABEL ,description: "cluster name" } - listen_port: { usage: LABEL ,description: "listen port" } - data_dir: { usage: LABEL ,description: "path to data directory" } - conf_path: { usage: LABEL ,description: "path to postgresql.conf" } - hba_path: { usage: LABEL ,description: "path to pg_hba.conf" 
} - wal_level: { usage: LABEL ,description: "wal level" } - encoding: { usage: LABEL ,description: "server encoding" } - version: { usage: LABEL ,description: "server version in human-readable format" } - ver_num: { usage: LABEL ,description: "server version number in machine-readable format" } - ver_str: { usage: LABEL ,description: "complete version string" } - extensions: { usage: LABEL ,description: "server installed preload libraries" } - primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" } - info: { usage: GAUGE ,description: "constant 1" } #==============================================================# # 0130 pg_setting #==============================================================# # Key PostgreSQL configuration parameters for PostgreSQL 9.1 - 9.6 # Use scalar subquery on pg_settings for "missing_ok" semantics (return NULL if not exist) pg_setting: name: pg_setting desc: PostgreSQL shared configuration parameters (legacy 9.1-9.6) query: | SELECT (SELECT setting::int FROM pg_settings WHERE name = 'max_connections') AS max_connections, (SELECT setting::int FROM pg_settings WHERE name = 'max_prepared_transactions') AS max_prepared_transactions, (SELECT setting::int FROM pg_settings WHERE name = 'max_locks_per_transaction') AS max_locks_per_transaction, (SELECT setting::int FROM pg_settings WHERE name = 'max_worker_processes') AS max_worker_processes, (SELECT setting::int FROM pg_settings WHERE name = 'max_parallel_workers') AS max_parallel_workers, (SELECT setting::int FROM pg_settings WHERE name = 'max_parallel_workers_per_gather') AS max_parallel_workers_per_gather, (SELECT setting::int FROM pg_settings WHERE name = 'max_parallel_maintenance_workers') AS max_parallel_maintenance_workers, (SELECT setting::int FROM pg_settings WHERE name = 'max_replication_slots') AS max_replication_slots, (SELECT setting::int FROM pg_settings WHERE name = 'max_wal_senders') AS max_wal_senders, (SELECT setting::int FROM 
pg_settings WHERE name = 'block_size') AS block_size, (SELECT setting::int FROM pg_settings WHERE name = 'wal_block_size') AS wal_block_size, (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'segment_size') AS segment_size, (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'wal_segment_size') AS wal_segment_size, (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'data_checksums') AS data_checksums, (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'wal_log_hints') AS wal_log_hints, (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'fsync') AS fsync, (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'full_page_writes') AS full_page_writes, (SELECT CASE setting WHEN 'minimal' THEN 1 WHEN 'archive' THEN 2 WHEN 'hot_standby' THEN 3 ELSE 0 END FROM pg_settings WHERE name = 'wal_level') AS wal_level, (SELECT setting::int FROM pg_settings WHERE name = 'checkpoint_segments') AS checkpoint_segments, (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'min_wal_size') AS min_wal_size, (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'max_wal_size') AS max_wal_size, (SELECT setting::int FROM pg_settings WHERE name = 'wal_keep_segments') AS wal_keep_segments, (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'shared_buffers') AS shared_buffers, (SELECT setting::bigint * CASE unit WHEN '8kB' 
THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'work_mem') AS work_mem, (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'maintenance_work_mem') AS maintenance_work_mem, (SELECT setting::bigint * CASE unit WHEN '8kB' THEN 8192 WHEN 'kB' THEN 1024 WHEN 'MB' THEN 1048576 WHEN 'GB' THEN 1073741824 ELSE 1 END FROM pg_settings WHERE name = 'effective_cache_size') AS effective_cache_size, (SELECT CASE setting WHEN 'off' THEN 0 WHEN 'on' THEN 1 WHEN 'always' THEN 2 ELSE -1 END FROM pg_settings WHERE name = 'archive_mode') AS archive_mode, (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'autovacuum') AS autovacuum, (SELECT setting::int FROM pg_settings WHERE name = 'autovacuum_max_workers') AS autovacuum_max_workers, (SELECT setting::int FROM pg_settings WHERE name = 'checkpoint_timeout') AS checkpoint_timeout, (SELECT setting::float FROM pg_settings WHERE name = 'checkpoint_completion_target') AS checkpoint_completion_target, (SELECT CASE setting WHEN 'on' THEN 1 ELSE 0 END FROM pg_settings WHERE name = 'hot_standby') AS hot_standby, (SELECT CASE setting WHEN 'off' THEN 0 WHEN 'local' THEN 1 WHEN 'remote_write' THEN 2 WHEN 'on' THEN 3 WHEN 'remote_apply' THEN 4 ELSE -1 END FROM pg_settings WHERE name = 'synchronous_commit') AS synchronous_commit; ttl: 10 min_version: 90100 tags: [ cluster ] metrics: - max_connections: { usage: GAUGE ,description: "maximum number of concurrent connections to the database server" } - max_prepared_transactions: { usage: GAUGE ,description: "maximum number of transactions that can be in the prepared state simultaneously" } - max_locks_per_transaction: { usage: GAUGE ,description: "maximum number of locks per transaction" } - max_worker_processes: { usage: GAUGE ,description: "maximum number of background processes (9.4+)" } 
- max_parallel_workers: { usage: GAUGE ,description: "maximum number of parallel workers that can be active at one time (9.6+)" } - max_parallel_workers_per_gather: { usage: GAUGE ,description: "maximum number of parallel workers per Gather node (9.6+)" } - max_parallel_maintenance_workers: { usage: GAUGE ,description: "maximum number of parallel maintenance workers (NULL on 9.x)" } - max_replication_slots: { usage: GAUGE ,description: "maximum number of replication slots (9.4+)" } - max_wal_senders: { usage: GAUGE ,description: "maximum number of concurrent WAL sender connections" } - block_size: { usage: GAUGE ,description: "database block size in bytes (default 8192)" } - wal_block_size: { usage: GAUGE ,description: "WAL block size in bytes" } - segment_size: { usage: GAUGE ,description: "database file segment size in bytes" } - wal_segment_size: { usage: GAUGE ,description: "WAL segment size in bytes" } - data_checksums: { usage: GAUGE ,description: "data checksums enabled, 1=on 0=off (9.3+)" } - wal_log_hints: { usage: GAUGE ,description: "WAL log hints enabled, 1=on 0=off (9.4+)" } - fsync: { usage: GAUGE ,description: "fsync enabled (CRITICAL for data safety), 1=on 0=off" } - full_page_writes: { usage: GAUGE ,description: "full page writes enabled, 1=on 0=off" } - wal_level: { usage: GAUGE ,description: "WAL level, 1=minimal 2=archive 3=hot_standby" } - checkpoint_segments: { usage: GAUGE ,description: "number of checkpoint segments (pre-9.5)" } - min_wal_size: { usage: GAUGE ,description: "minimum WAL size in bytes (9.5+)" } - max_wal_size: { usage: GAUGE ,description: "maximum WAL size in bytes (9.5+)" } - wal_keep_segments: { usage: GAUGE ,description: "WAL segments kept for standby replication (pg_basebackup/streaming)" } - shared_buffers: { usage: GAUGE ,description: "shared buffer size in bytes" } - work_mem: { usage: GAUGE ,description: "work memory size in bytes" } - maintenance_work_mem: { usage: GAUGE ,description: "maintenance work memory size in 
bytes" } - effective_cache_size: { usage: GAUGE ,description: "planner's assumption about effective OS cache size in bytes" } - archive_mode: { usage: GAUGE ,description: "archive mode, 0=off 1=on 2=always" } - autovacuum: { usage: GAUGE ,description: "autovacuum enabled, 1=on 0=off" } - autovacuum_max_workers: { usage: GAUGE ,description: "maximum number of autovacuum worker processes" } - checkpoint_timeout: { usage: GAUGE ,description: "checkpoint timeout in seconds" } - checkpoint_completion_target: { usage: GAUGE ,description: "checkpoint completion target (0.0-1.0)" } - hot_standby: { usage: GAUGE ,description: "hot standby mode enabled, 1=on 0=off" } - synchronous_commit: { usage: GAUGE ,description: "synchronous commit level, 0=off 1=local 2=remote_write 3=on 4=remote_apply" } #==============================================================# # 0210 pg_repl #==============================================================# pg_repl_94: name: pg_repl desc: PostgreSQL replication stat metrics 9.4 - 9.6 (with backend_xmin) query: |- SELECT appname, usename, address, pid, client_port, state, sync_state, sync_priority, backend_xmin, lsn, lsn - sent_lsn AS sent_diff, lsn - write_lsn AS write_diff, lsn - flush_lsn AS flush_diff, lsn - replay_lsn AS replay_diff, sent_lsn, write_lsn, flush_lsn, replay_lsn, 0::FLOAT AS write_lag, 0::FLOAT AS flush_lag, 0::FLOAT AS replay_lag, extract(EPOCH FROM current_timestamp) AS "time", extract(EPOCH FROM backend_start) AS launch_time FROM ( SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port, CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state, sync_priority, backend_xmin::TEXT::BIGINT AS backend_xmin, (('x' || lpad(split_part(current.loc::text, '/', 
1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn, (('x' || lpad(split_part(sent_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(sent_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS sent_lsn, (('x' || lpad(split_part(write_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(write_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS write_lsn, (('x' || lpad(split_part(flush_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(flush_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn, (('x' || lpad(split_part(replay_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(replay_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS replay_lsn, backend_start FROM pg_stat_replication, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current ) d; ttl: 10 min_version: 90400 max_version: 100000 tags: [ cluster ] metrics: - appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" } - usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" } - address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" } - pid: { usage: LABEL ,description: "Process ID of the WAL sender process" } - client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" } - state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" } - sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" } - sync_priority: { usage: 
GAUGE ,description: "Priority of this standby server for being chosen as the synchronous standby" } - backend_xmin: { usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback." } - lsn: { usage: COUNTER ,description: "Current log position on this server" } - sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" } - write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" } - flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" } - replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" } - sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" } - write_lsn: { usage: COUNTER ,description: "Last write-ahead log location written to disk by this standby server" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" } - replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" } - write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (N/A on 9.x)" } - flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (N/A on 9.x)" } - replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it (N/A on 9.x)" } - time: { usage: COUNTER ,description: "Current timestamp in unix epoch" } - launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client 
connected to this WAL sender" } pg_repl_92: name: pg_repl desc: PostgreSQL replication stat metrics 9.2 - 9.3 query: |- SELECT appname, usename, address, pid, client_port, state, sync_state, sync_priority, backend_xmin, lsn, lsn - sent_lsn AS sent_diff, lsn - write_lsn AS write_diff, lsn - flush_lsn AS flush_diff, lsn - replay_lsn AS replay_diff, sent_lsn, write_lsn, flush_lsn, replay_lsn, 0::FLOAT AS write_lag, 0::FLOAT AS flush_lag, 0::FLOAT AS replay_lag, extract(EPOCH FROM current_timestamp) AS "time", extract(EPOCH FROM backend_start) AS launch_time FROM ( SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port, CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state, sync_priority, 0::BIGINT AS backend_xmin, (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn, (('x' || lpad(split_part(sent_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(sent_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS sent_lsn, (('x' || lpad(split_part(write_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(write_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS write_lsn, (('x' || lpad(split_part(flush_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(flush_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn, (('x' || lpad(split_part(replay_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(replay_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS replay_lsn, backend_start FROM pg_stat_replication, 
(SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current ) d; ttl: 10 min_version: 90200 max_version: 90400 tags: [ cluster ] metrics: - appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" } - usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" } - address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" } - pid: { usage: LABEL ,description: "Process ID of the WAL sender process" } - client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" } - state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" } - sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" } - sync_priority: { usage: GAUGE ,description: "Priority of this standby server for being chosen as the synchronous standby" } - backend_xmin: { usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback (N/A before 9.4)" } - lsn: { usage: COUNTER ,description: "Current log position on this server" } - sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" } - write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" } - flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" } - replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" } - sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" } - write_lsn: { usage: COUNTER 
,description: "Last write-ahead log location written to disk by this standby server" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" } - replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" } - write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (N/A on 9.x)" } - flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (N/A on 9.x)" } - replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it (N/A on 9.x)" } - time: { usage: COUNTER ,description: "Current timestamp in unix epoch" } - launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client connected to this WAL sender" } pg_repl_91: name: pg_repl desc: PostgreSQL replication stat metrics 9.1 (procpid, no state/sync columns) query: |- SELECT appname, usename, address, pid, client_port, state, sync_state, sync_priority, backend_xmin, lsn, lsn - sent_lsn AS sent_diff, lsn - write_lsn AS write_diff, lsn - flush_lsn AS flush_diff, lsn - replay_lsn AS replay_diff, sent_lsn, write_lsn, flush_lsn, replay_lsn, 0::FLOAT AS write_lag, 0::FLOAT AS flush_lag, 0::FLOAT AS replay_lag, extract(EPOCH FROM current_timestamp) AS "time", extract(EPOCH FROM backend_start) AS launch_time FROM ( SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, procpid::TEXT AS pid, client_port, 0::INT AS state, 0::INT AS sync_state, 0::INT AS sync_priority, 0::BIGINT AS backend_xmin, (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || 
lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS lsn, (('x' || lpad(split_part(sent_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(sent_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS sent_lsn, (('x' || lpad(split_part(write_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(write_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS write_lsn, (('x' || lpad(split_part(flush_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(flush_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn, (('x' || lpad(split_part(replay_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(replay_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS replay_lsn, backend_start FROM pg_stat_replication, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current ) d; ttl: 10 min_version: 90100 max_version: 90200 tags: [ cluster ] metrics: - appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" } - usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" } - address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" } - pid: { usage: LABEL ,description: "Process ID of the WAL sender process" } - client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" } - state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" } - sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" } - sync_priority: { usage: GAUGE ,description: "Priority of this standby server for 
being chosen as the synchronous standby" } - backend_xmin: { usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback (N/A before 9.4)" } - lsn: { usage: COUNTER ,description: "Current log position on this server" } - sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" } - write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" } - flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" } - replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" } - sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" } - write_lsn: { usage: COUNTER ,description: "Last write-ahead log location written to disk by this standby server" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" } - replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" } - write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it (N/A on 9.x)" } - flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it (N/A on 9.x)" } - replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it (N/A on 9.x)" } - time: { usage: COUNTER ,description: "Current timestamp in unix epoch" } - launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client connected to this WAL sender" } 
#==============================================================#
#                     0220 pg_sync_standby                     #
#==============================================================#
pg_sync_standby:
  name: pg_sync_standby
  desc: PostgreSQL synchronous standby status and names
  query: |-
    SELECT CASE WHEN names <> '' THEN names ELSE '' END AS names, CASE WHEN names <> '' THEN 1 ELSE 0 END AS enabled FROM (SELECT current_setting('synchronous_standby_names') AS names) n;
  ttl: 10
  min_version: 90100
  tags: [ cluster ]
  metrics:
    - names: { usage: LABEL ,description: "List of standby servers that can support synchronous replication" }
    - enabled: { usage: GAUGE ,description: "Synchronous commit enabled, 1 if enabled, 0 if disabled" }

#==============================================================#
#                      0230 pg_downstream                      #
#==============================================================#
pg_downstream:
  name: pg_downstream
  desc: PostgreSQL replication client count (no state column on 9.1)
  query: |-
    SELECT 'connected' AS state, count(*) AS count FROM pg_stat_replication;
  ttl: 10
  min_version: 90100
  max_version: 90200
  tags: [ cluster ]
  metrics:
    - state: { usage: LABEL ,description: "Replication client state" }
    - count: { usage: GAUGE ,description: "Count of replication clients by state" }

pg_downstream_92:
  name: pg_downstream
  desc: PostgreSQL replication client count (group by state)
  query: |-
    SELECT state, count(*) AS count FROM pg_stat_replication GROUP BY state;
  ttl: 10
  min_version: 90200
  tags: [ cluster ]
  metrics:
    - state: { usage: LABEL ,description: "Replication client state" }
    - count: { usage: GAUGE ,description: "Count of replication clients by state" }

#==============================================================#
#                         0240 pg_slot                         #
#==============================================================#
# Legacy slot collectors decode textual LSNs by hand and use the pre-10
# pg_last_xlog_replay_location()/pg_current_xlog_location() functions.
pg_slot_96:
  name: pg_slot
  desc: PostgreSQL replication slot metrics 9.6 (with active_pid, confirmed_flush_lsn)
  query: |-
    SELECT slot_name, slot_type, plugin, database AS datname, datoid, active_pid, active, FALSE AS temporary,
      xmin::TEXT::BIGINT AS xmin, catalog_xmin::TEXT::BIGINT AS catalog_xmin,
      restart_lsn, confirm_lsn, current_lsn - restart_lsn AS retained_bytes
    FROM (
      SELECT slot_name, slot_type, plugin, database, datoid, active_pid, active, xmin, catalog_xmin,
        (('x' || lpad(split_part(restart_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(restart_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS restart_lsn,
        (('x' || lpad(split_part(confirmed_flush_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(confirmed_flush_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS confirm_lsn,
        (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS current_lsn
      FROM pg_replication_slots, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90600
  max_version: 100000
  tags: [ cluster, primary ]
  metrics:
    - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" }
    - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
    - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
    - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
    - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
    - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." }
    - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
    - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot (N/A on 9.x, always 0)" }
    - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
    - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
    - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
    - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." }
    - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }

pg_slot_95:
  name: pg_slot
  desc: PostgreSQL replication slot metrics 9.5 (no confirmed_flush_lsn)
  query: |-
    SELECT slot_name, slot_type, plugin, database AS datname, datoid, active_pid, active, FALSE AS temporary,
      xmin::TEXT::BIGINT AS xmin, catalog_xmin::TEXT::BIGINT AS catalog_xmin,
      restart_lsn, confirm_lsn, current_lsn - restart_lsn AS retained_bytes
    FROM (
      SELECT slot_name, slot_type, plugin, database, datoid, active_pid, active, xmin, catalog_xmin,
        (('x' || lpad(split_part(restart_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(restart_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS restart_lsn,
        0::BIGINT AS confirm_lsn,
        (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS current_lsn
      FROM pg_replication_slots, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90500
  max_version: 90600
  tags: [ cluster, primary ]
  metrics:
    - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" }
    - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
    - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
    - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
    - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
    - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." }
    - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
    - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot (N/A on 9.x, always 0)" }
    - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
    - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
    - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
    - confirm_lsn: { usage: COUNTER ,description: "Confirmed flush lsn (N/A before 9.6, always 0)" }
    - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }

pg_slot_94:
  name: pg_slot
  desc: PostgreSQL replication slot metrics 9.4 (no active_pid, confirmed_flush_lsn)
  query: |-
    SELECT slot_name, slot_type, plugin, database AS datname, datoid, active_pid, active, FALSE AS temporary,
      xmin::TEXT::BIGINT AS xmin, catalog_xmin::TEXT::BIGINT AS catalog_xmin,
      restart_lsn, confirm_lsn, current_lsn - restart_lsn AS retained_bytes
    FROM (
      SELECT slot_name, slot_type, plugin, database, datoid, NULL::INT AS active_pid, active, xmin, catalog_xmin,
        (('x' || lpad(split_part(restart_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(restart_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS restart_lsn,
        0::BIGINT AS confirm_lsn,
        (('x' || lpad(split_part(current.loc::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(current.loc::text, '/', 2), 8, '0'))::bit(32)::bigint) AS current_lsn
      FROM pg_replication_slots, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_xlog_replay_location() ELSE pg_current_xlog_location() END AS loc) current
    ) d;
  ttl: 10
  min_version: 90400
  max_version: 90500
  tags: [ cluster, primary ]
  metrics:
    - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" }
    - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" }
    - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." }
    - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" }
    - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" }
    - active_pid: { usage: GAUGE ,description: "Process ID is not available before 9.5 (NULL)" }
    - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" }
    - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot (N/A on 9.x, always 0)" }
    - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." }
    - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." }
    - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" }
    - confirm_lsn: { usage: COUNTER ,description: "Confirmed flush lsn (N/A before 9.6, always 0)" }
    - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" }

#==============================================================#
#                         0250 pg_recv                         #
#==============================================================#
pg_recv_96:
  name: pg_recv
  desc: PostgreSQL walreceiver metrics (9.6 - 12)
  # FIX: regexp_match() only exists on PostgreSQL 10+, but this collector
  # declares min_version 90600 — the original query errored on 9.6.
  # substring(string from pattern) returns the first capture group and is
  # available on all supported versions, so it is used instead. conninfo
  # is expected to contain a single host=/port= key pair.
  query: |-
    SELECT substring(conninfo from 'host=(\S+)') AS sender_host,
      substring(conninfo from 'port=(\S+)') AS sender_port,
      coalesce(slot_name, 'NULL') AS slot_name, pid,
      CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state,
      (('x' || lpad(split_part(receive_start_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(receive_start_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS init_lsn,
      receive_start_tli AS init_tli,
      (('x' || lpad(split_part(received_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(received_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS flush_lsn,
      received_tli AS flush_tli,
      (('x' || lpad(split_part(latest_end_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(latest_end_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS reported_lsn,
      last_msg_send_time AS msg_send_time, last_msg_receipt_time AS msg_recv_time, latest_end_time AS reported_time, now() AS time
    FROM pg_stat_wal_receiver;
  ttl: 10
  min_version: 90600
  max_version: 130000
  tags: [ cluster, replica ]
  metrics:
    - sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" }
    - sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." }
    - slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" }
    - pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" }
    - state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" }
    - init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" }
    - init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" }
    - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" }
    - flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" }
    - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" }
    - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" }
    - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" }
    - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" }
    - time: { usage: GAUGE ,description: "Time of current snapshot" }

#==============================================================#
#                        0270 pg_origin                        #
#==============================================================#
# skip by default, require additional privilege setup
# GRANT SELECT ON pg_replication_origin, pg_replication_origin_status TO pg_monitor;
pg_origin:
  name: pg_origin
  desc: PostgreSQL replay state (approximate) for a certain origin
  query: |-
    SELECT roname,
      (('x' || lpad(split_part(remote_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(remote_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS remote_lsn,
      (('x' || lpad(split_part(local_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(local_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS local_lsn
    FROM pg_replication_origin o LEFT JOIN pg_replication_origin_status os ON o.roident = os.local_id;
  ttl: 10
  min_version: 90500
  skip: true
  tags: [ cluster ]
  metrics:
    - roname: { usage: LABEL ,description: "The external, user defined, name of a replication origin." }
    - remote_lsn: { usage: COUNTER ,description: "The origin node's LSN up to which data has been replicated." }
    - local_lsn: { usage: COUNTER ,description: "This node's LSN at which remote_lsn has been replicated." }

#==============================================================#
#                         0310 pg_size                         #
#==============================================================#
pg_size:
  name: pg_size
  desc: PostgreSQL database size (legacy 9.1-9.6)
  query: |-
    SELECT datname, pg_database_size(oid) AS bytes FROM pg_database;
  ttl: 60
  timeout: 1
  min_version: 90100
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Database name" }
    - bytes: { usage: GAUGE ,description: "Database size in bytes" }

#==============================================================#
#                       0320 pg_archiver                       #
#==============================================================#
pg_archiver:
  name: pg_archiver
  desc: PostgreSQL archiver process statistics
  query: |-
    SELECT archived_count AS finish_count,failed_count, extract(epoch FROM last_archived_time) AS finish_time, extract(epoch FROM last_failed_time) AS failed_time, extract(epoch FROM stats_reset) AS reset_time FROM pg_stat_archiver;
  ttl: 60
  min_version: 90400
  tags: [ cluster ]
  metrics:
    - finish_count: { usage: COUNTER ,description: "Number of WAL files that have been successfully archived" }
    - failed_count: { usage: COUNTER ,description: "Number of failed attempts for archiving WAL files" }
    - finish_time: { usage: GAUGE ,description: "Time of the last successful archive operation" }
    - failed_time: { usage: GAUGE ,description: "Time of the last failed archival operation" }
    - reset_time: { usage: GAUGE ,description: "Time at which archive statistics were last reset" }

#==============================================================#
#                       0330 pg_bgwriter                       #
#==============================================================#
# https://pgpedia.info/p/pg_stat_bgwriter.html
pg_bgwriter_94:
  name: pg_bgwriter
  desc: "PostgreSQL background writer metrics (PG 9.4-16)"
  query: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, buffers_clean, buffers_backend, maxwritten_clean, buffers_backend_fsync, buffers_alloc, extract(EPOCH FROM
stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 90400 max_version: 170000 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - checkpoint_sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - buffers_checkpoint: { usage: COUNTER ,description: "Number of buffers written during checkpoints" } - buffers_clean: { usage: COUNTER ,description: "Number of buffers written by the background writer" } - buffers_backend: { usage: COUNTER ,description: "Number of buffers written directly by a backend" } - maxwritten_clean: { usage: COUNTER ,description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" } - buffers_backend_fsync: { usage: COUNTER ,description: "Number of times a backend had to execute its own fsync call" } - buffers_alloc: { usage: COUNTER ,description: "Number of buffers allocated" } - reset_time: { usage: GAUGE ,description: "Time at which bgwriter statistics were last reset" } pg_bgwriter_91: name: pg_bgwriter desc: "PostgreSQL background writer metrics (PG 9.1-9.3)" query: SELECT checkpoints_timed, checkpoints_req, 0::BIGINT AS checkpoint_write_time, 0::BIGINT AS checkpoint_sync_time, buffers_checkpoint, buffers_clean, buffers_backend, maxwritten_clean, buffers_backend_fsync, buffers_alloc, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 90100 max_version: 90400 tags: [ cluster ] metrics: - checkpoints_timed: { 
usage: COUNTER ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time spent writing checkpoint files, in seconds (N/A on 9.1-9.3, always 0)" } - checkpoint_sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time spent syncing checkpoint files, in seconds (N/A on 9.1-9.3, always 0)" } - buffers_checkpoint: { usage: COUNTER ,description: "Number of buffers written during checkpoints" } - buffers_clean: { usage: COUNTER ,description: "Number of buffers written by the background writer" } - buffers_backend: { usage: COUNTER ,description: "Number of buffers written directly by a backend" } - maxwritten_clean: { usage: COUNTER ,description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" } - buffers_backend_fsync: { usage: COUNTER ,description: "Number of times a backend had to execute its own fsync call" } - buffers_alloc: { usage: COUNTER ,description: "Number of buffers allocated" } - reset_time: { usage: GAUGE ,description: "Time at which bgwriter statistics were last reset" } #==============================================================# # 0331 pg_checkpointer #==============================================================# pg_checkpointer_94: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 9.4-16" query: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 90400 max_version: 170000 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,rename: req ,description: "Number 
of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,rename: write_time ,scale: 1e-3 ,description: "Total amount of time that has been spent writing checkpoint files, in seconds" } - checkpoint_sync_time: { usage: COUNTER ,rename: sync_time ,scale: 1e-3 ,description: "Total amount of time that has been spent synchronizing checkpoint files to disk, in seconds" } - buffers_checkpoint: { usage: COUNTER ,rename: buffers_written ,description: "Number of buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } pg_checkpointer_91: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 9.1-9.3" query: SELECT checkpoints_timed, checkpoints_req, 0::BIGINT AS checkpoint_write_time, 0::BIGINT AS checkpoint_sync_time, buffers_checkpoint, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 90100 max_version: 90400 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,rename: req ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,rename: write_time ,scale: 1e-3 ,description: "Total amount of time that has been spent writing checkpoint files, in seconds (N/A on 9.1-9.3, always 0)" } - checkpoint_sync_time: { usage: COUNTER ,rename: sync_time ,scale: 1e-3 ,description: "Total amount of time that has been spent synchronizing checkpoint files to disk, in seconds (N/A on 9.1-9.3, always 0)" } - buffers_checkpoint: { usage: COUNTER ,rename: buffers_written ,description: "Number of buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } 
#==============================================================#
# 0340 pg_ssl
#==============================================================#
pg_ssl:
  name: pg_ssl
  desc: PostgreSQL SSL client connection count
  query: |
    SELECT count(*) FILTER (WHERE ssl) AS enabled, count(*) FILTER ( WHERE NOT ssl) AS disabled FROM pg_stat_ssl;
  ttl: 10
  min_version: 90500
  tags: [ cluster ]
  metrics:
    - enabled: { usage: GAUGE ,description: "Number of client connection that use ssl" }
    - disabled: { usage: GAUGE ,description: "Number of client connection that does not use ssl" }

#==============================================================#
# 0350 pg_checkpoint
#==============================================================#
pg_checkpoint:
  name: pg_checkpoint
  desc: checkpoint information from pg_control_checkpoint (9.6)
  query: |-
    SELECT (('x' || lpad(split_part(checkpoint_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(checkpoint_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS checkpoint_lsn,
           (('x' || lpad(split_part(redo_location::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(redo_location::text, '/', 2), 8, '0'))::bit(32)::bigint) AS redo_lsn,
           timeline_id AS tli, prev_timeline_id AS prev_tli, full_page_writes,
           split_part(next_xid, ':', 1) AS next_xid_epoch, split_part(next_xid, ':', 2) AS next_xid, next_oid::BIGINT,
           next_multixact_id::text::BIGINT, next_multi_offset::text::BIGINT,
           oldest_xid::text::BIGINT, oldest_xid_dbid::text::BIGINT, oldest_active_xid::text::BIGINT,
           oldest_multi_xid::text::BIGINT, oldest_multi_dbid::BIGINT,
           oldest_commit_ts_xid::text::BIGINT, newest_commit_ts_xid::text::BIGINT,
           checkpoint_time AS time, extract(epoch from now() - checkpoint_time) AS elapse
    FROM pg_control_checkpoint();
  ttl: 60
  min_version: 90600
  tags: [ cluster ]
  metrics:
    - checkpoint_lsn: { usage: COUNTER ,description: "Latest checkpoint location" }
    - redo_lsn: { usage: COUNTER ,description: "Latest checkpoint's REDO location" }
    - tli: { usage: COUNTER ,description: "Latest checkpoint's TimeLineID" }
    - prev_tli: { usage: COUNTER ,description: "Latest checkpoint's PrevTimeLineID" }
    - full_page_writes: { usage: GAUGE ,description: "Latest checkpoint's full_page_writes enabled" }
    - next_xid_epoch: { usage: COUNTER ,description: "Latest checkpoint's NextXID epoch" }
    - next_xid: { usage: COUNTER ,description: "Latest checkpoint's NextXID xid" }
    - next_oid: { usage: COUNTER ,description: "Latest checkpoint's NextOID" }
    - next_multixact_id: { usage: COUNTER ,description: "Latest checkpoint's NextMultiXactId" }
    - next_multi_offset: { usage: COUNTER ,description: "Latest checkpoint's NextMultiOffset" }
    - oldest_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestXID" }
    - oldest_xid_dbid: { usage: GAUGE ,description: "Latest checkpoint's oldestXID's DB OID" }
    - oldest_active_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestActiveXID" }
    - oldest_multi_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestMultiXid" }
    - oldest_multi_dbid: { usage: GAUGE ,description: "Latest checkpoint's oldestMulti's DB OID" }
    - oldest_commit_ts_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestCommitTsXid" }
    - newest_commit_ts_xid: { usage: COUNTER ,description: "Latest checkpoint's newestCommitTsXid" }
    - time: { usage: COUNTER ,description: "Time of latest checkpoint" }
    - elapse: { usage: GAUGE ,description: "Seconds elapsed since latest checkpoint in seconds" }

#==============================================================#
# 0355 pg_timeline
#==============================================================#
pg_timeline:
  name: pg_timeline
  desc: Current timeline ID from primary or replica
  # replica: take the timeline from the walreceiver; primary: from control data
  query: |
    SELECT COALESCE(
      (SELECT received_tli FROM pg_stat_wal_receiver),
      (SELECT timeline_id FROM pg_control_checkpoint())
    ) AS id;
  ttl: 10
  min_version: 90600
  tags: [ cluster ]
  metrics:
    - id: { usage: GAUGE ,description: "Current timeline ID" }
#==============================================================#
# 0360 pg_recovery
#==============================================================#
pg_recovery:
  name: pg_recovery
  desc: PostgreSQL control recovery metrics (9.6)
  query: |
    SELECT min_recovery_end_timeline AS min_timeline,
           (('x' || lpad(split_part(min_recovery_end_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(min_recovery_end_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS min_lsn,
           (('x' || lpad(split_part(backup_start_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(backup_start_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS backup_start_lsn,
           (('x' || lpad(split_part(backup_end_lsn::text, '/', 1), 8, '0'))::bit(32)::bigint * 4294967296 + ('x' || lpad(split_part(backup_end_lsn::text, '/', 2), 8, '0'))::bit(32)::bigint) AS backup_end_lsn,
           end_of_backup_record_required AS require_record
    FROM pg_control_recovery();
  ttl: 10
  min_version: 90600
  tags: [ cluster, replica ]
  metrics:
    - min_timeline: { usage: COUNTER ,description: "Min recovery ending loc's timeline" }
    - min_lsn: { usage: COUNTER ,description: "Minimum recovery ending location" }
    - backup_start_lsn: { usage: COUNTER ,description: "Backup start location" }
    - backup_end_lsn: { usage: COUNTER ,description: "Backup end location" }
    - require_record: { usage: GAUGE ,description: "End-of-backup record required" }

#==============================================================#
# 0410 pg_activity
#==============================================================#
pg_activity_92:
  name: pg_activity
  desc: PostgreSQL backend activity group by database and state (9.2+)
  # cross join database x state so that zero rows still show up as count 0
  query: |-
    SELECT datname, state, coalesce(count, 0) AS count,
           coalesce(max_duration, 0) AS max_duration,
           coalesce(max_tx_duration, 0) AS max_tx_duration,
           coalesce(max_conn_duration, 0) AS max_conn_duration
    FROM (SELECT d.datname, a.state FROM pg_database d, unnest(ARRAY ['active','idle','idle in transaction','idle in transaction (aborted)','fastpath function call','disabled']) a(state) WHERE d.datallowconn AND NOT d.datistemplate) base
    LEFT JOIN (SELECT datname, state, count(*) AS count,
                      max(extract(epoch from now() - state_change)) AS max_duration,
                      max(extract(epoch from now() - xact_start)) AS max_tx_duration,
                      max(extract(epoch from now() - backend_start)) AS max_conn_duration
               FROM pg_stat_activity WHERE pid <> pg_backend_pid() GROUP BY 1,2) data USING (datname,state);
  ttl: 10
  min_version: 90200
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" }
    - state: { usage: LABEL ,description: "Current overall state of this backend." }
    - count: { usage: GAUGE ,description: "Count of connection among (datname,state)" }
    - max_duration: { usage: GAUGE ,description: "Max duration since last state change among (datname, state)" }
    - max_tx_duration: { usage: GAUGE ,description: "Max transaction duration since state change among (datname, state)" }
    - max_conn_duration: { usage: GAUGE ,description: "Max backend session duration since state change among (datname, state)" }

pg_activity_91:
  name: pg_activity
  desc: PostgreSQL backend activity group by database (9.1)
  # 9.1 has procpid instead of pid, and no state column
  query: |
    SELECT datname, 'active' AS state, count(*) AS count,
           max(extract(epoch from now() - query_start)) AS max_duration,
           max(extract(epoch from now() - xact_start)) AS max_tx_duration,
           max(extract(epoch from now() - backend_start)) AS max_conn_duration
    FROM pg_stat_activity WHERE procpid <> pg_backend_pid() AND datname IS NOT NULL GROUP BY datname;
  ttl: 10
  min_version: 90100
  max_version: 90200
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" }
    - state: { usage: LABEL ,description: "Current overall state of this backend (always active on 9.1)" }
    - count: { usage: GAUGE ,description: "Count of connection among (datname,state)" }
    - max_duration: { usage: GAUGE ,description: "Max duration since query start among (datname)" }
    - max_tx_duration: { usage: GAUGE ,description: "Max transaction duration among (datname)" }
    - max_conn_duration: { usage: GAUGE ,description: "Max backend session duration among (datname)" }

#==============================================================#
# 0420 pg_wait
#==============================================================#
pg_wait_96:
  name: pg_wait
  desc: PostgreSQL backend client count group by wait event type (9.6)
  query: |
    SELECT coalesce(datname, '_system') AS datname, coalesce(wait_event_type, 'Running') AS event, count(*) AS count
    FROM pg_stat_activity GROUP BY 1, 2;
  ttl: 10
  min_version: 90600
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database, _system for global process" }
    - event: { usage: LABEL ,description: "Wait event type" }
    - count: { usage: GAUGE ,description: "Count of WaitEvent on target database" }

pg_wait_91:
  name: pg_wait
  desc: PostgreSQL backend client count group by waiting flag (9.1-9.5)
  # before 9.6 there is only the boolean waiting flag, no wait_event_type
  query: |
    SELECT coalesce(datname, '_system') AS datname, CASE WHEN waiting THEN 'Waiting' ELSE 'Running' END AS event, count(*) AS count
    FROM pg_stat_activity GROUP BY 1, 2;
  ttl: 10
  min_version: 90100
  max_version: 90600
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database, _system for global process" }
    - event: { usage: LABEL ,description: "Waiting flag, Waiting or Running" }
    - count: { usage: GAUGE ,description: "Backend count group by waiting flag" }

#==============================================================#
# 0440 pg_xact
#==============================================================#
pg_xact:
  name: pg_xact
  desc: PostgreSQL transaction identifier metrics
  query: |
    WITH snap(v) AS (SELECT txid_current_snapshot()),
         xset(v) AS (SELECT txid_snapshot_xip(v) FROM snap),
         xnum(v) AS (SELECT count(*) FROM xset),
         xmin(v) AS (SELECT txid_snapshot_xmin(v) FROM snap),
         xmax(v) AS (SELECT txid_snapshot_xmax(v) FROM snap)
    SELECT xmin.v AS xmin, xmax.v AS xmax, xnum.v AS xnum FROM xmin, xmax, xnum;
  ttl: 10
  min_version: 90100
  tags: [ cluster ]
  metrics:
    - xmin: { usage: COUNTER ,description: "Earliest txid that is still active" }
    - xmax: { usage: COUNTER ,description: "First as-yet-unassigned txid" }
    - xnum: { usage: GAUGE ,description: "Current active transaction count" }

#==============================================================#
# 0450 pg_lock
#==============================================================#
pg_lock:
  name: pg_lock
  desc: PostgreSQL lock distribution by mode and database
  # cross join database x lock mode so that absent modes still report 0
  query: |
    SELECT datname, mode, coalesce(count, 0) AS count
    FROM (SELECT d.oid AS database, d.datname, l.mode FROM pg_database d, unnest(ARRAY ['AccessShareLock','RowShareLock','RowExclusiveLock','ShareUpdateExclusiveLock', 'ShareLock','ShareRowExclusiveLock','ExclusiveLock','AccessExclusiveLock']) l(mode) WHERE d.datallowconn AND NOT d.datistemplate) base
    LEFT JOIN (SELECT database, mode, count(*) AS count FROM pg_locks WHERE database IS NOT NULL GROUP BY 1, 2) cnt USING (database, mode);
  ttl: 10
  min_version: 90100
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" }
    - mode: { usage: LABEL ,description: "Name of the lock mode held or desired by this process" }
    - count: { usage: GAUGE ,description: "Number of locks of corresponding mode and database" }

#==============================================================#
# 0460 pg_query
#==============================================================#
pg_query_94:
  name: pg_query
  desc: PostgreSQL query statement metrics, require pg_stat_statements installed, 9.4 - 12
  query: |-
    SELECT datname, queryid AS query, sum(calls) AS calls, sum(rows) AS rows,
           sum(total_time) AS exec_time, sum(blk_read_time) + sum(blk_write_time) AS io_time,
           sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read,
           sum(shared_blks_dirtied) AS sblk_dirtied, sum(shared_blks_written) AS sblk_written
    FROM pg_stat_statements(false) s JOIN pg_database d ON s.dbid = d.oid
    WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128;
  ttl: 10
  timeout: 2
  min_version: 90400
  max_version: 130000
  tags: [ cluster, "extension:pg_stat_statements" ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of database" }
    - query: { usage: LABEL ,description: "QueryID generated from internal hash code, computed from the statement's parse tree" }
    - calls: { usage: COUNTER ,description: "Number of times the statement was executed" }
    - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" }
    - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" }
    - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds" }
    - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" }
    - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" }
    - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement" }
    - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" }

pg_query_91:
  name: pg_query
  desc: PostgreSQL query statement metrics, require pg_stat_statements installed, 9.1 - 9.3 (no queryid)
  query: |-
    SELECT datname, md5(query) AS query, sum(calls) AS calls, sum(rows) AS rows,
           sum(total_time) AS exec_time, 0::FLOAT AS io_time,
           sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read,
           0::BIGINT AS sblk_dirtied, sum(shared_blks_written) AS sblk_written
    FROM pg_stat_statements s JOIN pg_database d ON s.dbid = d.oid
    WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128;
  ttl: 10
  timeout: 2
  min_version: 90100
  max_version: 90400
  tags: [ cluster, "extension:pg_stat_statements" ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of database" }
    - query: { usage: LABEL ,description: "MD5 hash of query text (no queryid before 9.4)" }
    - calls: { usage: COUNTER ,description: "Number of times the statement was executed" }
    - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" }
    - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" }
    - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds (N/A before 9.4, always 0)" }
    - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" }
    - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" }
    - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement (N/A before 9.4, always 0)" }
    - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" }

#==============================================================#
# 0610 pg_db
#==============================================================#
pg_db_92:
  name: pg_db
  desc: PostgreSQL database stats from pg_stat_database (9.2 - 9.6)
  query: |-
    SELECT d.datname, datid,age(datfrozenxid) AS age,
           datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid,
           numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,
           blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified,
           conflicts,temp_files,temp_bytes,deadlocks,blk_read_time,blk_write_time,
           extract(EPOCH FROM stats_reset) AS reset_time
    FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid;
  ttl: 10
  min_version: 90200
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database" }
    - datid: { usage: GAUGE ,description: "OID of the database" }
    - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" }
    - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" }
    - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." }
    - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." }
    - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" }
    - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" }
    - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" }
    - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" }
    - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" }
    - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" }
    - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" }
    - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" }
    - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" }
    - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" }
    - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" }
    - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" }
    - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" }
    - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" }
    - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" }
    - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database" }
    - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." }
    - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database" }
    - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" }
    - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" }
    - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" }

pg_db_91:
  name: pg_db
  desc: PostgreSQL database stats from pg_stat_database (9.1, fewer columns)
  # temp_files/temp_bytes/deadlocks/blk_*_time were added in 9.2; padded with 0 here
  query: |-
    SELECT d.datname, datid,age(datfrozenxid) AS age,
           datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid,
           numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,
           blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified,
           conflicts, 0::BIGINT AS temp_files, 0::BIGINT AS temp_bytes, 0::BIGINT AS deadlocks, 0::BIGINT AS blk_read_time, 0::BIGINT AS blk_write_time,
           extract(EPOCH FROM stats_reset) AS reset_time
    FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid;
  ttl: 10
  min_version: 90100
  max_version: 90200
  tags: [ cluster ]
  metrics:
    - datname: { usage: LABEL ,description: "Name of the database" }
    - datid: { usage: GAUGE ,description: "OID of the database" }
    - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" }
    - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" }
    - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." }
    - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." }
    - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" }
    - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" }
    - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" }
    - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" }
    - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" }
    - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" }
    - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" }
    - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" }
    - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" }
    - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" }
    - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" }
    - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" }
    - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" }
    - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" }
    - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" }
    - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database (N/A on 9.1, always 0)" }
    - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database (N/A on 9.1, always 0)" }
    - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database (N/A on 9.1, always 0)" }
    - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds (N/A on 9.1, always 0)" }
    - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds (N/A on 9.1, always 0)" }
    - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" }

#==============================================================#
# 0620 pg_db_confl
#==============================================================#
# https://pgpedia.info/p/pg_stat_database_conflicts.html
pg_db_confl:
  name: pg_db_confl
  desc: PostgreSQL database conflicts metrics for pg 9.1 - 9.6
  query: SELECT datid,datname,confl_tablespace,confl_lock,confl_snapshot,confl_bufferpin,confl_deadlock FROM pg_stat_database_conflicts;
  ttl: 10
  min_version: 90100
  tags: [ cluster, replica ]
  metrics:
    - datid: { usage: DISCARD }
    - datname: { usage: LABEL ,description: "Name of this database" }
    - confl_tablespace: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to dropped tablespaces" }
    - confl_lock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to lock timeouts" }
    - confl_snapshot: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to old snapshots" }
    - confl_bufferpin: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to pinned buffers" }
    - confl_deadlock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to deadlocks" }

#==============================================================#
# 0700 pg_table
#==============================================================#
# per-table statistics from pg_stat_user_tables / pg_statio_user_tables
pg_table_94:
  name: pg_table
  desc: PostgreSQL table metrics 9.4-9.6
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind,
           c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols,
           psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan,
           psut.seq_tup_read + psut.idx_tup_fetch AS tup_read,
           psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_live_tup,psut.n_dead_tup,
           psut.n_mod_since_analyze,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,
           psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count,
           psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit
    FROM pg_class c
    JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
    LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid
    LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid
    WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar'
      AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor')
      AND c.relkind = ANY (ARRAY ['r','m','t','p'])
    ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 90400
  metrics:
    - datname: { usage: LABEL, description: "Database name of this table" }
    - relname: { usage: LABEL, description: "Relation name of this table" }
    - relid: { usage: GAUGE, description: "Relation oid of this table" }
    - kind: { usage: GAUGE, description: "Relation kind r/table/114,m/mview/109,t/toast/116" }
    - pages: { usage: GAUGE, description: "Size of the on-disk representation of this table in pages" }
    - tuples: { usage: GAUGE, description: "Estimated number of rows in this table" }
    - frozenxid: { usage: GAUGE, description: "All txid before this have been frozen on this table" }
    - age: { usage: GAUGE, description: "Age of this table in vacuum cycles" }
    - ncols: { usage: GAUGE, description: "Number of columns in the table" }
    - seq_scan: { usage: COUNTER, default: 0, description: "Number of sequential scans initiated on this table" }
    - seq_tup_read: { usage: COUNTER, default: 0, description: "Number of live rows fetched by sequential scans" }
    - idx_scan: { usage: COUNTER, default: 0, description: "Number of index scans initiated on this table" }
    - idx_tup_fetch: { usage: COUNTER, default: 0, description: "Number of live rows fetched by index scans" }
    - tbl_scan: { usage: COUNTER, default: 0, description: "Number of scans initiated on this table" }
    - tup_read: { usage: COUNTER, default: 0, description: "Number of live rows fetched by scans" }
    - n_tup_ins: { usage: COUNTER, default: 0, description: "Number of rows inserted" }
    - n_tup_upd: { usage: COUNTER, default: 0, description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del: { usage: COUNTER, default: 0, description: "Number of rows deleted" }
    - n_tup_mod: { usage: COUNTER, default: 0, description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd: { usage: COUNTER, default: 0, description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_live_tup: { usage: GAUGE, description: "Estimated number of live rows" }
    - n_dead_tup: { usage: GAUGE, description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE, description: "Estimated number of rows modified since this table was last analyzed" }
    - last_vacuum: { usage: DISCARD, description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum: { usage: DISCARD, description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze: { usage: DISCARD, description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze: { usage: DISCARD, description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - vacuum_count: { usage: COUNTER, default: 0, description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count: { usage: COUNTER, default: 0, description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count: { usage: COUNTER, default: 0, description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count: { usage: COUNTER, default: 0, description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - heap_blks_read: { usage: COUNTER, default: 0, description: "Number of disk blocks read from this table" }
    - heap_blks_hit: { usage: COUNTER, default: 0, description: "Number of buffer hits in this table" }
    - idx_blks_read: { usage: COUNTER, default: 0, description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit: { usage: COUNTER, default: 0, description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read: { usage: DISCARD, default: 0, description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit: { usage: DISCARD, default: 0, description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read: { usage: DISCARD, default: 0, description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit: { usage: DISCARD, default: 0, description: "Number of buffer hits in this table's TOAST table indexes (if any)" }

# same collector for 9.1-9.3: pg_stat_user_tables has no n_mod_since_analyze there,
# so a NULL::BIGINT placeholder keeps the column list identical to pg_table_94
pg_table_91:
  name: pg_table
  desc: PostgreSQL table metrics 9.1-9.3 (no n_mod_since_analyze)
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind,
           c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols,
           psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan,
           psut.seq_tup_read + psut.idx_tup_fetch AS tup_read,
           psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_live_tup,psut.n_dead_tup,
           NULL::BIGINT AS n_mod_since_analyze,
           psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,
           psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count,
           psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit
    FROM pg_class c
    JOIN pg_namespace nsp ON c.relnamespace = nsp.oid
    LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid
    LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid
    WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar'
      AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor')
      AND c.relkind = ANY (ARRAY ['r','m','t','p'])
    ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 90100
  max_version: 90400
  metrics:
    - datname: { usage: LABEL, description: "Database name of this table" }
    - relname: { usage: LABEL, description: "Relation name of this table" }
    - relid: { usage: GAUGE, description: "Relation oid of this table" }
    - kind: { usage: GAUGE, description: "Relation kind r/table/114,t/toast/116" }
    - pages: { usage: GAUGE, description: "Size of the on-disk representation of this table in pages" }
    - tuples: { usage: GAUGE, description: "Estimated number of rows in this table" }
    - frozenxid: { usage: GAUGE, description: "All txid before this have been frozen on this table" }
    - age: { usage: GAUGE, description: "Age of this table in vacuum cycles" }
    - ncols: { usage: GAUGE, description: "Number of columns in the table" }
    - seq_scan: { usage: COUNTER, default: 0, description: "Number of sequential scans initiated on this table" }
    - seq_tup_read: { usage: COUNTER, default: 0, description: "Number of live rows fetched by sequential scans" }
    - idx_scan: { usage: COUNTER, default: 0, description: "Number of index scans initiated on this table" }
    - idx_tup_fetch: { usage: COUNTER, default: 0, description: "Number of live rows fetched by index scans" }
    - tbl_scan: { usage: COUNTER, default: 0, description: "Number of scans initiated on this table" }
    - tup_read: { usage: COUNTER, default: 0, description: "Number of live rows fetched by scans" }
    - n_tup_ins: { usage: COUNTER, default: 0, description: "Number of rows inserted" }
    - n_tup_upd: { usage: COUNTER, default: 0, description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del: { usage: COUNTER, default: 0, description: "Number of rows deleted" }
    - n_tup_mod: { usage: COUNTER, default: 0, description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd: { usage: COUNTER, default: 0, description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_live_tup: { usage: GAUGE, description: "Estimated number of live rows" }
    - n_dead_tup: { usage: GAUGE, description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE, description: "Estimated number of rows modified since this table was last analyzed (N/A on 9.1-9.3, NULL)" }
    - last_vacuum: { usage: DISCARD, description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum: { usage: DISCARD, description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze: { usage: DISCARD, description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze: { usage: DISCARD, description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - vacuum_count: { usage: COUNTER, default: 0, description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count: { usage: COUNTER, default: 0, description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count: { usage: COUNTER, default: 0, description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count: { usage: COUNTER, default: 0, description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - heap_blks_read: { usage: COUNTER, default: 0, description: "Number of disk blocks read from this table" }
    - heap_blks_hit: { usage: COUNTER, default: 0, description: "Number of buffer hits in this table" }
    - idx_blks_read: { usage: COUNTER, default: 0, description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit: { usage: COUNTER, default: 0, description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read: { usage: DISCARD, default: 0, description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit: { usage: DISCARD, default: 0, description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read: { usage: DISCARD, default: 0, description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit: { usage: DISCARD, default: 0, description: "Number of buffer hits in this table's TOAST table indexes (if any)" }

#==============================================================#
# 0710 pg_index
#==============================================================#
pg_index:
  name: pg_index
  desc: PostgreSQL index metrics (legacy 9.1-9.6)
  query: |-
    SELECT CURRENT_CATALOG AS datname, psui.schemaname || '.' || psui.indexrelname AS idxname,
           psui.schemaname || '.' || psui.relname AS relname, psui.indexrelid AS relid,
           c.relpages, c.reltuples, psui.idx_scan, psui.idx_tup_read, psui.idx_tup_fetch,
           psio.idx_blks_read, psio.idx_blks_hit
    FROM pg_stat_user_indexes psui
    JOIN pg_statio_user_indexes psio ON psio.indexrelid = psui.indexrelid
    JOIN pg_class c ON c.oid = psui.indexrelid
    WHERE psui.schemaname !~ '^pg_' AND psui.schemaname !~ '^_' AND psui.schemaname !~ '^timescaledb' AND psui.schemaname !~ '^citus' AND psui.schemaname !~ '^columnar'
      AND psui.schemaname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor')
    ORDER BY psui.idx_tup_read DESC LIMIT 512;
  ttl: 10
  timeout: 1
  min_version: 90100
  metrics:
    - datname: { usage: LABEL, description: "Database name of this index" }
    - idxname: { usage: LABEL, description: "Name of this index (full-qualified schema name)" }
    - relname: { usage: LABEL, description: "Name of the table for this index (full-qualified schema name)" }
    - relid: { usage: LABEL, description: "Relation oid of this index" }
    - relpages: { usage: GAUGE, description: "Size of the on-disk representation of this index in pages" }
    - reltuples: { usage: GAUGE, description: "Estimate relation tuples" }
    - idx_scan: { usage: COUNTER, description: "Number of index scans initiated on this index" }
    - idx_tup_read: { usage: COUNTER, description: "Number of index entries returned by scans on this index" }
    - idx_tup_fetch: { usage: COUNTER, description: "Number of live table rows fetched by simple index scans using this index" }
    - idx_blks_read: { usage: COUNTER, description: "Number of disk blocks read from this index" }
    - idx_blks_hit: { usage: COUNTER, description: "Number of buffer hits in this index" }

#==============================================================#
# 0720 pg_func
#==============================================================#
pg_func:
  desc: PostgreSQL function metrics
  query: SELECT CURRENT_CATALOG AS datname, schemaname || '.' || funcname AS funcname, sum(calls) AS calls, sum(total_time) AS total_time, sum(self_time) AS self_time FROM pg_stat_user_functions GROUP BY 2 ORDER BY 4 DESC LIMIT 128;
  ttl: 10
  min_version: 90100
  metrics:
    - datname: { usage: LABEL, description: "Name of belonged database" }
    - funcname: { usage: LABEL, description: "Name of this function, may have multiple override" }
    - calls: { usage: COUNTER, description: "Number of times this function has been called" }
    - total_time: { usage: COUNTER, scale: 1e-3, description: "Total time spent in this function and all other functions called by it, in seconds" }
    - self_time: { usage: COUNTER, scale: 1e-3, description: "Total time spent in this function itself, not including other functions called by it, in seconds" }

#==============================================================#
# 0740 pg_relkind
#==============================================================#
pg_relkind:
  name: pg_relkind
  desc: Postgres relation count by kind
  query: |
    SELECT CURRENT_CATALOG AS datname, relkind, count(*) AS count FROM pg_class GROUP BY relkind;
  ttl: 60
  timeout: 1
  min_version: 90100
  metrics:
    - datname: { usage: LABEL, description: "Database name" }
    - relkind: { usage: LABEL, description: "Relation kind (r,i,S,t,v,c,...)" }
    - count: { usage: GAUGE, description: "Number of relations" }

#==============================================================#
# 0810 pg_table_size
#==============================================================#
pg_table_size:
  desc: PostgreSQL table size metrics, quite slow
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || rel.relname AS relname,
           pg_total_relation_size(rel.oid) AS bytes,
           pg_relation_size(rel.oid) AS relsize,
           pg_indexes_size(rel.oid) AS indexsize,
           pg_total_relation_size(reltoastrelid) AS toastsize
    FROM pg_namespace nsp
    JOIN pg_class rel ON nsp.oid = rel.relnamespace
    WHERE nspname <> ALL(ARRAY['pg_catalog', 'information_schema']) AND rel.relkind = 'r'
    ORDER BY 3 DESC NULLS LAST LIMIT 256;
  ttl: 300
  timeout: 2
  min_version: 90100
  metrics:
    - datname: { usage: LABEL, description: "Database name of this table" }
    - relname: { usage: LABEL, description: "Schema qualified table name" }
    - bytes: { usage: GAUGE, default: 0, description: "Total bytes of this table (including toast, index, toast index)" }
    - relsize: { usage: GAUGE, default: 0, description: "Bytes of this table itself (main, vm, fsm)" }
    - indexsize: { usage: GAUGE, default: 0, description: "Bytes of all related indexes of this table" }
    - toastsize: { usage: GAUGE, default: 0, description: "Bytes of toast tables of this table" }

#==============================================================#
# 0820 pg_table_bloat
#==============================================================#
# pg_table_bloat requires the auxiliary view pg_table_bloat to work.
# Keep it disabled (skip: true) or create the view before enabling.
pg_table_bloat:
  name: pg_table_bloat
  desc: PostgreSQL table bloat metrics, require auxiliary view pg_table_bloat to work
  query: SELECT datname, nspname || '.' || relname AS relname, size, ratio FROM pg_table_bloat ORDER BY size DESC LIMIT 64;
  ttl: 300
  timeout: 2
  min_version: 90400
  skip: true
  metrics:
    - datname: { usage: LABEL, description: "Database name of this table" }
    - relname: { usage: LABEL, description: "Schema qualified name of this table" }
    - size: { usage: GAUGE, description: "Total bytes of this table" }
    - ratio: { usage: GAUGE, description: "Estimated bloat ratio of this table from 0 to 1" }

#==============================================================#
# 0830 pg_index_bloat
#==============================================================#
# pg_index_bloat requires the auxiliary view pg_index_bloat to work.
# Keep it disabled (skip: true) or create the view before enabling.
pg_index_bloat:
  name: pg_index_bloat
  desc: PostgreSQL index bloat metrics, require auxiliary view pg_index_bloat to work
  query: SELECT datname, nspname || '.' || idxname AS idxname, size, ratio FROM pg_index_bloat ORDER BY size DESC LIMIT 64;
  ttl: 300
  timeout: 2
  min_version: 90400
  skip: true
  metrics:
    - datname: { usage: LABEL, description: "Database name of this index" }
    - idxname: { usage: LABEL, description: "Schema qualified name of this index" }
    - size: { usage: GAUGE, description: "Total bytes of this index" }
    - ratio: { usage: GAUGE, description: "Estimated bloat ratio of this index from 0 to 1" }

#==============================================================#
# 0910 pgbouncer_list
#==============================================================#
# http://www.pgbouncer.org/usage.html#show-lists
pgbouncer_list:
  name: pgbouncer_list
  desc: Pgbouncer entry list
  query: SHOW LISTS;
  ttl: 10
  min_version: 10800
  fatal: true
  tags: [pgbouncer]
  metrics:
    - list: { usage: LABEL, description: "Pgbouncer internal list name" }
    - items: { usage: GAUGE, description: "Number of corresponding pgbouncer object" }

#==============================================================#
# 0920 pgbouncer_database
#==============================================================#
#
http://www.pgbouncer.org/usage.html#show-databases pgbouncer_database_124: name: pgbouncer_database desc: Pgbouncer database stats (since 1.24) query: SHOW DATABASES; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool_size: { usage: GAUGE ,rename: reserve_pool ,description: "Maximum number of additional connections for this database" } - server_lifetime: { usage: GAUGE ,description: "The maximum lifetime of a server connection for this database" } - pool_mode: { usage: DISCARD } - load_balance_hosts: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - max_client_connections: { usage: GAUGE ,description: "Maximum number of allowed client connections for this pgbouncer instance" } - current_client_connections: { usage: GAUGE ,description: "Current number of client connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_123: name: pgbouncer_database desc: Pgbouncer database stats 1.23 query: SHOW DATABASES; ttl: 10 min_version: 12300 max_version: 12400 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname 
,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - server_lifetime: { usage: GAUGE ,description: "The maximum lifetime of a server connection for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_116: name: pgbouncer_database desc: Pgbouncer database stats (1.16-1.22) query: SHOW DATABASES; ttl: 10 min_version: 11600 max_version: 12300 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number 
of additional connections for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_108: name: pgbouncer_database desc: Pgbouncer database stats (1.08-1.15) query: SHOW DATABASES; ttl: 10 min_version: 10800 max_version: 11600 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connects to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connects to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } #==============================================================# # 0930 pgbouncer_stat #==============================================================# # http://www.pgbouncer.org/usage.html#show-stats pgbouncer_stat_124: name: pgbouncer_stat desc: Pgbouncer stats per database (since 1.24) query: 
SHOW STATS; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_server_assignment_count: { usage: COUNTER ,description: "Total times a server was assigned to a client" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - total_client_parse_count: { usage: COUNTER ,description: "Total number of prepared statements created by clients" } - total_server_parse_count: { usage: COUNTER ,description: "Total number of prepared statements created on a server." } - total_bind_count: { usage: COUNTER ,description: "Total number of prepared statements readied for execution by clients and forwarded to postgres" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_server_assignment_count: { usage: GAUGE ,description: "Average number of times a server as assigned to a client per second in the last stat period." 
} - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } - avg_client_parse_count: { usage: GAUGE ,description: "Average number of prepared statements created by clients" } - avg_server_parse_count: { usage: GAUGE ,description: "Average number of prepared statements created on a server." } - avg_bind_count: { usage: GAUGE ,description: "Average number of prepared statements readied for execution by clients and forwarded to postgres" } pgbouncer_stat_123: name: pgbouncer_stat desc: Pgbouncer stats per database (1.23) query: SHOW STATS; ttl: 10 min_version: 12300 max_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_server_assignment_count: { usage: COUNTER ,description: "Total times a server was assigned to a client" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 
1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_server_assignment_count: { usage: GAUGE ,description: "Average number of times a server as assigned to a client per second in the last stat period." } - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } pgbouncer_stat_108: name: pgbouncer_stat desc: Pgbouncer stats per database (1.08 - 1.22) query: SHOW STATS; ttl: 10 min_version: 10800 max_version: 12300 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for 
a server, in seconds" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } #==============================================================# # 0940 pgbouncer_pool #==============================================================# # http://www.pgbouncer.org/usage.html#show-pools pgbouncer_pool_124: name: pgbouncer_pool desc: Pgbouncer pool stats (1.24+) query: SHOW POOLS; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_active_cancel_req: { usage: GAUGE, rename: active_cancel_clients, description: "Client connections that have forwarded query cancellations to the server and are waiting for the server response." } - cl_waiting_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." 
} - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_active_cancel: { usage: GAUGE, rename: active_cancel_servers, description: "Server connections that are currently forwarding a cancel request" } - sv_being_canceled: { usage: GAUGE, rename: cancel_servers, description: "cancel requests have completed that were sent to cancel a query on this server" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } - load_balance_hosts: { usage: LABEL, description: "The load_balance_hosts in use" } pgbouncer_pool_118: name: pgbouncer_pool desc: Pgbouncer pool stats (1.18-1.23) query: SHOW POOLS; ttl: 10 min_version: 11800 max_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_active_cancel_req: { usage: GAUGE, rename: active_cancel_clients, description: "Client connections that have forwarded query cancellations to the server and are waiting for the server response." } - cl_waiting_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." 
} - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_active_cancel: { usage: GAUGE, rename: active_cancel_servers, description: "Server connections that are currently forwarding a cancel request" } - sv_being_canceled: { usage: GAUGE, rename: cancel_servers, description: "cancel requests have completed that were sent to cancel a query on this server" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } pgbouncer_pool_116: name: pgbouncer_pool desc: Pgbouncer pool stats (1.16-1.17) query: SHOW POOLS; ttl: 10 min_version: 11600 max_version: 11800 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." } - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } pgbouncer_pool_108: name: pgbouncer_pool desc: Pgbouncer pool stats (1.08-1.15) query: SHOW POOLS; ttl: 10 min_version: 10800 max_version: 11600 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } #==============================================================# # 1800 pg_tsdb_hypertable #==============================================================# # this collector reqires timescaledb extension to be installed pg_tsdb_hypertable: name: pg_tsdb_hypertable desc: TimescaleDB hypertable overview query: |- SELECT current_database() AS datname, format('%I.%I', hypertable_schema, hypertable_name) AS relname, num_dimensions AS dimensions, num_chunks AS chunks, compression_enabled::BOOLEAN::int AS compressed, hypertable_size(format('"%I"."%I"', hypertable_schema, hypertable_name)::RegClass) AS bytes FROM timescaledb_information.hypertables; ttl: 60 timeout: 2 min_version: 90600 skip: true tags: [ "extension:timescaledb", "schema:timescaledb_information" ] metrics: - datname: { usage: LABEL ,description: "database name" } - relname: { usage: LABEL ,description: "Hypertable relation name" } - dimensions: { usage: GAUGE ,description: "Number of partitioning dimensions" } - chunks: { usage: GAUGE ,description: "Total chunks of this hypertable" } - compressed: { usage: GAUGE ,description: "1 if compression enabled" } - bytes: { usage: GAUGE ,description: "Total size of hypertable in bytes" } #==============================================================# # 1900 pg_citus_node #==============================================================# # https://docs.citusdata.com/en/latest/develop/api_metadata.html#worker-node-table pg_citus_node: name: pg_citus_node desc: Citus worker coordinator node inventory query: |- SELECT CONCAT(nodename, ':', nodeport) AS node, current_database() AS datname, nodeid AS id, groupid AS group, hasmetadata::BOOLEAN::INT AS has_meta, isactive::BOOLEAN::INT AS is_active, metadatasynced::BOOLEAN::INT AS meta_synced, shouldhaveshards::BOOLEAN::INT AS have_shards FROM pg_dist_node; ttl: 60 min_version: 90600 tags: [ "extension:citus" ] metrics: - node: { usage: LABEL ,description: 
"nodename:port of the PostgreSQL instance" } - datname: { usage: LABEL ,description: "database name" } - id: { usage: GAUGE ,description: "auto‑generated node identifier" } - group: { usage: GAUGE ,description: "replication group id (primary + secondaries)" } - has_meta: { usage: GAUGE ,description: "1 = internal use flag set" } - is_active: { usage: GAUGE ,description: "1 = node currently accepts shards" } - meta_synced: { usage: GAUGE ,description: "1 = metadata fully synced to node" } - have_shards: { usage: GAUGE ,description: "1 = rebalancer may place shards here" } #==============================================================# # 2000 heartbeat #==============================================================# # this is a example of application monitoring and predicate queries pg_heartbeat: name: pg_heartbeat desc: monitoring heartbeat in monitor.heartbeat table predicate_queries: - name: if heartbeat table exists predicate_query: | SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'monitor' AND table_name = 'heartbeat'); query: |- SELECT id AS cluster_name, extract(EPOCH FROM ts) AS ts, lsn, txid FROM monitor.heartbeat; ttl: 10 min_version: 90100 tags: [ "dbname:postgres", "schema:monitor" ] skip: true metrics: - cluster_name: { usage: LABEL ,description: "cluster_name param of this database cluster" } - ts: { usage: GAUGE ,description: "unix timestamp of the heartbeat" } - lsn: { usage: COUNTER ,description: "lsn of the heartbeat" } - txid: { usage: GAUGE ,description: "txid of the heartbeat" } ================================================ FILE: main.go ================================================ /***********************************************************************\ Copyright © 2018-2026 Ruohang Feng Contributors: https://github.com/pgsty/pg_exporter/graphs/contributors Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.

\***********************************************************************/

package main

import "pg_exporter/exporter"

// main delegates to the exporter package entry point.
func main() {
	exporter.Run()
}


================================================
FILE: monitor/initdb.sh
================================================
#!/bin/bash
set -euo pipefail
#==============================================================#
# File   : initdb.sh
# Mtime  : 2020-09-02
# Desc   : initdb.sh
# Path   : /pg/bin/initdb.sh
# Depend : CentOS 7
# Author : Vonng(rh@vonng.com)
# Note   : Run this as dbsu (postgres)
#==============================================================#

# FIX: removed stray trailing ')' that was appended to the program name
# (was: PROG_NAME="$(basename $0))" -> value ended with a literal ')')
PROG_NAME="$(basename $0)"
PROG_DIR="$(cd $(dirname $0) && pwd)"

#---------------------------------------------------------------------------
# append a timestamped, host-tagged message to the initdb log file
function log() {
    printf "[$(date "+%Y-%m-%d %H:%M:%S")][$HOSTNAME][INITDB] $*\n" >> /pg/log/initdb.log
}
#---------------------------------------------------------------------------

#----------------------------------------------------------------------------
# template variables (rendered by the deployment templating engine)
#----------------------------------------------------------------------------
PG_DBSU='{{ pg_dbsu }}'
PG_REPLICATION_USERNAME='{{ pg_replication_username }}'
PG_REPLICATION_PASSWORD='{{ pg_replication_password }}'
PG_MONITOR_USERNAME='{{ pg_monitor_username }}'
PG_MONITOR_PASSWORD='{{ pg_monitor_password }}'
PG_DEFAULT_USERNAME='{{ pg_default_username }}'
PG_DEFAULT_PASSWORD='{{ pg_default_password }}'
PG_DEFAULT_DATABASE='{{ pg_default_database }}'
PG_DEFAULT_SCHEMA='{{ pg_default_schema }}'
PG_DEFAULT_EXTENSIONS='{{ pg_default_extensions }}'
#----------------------------------------------------------------------------
# system users
#----------------------------------------------------------------------------
log "initdb: create system users: ${PG_REPLICATION_USERNAME} , ${PG_MONITOR_USERNAME}"

psql -AXtwq postgres <<- EOF
	-- dbsu
	CREATE USER ${PG_DBSU};
	ALTER USER "${PG_DBSU}" SUPERUSER;
	-- FIX: this comment previously targeted "${PG_REPLICATION_USERNAME}" by mistake;
	-- the 'system default sa' comment belongs to the dbsu role created just above
	COMMENT ON ROLE "${PG_DBSU}" IS 'system default sa';

	-- replication user (also used as rewind user)
	CREATE USER ${PG_REPLICATION_USERNAME};
	COMMENT ON ROLE "${PG_REPLICATION_USERNAME}" IS 'system user for replication';
	ALTER USER ${PG_REPLICATION_USERNAME} REPLICATION PASSWORD '${PG_REPLICATION_PASSWORD}';
	GRANT EXECUTE ON function pg_catalog.pg_ls_dir(text, boolean, boolean) TO "${PG_REPLICATION_USERNAME}";
	GRANT EXECUTE ON function pg_catalog.pg_stat_file(text, boolean) TO "${PG_REPLICATION_USERNAME}";
	GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text) TO "${PG_REPLICATION_USERNAME}";
	GRANT EXECUTE ON function pg_catalog.pg_read_binary_file(text, bigint, bigint, boolean) TO "${PG_REPLICATION_USERNAME}";

	-- system user: dbuser_monitor
	CREATE USER "${PG_MONITOR_USERNAME}";
	COMMENT ON ROLE "${PG_MONITOR_USERNAME}" IS 'system user for monitor';
	ALTER USER "${PG_MONITOR_USERNAME}" LOGIN NOSUPERUSER INHERIT NOCREATEROLE NOCREATEDB NOREPLICATION;
	ALTER USER "${PG_MONITOR_USERNAME}" PASSWORD '${PG_MONITOR_PASSWORD}' CONNECTION LIMIT 8;
	ALTER USER "${PG_MONITOR_USERNAME}" SET search_path = public,monitor;
	GRANT pg_monitor TO "${PG_MONITOR_USERNAME}";
EOF

#----------------------------------------------------------------------------
# default roles
#----------------------------------------------------------------------------
log "initdb: create default roles: dbrole_admin, dbrole_readwrite, dbrole_readonly"

psql -AXtwq postgres <<- EOF
	-- default read-only role: personal account, analysis & etl purpose
	CREATE ROLE dbrole_readonly; -- analysis , personal account, etc...
COMMENT ON ROLE dbrole_readonly IS 'read-only role, for personal, analysis, etl purpose'; ALTER ROLE dbrole_readonly NOLOGIN NOSUPERUSER INHERIT NOCREATEROLE NOCREATEDB NOREPLICATION NOBYPASSRLS; -- default read-write role: common production account CREATE ROLE dbrole_readwrite; -- common read-write, production account ALTER ROLE dbrole_readwrite NOLOGIN NOSUPERUSER INHERIT NOCREATEROLE NOCREATEDB NOREPLICATION NOBYPASSRLS; COMMENT ON ROLE dbrole_readwrite IS 'read-write role, common production account'; -- default admin role: create database,role,table, partition, index, etc... CREATE ROLE dbrole_admin; -- admin role, create db, role, table, partition, index, etc... COMMENT ON ROLE dbrole_admin IS 'admin role, create db, role, table, partition, index, etc...'; ALTER ROLE dbrole_admin NOLOGIN NOSUPERUSER INHERIT CREATEROLE CREATEDB NOREPLICATION BYPASSRLS; -- grant GRANT dbrole_readonly TO dbrole_readwrite; GRANT dbrole_readonly TO "${PG_MONITOR_USERNAME}"; -- since monitor user can only access from local or meta nodes GRANT dbrole_readwrite TO dbrole_admin; EOF #---------------------------------------------------------------------------- # default user (business account) #---------------------------------------------------------------------------- if [ ${PG_DEFAULT_USERNAME} != 'postgres' ]; then log "initdb: create default business user: ${PG_DEFAULT_USERNAME}" psql -AXtwq postgres <<- EOF -- default user CREATE USER "${PG_DEFAULT_USERNAME}"; COMMENT ON ROLE "${PG_DEFAULT_USERNAME}" IS 'default business user'; ALTER USER "${PG_DEFAULT_USERNAME}" LOGIN NOSUPERUSER INHERIT NOCREATEROLE NOCREATEDB NOREPLICATION BYPASSRLS; ALTER USER "${PG_DEFAULT_USERNAME}" PASSWORD '${PG_DEFAULT_PASSWORD}'; GRANT dbrole_readwrite TO "${PG_DEFAULT_USERNAME}"; EOF fi #---------------------------------------------------------------------------- # create pgpass #---------------------------------------------------------------------------- log "initdb: create pgpass file" echo "" >> 
~/.pgpass function add_pgpass(){ local username=$1 local password=$2 if grep -q "${username}": ~/.pgpass; then sed -i "/${username}/d" ~/.pgpass fi echo '*:*:*'"${username}:${password}" >> ~/.pgpass chmod 0600 ~/.pgpass } add_pgpass ${PG_REPLICATION_USERNAME} ${PG_REPLICATION_PASSWORD} add_pgpass ${PG_MONITOR_USERNAME} ${PG_MONITOR_PASSWORD} if [[ ${PG_DEFAULT_USERNAME} != 'postgres' ]]; then add_pgpass ${PG_DEFAULT_USERNAME} ${PG_DEFAULT_PASSWORD} fi #---------------------------------------------------------------------------- # default privilege #---------------------------------------------------------------------------- log "initdb: alter default privilege: postgres template1" for database in postgres template1 do psql -AXtwq ${database} <<- EOF ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT USAGE ON SCHEMAS TO dbrole_readonly; ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT SELECT ON TABLES TO dbrole_readonly; ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT SELECT ON SEQUENCES TO dbrole_readonly; ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT EXECUTE ON FUNCTIONS TO dbrole_readonly; ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT INSERT, UPDATE, DELETE ON TABLES TO dbrole_readwrite; ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT USAGE, UPDATE ON SEQUENCES TO dbrole_readwrite; ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT TRUNCATE, REFERENCES, TRIGGER ON TABLES TO dbrole_admin; ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT CREATE ON SCHEMAS TO dbrole_admin; ALTER DEFAULT PRIVILEGES FOR ROLE dbrole_admin GRANT USAGE ON TYPES TO dbrole_admin; ALTER DEFAULT PRIVILEGES FOR ROLE postgres GRANT USAGE ON SCHEMAS TO dbrole_readonly; ALTER DEFAULT PRIVILEGES FOR ROLE postgres GRANT SELECT ON TABLES TO dbrole_readonly; ALTER DEFAULT PRIVILEGES FOR ROLE postgres GRANT SELECT ON SEQUENCES TO dbrole_readonly; ALTER DEFAULT PRIVILEGES FOR ROLE postgres GRANT EXECUTE ON FUNCTIONS TO dbrole_readonly; ALTER DEFAULT PRIVILEGES FOR 
ROLE postgres GRANT INSERT, UPDATE, DELETE ON TABLES TO dbrole_readwrite; ALTER DEFAULT PRIVILEGES FOR ROLE postgres GRANT USAGE, UPDATE ON SEQUENCES TO dbrole_readwrite; ALTER DEFAULT PRIVILEGES FOR ROLE postgres GRANT TRUNCATE, REFERENCES, TRIGGER ON TABLES TO dbrole_admin; ALTER DEFAULT PRIVILEGES FOR ROLE postgres GRANT CREATE ON SCHEMAS TO dbrole_admin; ALTER DEFAULT PRIVILEGES FOR ROLE postgres GRANT USAGE ON TYPES TO dbrole_admin; EOF done #---------------------------------------------------------------------------- # template database #---------------------------------------------------------------------------- log "initdb: init database template: postgres, template1" for database in postgres template1; do psql -AXtwq ${database} <<-EOF CREATE SCHEMA IF NOT EXISTS monitor; SET search_path = public, monitor; -- create stats extensions within monitor schema CREATE EXTENSION IF NOT EXISTS pg_stat_statements WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pgstattuple WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pg_qualstats WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pg_buffercache WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pageinspect WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pg_prewarm WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pg_visibility WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pg_freespacemap WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pg_repack WITH SCHEMA monitor; CREATE EXTENSION IF NOT EXISTS pg_stat_kcache WITH SCHEMA monitor; -- CREATE EXTENSION IF NOT EXISTS pg_cron WITH SCHEMA monitor; -- Table bloat estimate CREATE OR REPLACE VIEW monitor.pg_table_bloat AS SELECT CURRENT_CATALOG AS datname, nspname, relname , bs * tblpages AS size, CASE WHEN tblpages - est_tblpages_ff > 0 THEN (tblpages - est_tblpages_ff)/tblpages::FLOAT ELSE 0 END AS ratio FROM ( SELECT ceil( reltuples / ( (bs-page_hdr)*fillfactor/(tpl_size*100) ) ) + ceil( toasttuples / 4 ) AS est_tblpages_ff, tblpages, 
fillfactor, bs, tblid, nspname, relname, is_na FROM ( SELECT ( 4 + tpl_hdr_size + tpl_data_size + (2 * ma) - CASE WHEN tpl_hdr_size % ma = 0 THEN ma ELSE tpl_hdr_size % ma END - CASE WHEN ceil(tpl_data_size)::INT % ma = 0 THEN ma ELSE ceil(tpl_data_size)::INT % ma END ) AS tpl_size, (heappages + toastpages) AS tblpages, heappages, toastpages, reltuples, toasttuples, bs, page_hdr, tblid, nspname, relname, fillfactor, is_na FROM ( SELECT tbl.oid AS tblid, ns.nspname , tbl.relname, tbl.reltuples, tbl.relpages AS heappages, coalesce(toast.relpages, 0) AS toastpages, coalesce(toast.reltuples, 0) AS toasttuples, coalesce(substring(array_to_string(tbl.reloptions, ' ') FROM 'fillfactor=([0-9]+)')::smallint, 100) AS fillfactor, current_setting('block_size')::numeric AS bs, CASE WHEN version()~'mingw32' OR version()~'64-bit|x86_64|ppc64|ia64|amd64' THEN 8 ELSE 4 END AS ma, 24 AS page_hdr, 23 + CASE WHEN MAX(coalesce(s.null_frac,0)) > 0 THEN ( 7 + count(s.attname) ) / 8 ELSE 0::int END + CASE WHEN bool_or(att.attname = 'oid' and att.attnum < 0) THEN 4 ELSE 0 END AS tpl_hdr_size, sum( (1-coalesce(s.null_frac, 0)) * coalesce(s.avg_width, 0) ) AS tpl_data_size, bool_or(att.atttypid = 'pg_catalog.name'::regtype) OR sum(CASE WHEN att.attnum > 0 THEN 1 ELSE 0 END) <> count(s.attname) AS is_na FROM pg_attribute AS att JOIN pg_class AS tbl ON att.attrelid = tbl.oid JOIN pg_namespace AS ns ON ns.oid = tbl.relnamespace LEFT JOIN pg_stats AS s ON s.schemaname=ns.nspname AND s.tablename = tbl.relname AND s.inherited=false AND s.attname=att.attname LEFT JOIN pg_class AS toast ON tbl.reltoastrelid = toast.oid WHERE NOT att.attisdropped AND tbl.relkind = 'r' AND nspname NOT IN ('pg_catalog','information_schema') GROUP BY 1,2,3,4,5,6,7,8,9,10 ) AS s ) AS s2 ) AS s3 WHERE NOT is_na; -- Index bloat estimate CREATE OR REPLACE VIEW monitor.pg_index_bloat AS SELECT CURRENT_CATALOG AS datname, nspname, idxname AS relname, relpages::BIGINT * bs AS size, COALESCE((relpages - ( reltuples * (6 + ma - 
(CASE WHEN index_tuple_hdr % ma = 0 THEN ma ELSE index_tuple_hdr % ma END) + nulldatawidth + ma - (CASE WHEN nulldatawidth % ma = 0 THEN ma ELSE nulldatawidth % ma END)) / (bs - pagehdr)::FLOAT + 1 )), 0) / relpages::FLOAT AS ratio FROM ( SELECT nspname, idxname, reltuples, relpages, current_setting('block_size')::INTEGER AS bs, (CASE WHEN version() ~ 'mingw32' OR version() ~ '64-bit|x86_64|ppc64|ia64|amd64' THEN 8 ELSE 4 END) AS ma, 24 AS pagehdr, (CASE WHEN max(COALESCE(pg_stats.null_frac, 0)) = 0 THEN 2 ELSE 6 END) AS index_tuple_hdr, sum((1.0 - COALESCE(pg_stats.null_frac, 0.0)) * COALESCE(pg_stats.avg_width, 1024))::INTEGER AS nulldatawidth FROM pg_attribute JOIN ( SELECT pg_namespace.nspname, ic.relname AS idxname, ic.reltuples, ic.relpages, pg_index.indrelid, pg_index.indexrelid, tc.relname AS tablename, regexp_split_to_table(pg_index.indkey::TEXT, ' ') :: INTEGER AS attnum, pg_index.indexrelid AS index_oid FROM pg_index JOIN pg_class ic ON pg_index.indexrelid = ic.oid JOIN pg_class tc ON pg_index.indrelid = tc.oid JOIN pg_namespace ON pg_namespace.oid = ic.relnamespace JOIN pg_am ON ic.relam = pg_am.oid WHERE pg_am.amname = 'btree' AND ic.relpages > 0 AND nspname NOT IN ('pg_catalog', 'information_schema') ) ind_atts ON pg_attribute.attrelid = ind_atts.indexrelid AND pg_attribute.attnum = ind_atts.attnum JOIN pg_stats ON pg_stats.schemaname = ind_atts.nspname AND ((pg_stats.tablename = ind_atts.tablename AND pg_stats.attname = pg_get_indexdef(pg_attribute.attrelid, pg_attribute.attnum, TRUE)) OR (pg_stats.tablename = ind_atts.idxname AND pg_stats.attname = pg_attribute.attname)) WHERE pg_attribute.attnum > 0 GROUP BY 1, 2, 3, 4, 5, 6 ) est LIMIT 512; -- index bloat overview CREATE OR REPLACE VIEW monitor.pg_table_bloat_human AS SELECT nspname || '.' 
|| relname AS name, pg_size_pretty(size) AS size, pg_size_pretty((size * ratio)::BIGINT) AS wasted, round(100 * ratio::NUMERIC, 2) as ratio FROM monitor.pg_table_bloat ORDER BY wasted DESC NULLS LAST; CREATE OR REPLACE VIEW monitor.pg_index_bloat_human AS SELECT nspname || '.' || relname AS name, pg_size_pretty(size) AS size, pg_size_pretty((size * ratio)::BIGINT) AS wasted, round(100 * ratio::NUMERIC, 2) as ratio FROM monitor.pg_index_bloat; -- pg session DROP VIEW IF EXISTS monitor.pg_session; CREATE OR REPLACE VIEW monitor.pg_session AS SELECT coalesce(datname, 'all') AS datname, numbackends, active, idle, ixact, max_duration, max_tx_duration, max_conn_duration FROM ( SELECT datname, count(*) AS numbackends, count(*) FILTER ( WHERE state = 'active' ) AS active, count(*) FILTER ( WHERE state = 'idle' ) AS idle, count(*) FILTER ( WHERE state = 'idle in transaction' OR state = 'idle in transaction (aborted)' ) AS ixact, max(extract(epoch from now() - state_change)) FILTER ( WHERE state = 'active' ) AS max_duration, max(extract(epoch from now() - xact_start)) AS max_tx_duration, max(extract(epoch from now() - backend_start)) AS max_conn_duration FROM pg_stat_activity WHERE backend_type = 'client backend' AND pid <> pg_backend_pid() GROUP BY ROLLUP (1) ORDER BY 1 NULLS FIRST ) t; COMMENT ON VIEW monitor.pg_session IS 'postgres session stats'; DROP VIEW IF EXISTS monitor.pg_kill; CREATE OR REPLACE VIEW monitor.pg_kill AS SELECT pid, pg_terminate_backend(pid) AS killed, datname AS dat, usename AS usr, application_name AS app, client_addr AS addr, state, extract(epoch from now() - state_change) AS query_time, extract(epoch from now() - xact_start) AS xact_time, extract(epoch from now() - backend_start) AS conn_time, substring(query, 1, 40) AS query FROM pg_stat_activity WHERE backend_type = 'client backend' AND pid <> pg_backend_pid(); COMMENT ON VIEW monitor.pg_kill IS 'kill all backend session'; -- quick cancel view DROP VIEW IF EXISTS monitor.pg_cancel; CREATE OR 
REPLACE VIEW monitor.pg_cancel AS
	SELECT pid, pg_cancel_backend(pid) AS cancel, datname AS dat, usename AS usr, application_name AS app,
	client_addr AS addr, state, extract(epoch from now() - state_change) AS query_time,
	extract(epoch from now() - xact_start) AS xact_time, extract(epoch from now() - backend_start) AS conn_time,
	substring(query, 1, 40)
	FROM pg_stat_activity WHERE state = 'active' AND backend_type = 'client backend' and pid <> pg_backend_pid();
	COMMENT ON VIEW monitor.pg_cancel IS 'cancel backend queries';

	-- seq scan
	DROP VIEW IF EXISTS monitor.pg_seq_scan;
	-- NOTE(review): dead_ratio divides two integers (n_live_tup / (n_live_tup + n_dead_tup)),
	-- which truncates toward zero in PostgreSQL -- confirm whether a float ratio was intended
	CREATE OR REPLACE VIEW monitor.pg_seq_scan AS
	SELECT schemaname AS nspname, relname, seq_scan, seq_tup_read, seq_tup_read / seq_scan AS seq_tup_avg, idx_scan,
	n_live_tup + n_dead_tup AS tuples, n_live_tup / (n_live_tup + n_dead_tup) AS dead_ratio
	FROM pg_stat_user_tables WHERE seq_scan > 0 and (n_live_tup + n_dead_tup) > 0 ORDER BY seq_tup_read DESC LIMIT 50;
	COMMENT ON VIEW monitor.pg_seq_scan IS 'table that have seq scan';
EOF

	psql -AXtwq ${database} <<-'EOF'
	CREATE OR REPLACE FUNCTION monitor.pg_shmem() RETURNS SETOF pg_shmem_allocations
	AS $$ SELECT * FROM pg_shmem_allocations;$$ LANGUAGE SQL SECURITY DEFINER;
	EOF
done

#----------------------------------------------------------------------------
# default database
#----------------------------------------------------------------------------
# FIX: quote all three templated expansions below -- an empty value would make
# the bare `[ ... != ... ]` test a syntax error and abort the script under `set -e`
# (the PG_DEFAULT_EXTENSIONS test compares against '' and is hit exactly when empty)
if [ "${PG_DEFAULT_DATABASE}" != 'postgres' ]; then
	log "initdb: create default database: ${PG_DEFAULT_DATABASE}"
	psql -AXtwq postgres <<- EOF
		CREATE DATABASE "${PG_DEFAULT_DATABASE}";
	EOF

	if [ "${PG_DEFAULT_SCHEMA}" != 'public' ]; then
		log "initdb: create default schema on ${PG_DEFAULT_DATABASE} : ${PG_DEFAULT_SCHEMA}"
		psql -AXtwq ${PG_DEFAULT_DATABASE} <<- EOF
			CREATE SCHEMA IF NOT EXISTS ${PG_DEFAULT_SCHEMA};
			ALTER USER "${PG_DBSU}" SET search_path = ${PG_DEFAULT_SCHEMA},public,monitor;
		EOF
	fi

	if [ "${PG_DEFAULT_EXTENSIONS}" != '' ]; then
		log "initdb: create default extensions on ${PG_DEFAULT_DATABASE} :
${PG_DEFAULT_EXTENSIONS}" for ext in ${PG_DEFAULT_EXTENSIONS//,/ } do log "initdb: create extension ${ext};" psql -AXtwq ${PG_DEFAULT_DATABASE} <<- EOF CREATE EXTENSION IF NOT EXISTS ${ext} WITH SCHEMA public; EOF done fi fi #---------------------------------------------------------------------------- # customize commands #---------------------------------------------------------------------------- log "initdb: completed!" ================================================ FILE: monitor/pgrds-instance.json ================================================ { "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "datasource", "uid": "grafana" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "target": { "limit": 100, "matchAny": false, "tags": [], "type": "dashboard" }, "type": "dashboard" } ] }, "author": "Ruohang Feng (rh@vonng.com)", "description": "PostgreSQL Monitoring for Remote RDS instances (with limited metrics)", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "license": "AGPLv3 @ https://pigsty.io/docs/about/license", "links": [ { "asDropdown": true, "icon": "external link", "includeVars": true, "keepTime": true, "tags": [ "Pigsty", "PGSQL", "Overview" ], "targetBlank": false, "title": "Overview", "tooltip": "", "type": "dashboards", "url": "" }, { "asDropdown": true, "icon": "external link", "includeVars": true, "keepTime": true, "tags": [ "Pigsty", "PGSQL", "Cluster" ], "targetBlank": false, "title": "Cluster", "tooltip": "", "type": "dashboards", "url": "" }, { "asDropdown": true, "icon": "external link", "includeVars": true, "keepTime": true, "tags": [ "Pigsty", "PGSQL", "Instance" ], "targetBlank": false, "title": "Instance", "tooltip": "", "type": "dashboards", "url": "" }, { "asDropdown": true, "icon": "external link", "includeVars": true, "keepTime": true, "tags": [ "Pigsty", "PGSQL", "Database" ], "targetBlank": false, "title": "Database", "tooltip": "", 
"type": "dashboards", "url": "" } ], "liveNow": false, "panels": [ { "collapsed": false, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 86, "panels": [], "title": "Overview", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 2.5, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#6986a3", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Cluster" }, "properties": [ { "id": "displayName", "value": "${cls}" }, { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Cluster" } }, "type": "value" } ] }, { "id": "links", "value": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&${__url_time_range}" } ] } ] }, { "matcher": { "id": "byName", "options": "Instance" }, "properties": [ { "id": "displayName", "value": "${ins}" }, { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Instance" } }, "type": "value" } ] }, { "id": "links", "value": [ { "title": "PGCAT Instance : ${ins}", "url": "/d/pgcat-instance?var-dsn=${ins}.${datname}" } ] } ] }, { "matcher": { "id": "byName", "options": "IP" }, "properties": [ { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Node IP" } }, "type": "value" } ] }, { "id": "displayName", "value": "${ip}" }, { "id": "links", "value": [ { "title": "Node Instance : ${ip}", "url": "/d/node-instance?var-id=${ip}&${__url_time_range}" } ] } ] }, { "matcher": { "id": "byName", "options": "Name" }, "properties": [ { "id": "displayName", "value": "${node}" }, { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Hostname" } }, "type": "value" } ] }, { "id": "links", "value": [ { "title": "Node Instance : ${node}", "url": "/d/node-instance?var-id=${node}&${__url_time_range}" } ] } ] } ] 
}, "gridPos": { "h": 4, "w": 6, "x": 0, "y": 1 }, "id": 189, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 20, "valueSize": 16 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "1", "hide": false, "instant": true, "interval": "", "legendFormat": "Instance", "queryType": "measurements", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "1", "hide": false, "instant": true, "interval": "", "legendFormat": "Cluster", "queryType": "measurements", "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "All instances among pgsql cluster ${cls}\n\nInstance: Goto PGSQL Instance\n\nIP: Goto PGSQL Node\n\nStatus: Goto PGSQL Service\n\nLoad: max(cpu,postgres,pgbouncer)\n\nSpace: Disk space usage max(all device)\n\nProxy: session number, Goto Haproxy Admin Page", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "center", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "max": 1.2, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#e3e3e3e0", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Instance" }, "properties": [ { "id": "links", "value": [ { "title": "PGRDS Instance ${__data.fields.Instance}", "url": "/d/pgrds-instance?var-ins=${__data.fields.Instance}&${__url_time_range}" } ] }, { "id": "custom.width" } ] }, { "matcher": { "id": "byName", "options": "Host" }, "properties": [ { "id": "links", "value": [] } ] }, { "matcher": { "id": 
"byName", "options": "Role" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "mappings", "value": [ { "options": { "0": { "color": "#3e668f", "index": 0, "text": "primary" }, "1": { "color": "#346f36cc", "index": 1, "text": "replica" } }, "type": "value" } ] }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "color-background" } }, { "id": "links", "value": [ { "title": "PGSQL Service for ${cls}-${__data.fields.Role}", "url": "/d/pgsql-service?var-svc=${cls}-${__data.fields.Role}&${__url_time_range}" } ] } ] }, { "matcher": { "id": "byName", "options": "Load" }, "properties": [ { "id": "custom.cellOptions", "value": { "type": "auto" } }, { "id": "color", "value": { "mode": "thresholds" } }, { "id": "unit", "value": "short" }, { "id": "custom.width", "value": 120 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 0.1 }, { "color": "#fcdb72", "value": 0.3 }, { "color": "#f5a673", "value": 0.5 }, { "color": "red", "value": 0.7 }, { "color": "#b783af", "value": 0.9 }, { "color": "text", "value": 1 } ] } }, { "id": "decimals", "value": 2 }, { "id": "mappings", "value": [ { "options": { "-1": { "color": "transparent", "index": 0, "text": "N/A" } }, "type": "value" } ] } ] }, { "matcher": { "id": "byName", "options": "Cluster" }, "properties": [ { "id": "links", "value": [ { "title": "PGSQL Cluster for ${__data.fields.Cluster}", "url": "/d/pgsql-cluster?var-cls=${__data.fields.Cluster}&${__url_time_range}" } ] } ] }, { "matcher": { "id": "byName", "options": "TPS" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#3e668f", "value": 32 }, { "color": "#f5a673", "value": 32000 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] }, { "matcher": { "id": "byName", "options": "DB Conn" }, "properties": [ { 
"id": "custom.width", "value": 80 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 4 }, { "color": "#EAB839", "value": 20 }, { "color": "#EF843C", "value": 40 }, { "color": "#E24D42", "value": 80 }, { "color": "#b783af", "value": 100 }, { "color": "text", "value": 400 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] }, { "matcher": { "id": "byName", "options": "RT" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "unit", "value": "s" }, { "id": "mappings", "value": [ { "options": { "match": "nan", "result": { "index": 1, "text": "-" } }, "type": "special" } ] }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "#fcdb72", "value": 1 }, { "color": "#f5a673", "value": 2 }, { "color": "red", "value": 4 }, { "color": "#b783af", "value": 8 }, { "color": "text", "value": 16 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] }, { "matcher": { "id": "byName", "options": "Session" }, "properties": [ { "id": "custom.width", "value": 72 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 1 }, { "color": "#fcdb72", "value": 1000 }, { "color": "#f5a673", "value": 2000 }, { "color": "red", "value": 3000 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] }, { "matcher": { "id": "byName", "options": "Queue" }, "properties": [ { "id": "custom.width", "value": 60 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "red", "value": 1 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] }, { "matcher": { "id": "byName", "options": "HAProxy" }, "properties": [ { "id": "links", "value": [ { "title": "HAProxy Admin Page : ${__data.fields.Instance}", "url": 
"http://h.pigsty/${__data.fields.Instance}" } ] } ] }, { "matcher": { "id": "byName", "options": "Up" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "mappings", "value": [ { "options": { "0": { "color": "#cc4637d9", "index": 0, "text": "Dead" }, "1": { "color": "#346f36cc", "index": 1, "text": "Alive" } }, "type": "value" } ] }, { "id": "custom.cellOptions", "value": { "type": "color-background" } } ] }, { "matcher": { "id": "byName", "options": "UPTime" }, "properties": [ { "id": "unit", "value": "s" }, { "id": "decimals", "value": 1 }, { "id": "custom.width", "value": 100 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "red", "value": null }, { "color": "#f5a673", "value": 300 }, { "color": "#eab839", "value": 3600 }, { "color": "#346f36cc", "value": 86400 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] } ] }, "gridPos": { "h": 6, "w": 18, "x": 6, "y": 1 }, "id": 191, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&${__url_time_range}" } ], "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": false, "displayName": "Instance" } ] }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "min by (ins,ip) (pg_up{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pg:ins:active_time_rate1m{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "sum by (ins) (pg:ins:xact_total_rate1m{cls=\"$cls\"})", 
"format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "min by (ins,ip) (pg_in_recovery{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "max by (ins) (pg:ins:num_backends{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "max by (ins) (pgbouncer:ins:xact_rt_1m{cls=\"$cls\"})", "format": "table", "hide": true, "instant": true, "interval": "", "legendFormat": "", "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pg_uptime{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "G" } ], "transformations": [ { "id": "seriesToColumns", "options": { "byField": "ins" } }, { "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 10": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 6": true, "Time 8": true, "Time 9": true, "Value #A": false, "Value #B": false, "Value #H": false, "__name__": true, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 5": true, "__name__ 7": true, "cls": true, "cls 1": true, "cls 2": true, "cls 3": true, "cls 4": true, "cls 5": true, "cls 6": true, "cls 7": true, "instance": false, "instance 1": true, "instance 2": false, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "ip 2": true, "ip 3": true, "ip 4": true, "ip 5": true, "ip 6": true, "ip 7": true, "job": true, "job 1": true, "job 2": true, "job 3": true, "job 4": 
true, "job 5": true, "job 6": true, "job 7": true }, "indexByName": { "Time 1": 7, "Time 2": 8, "Time 3": 9, "Time 4": 10, "Time 5": 11, "Value #A": 2, "Value #B": 4, "Value #C": 5, "Value #D": 3, "Value #E": 6, "ins": 0, "ip 1": 1, "ip 2": 12 }, "renameByName": { "Time 4": "", "Value #A": "Up", "Value #B": "Load", "Value #C": "TPS", "Value #D": "Role", "Value #E": "Session", "Value #F": "RT", "Value #G": "UPTime", "Value #H": "LB", "Value #I": "QPS", "Value #J": "LB Clients", "Value #K": "Lag", "cls 1": "", "cls 2": "", "ins": "Instance", "instance": "HAProxy", "instance 2": "", "ip": "IP", "ip 1": "Host" } } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 2.5, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#6986a3", "value": null } ] } }, "overrides": [] }, "gridPos": { "h": 2, "w": 6, "x": 0, "y": 5 }, "id": 190, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "/.*/", "values": false }, "showPercentChange": false, "text": { "valueSize": 12 }, "textMode": "value", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (address) (label_join(pg_meta_info{ins=\"$ins\"}, \"address\", \":\", \"ip\", \"listen_port\"))", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "__auto", "queryType": "measurements", "refId": "A" } ], "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "Value": true, "__name__": true, "cls": true, "cluster_id": true, "ins": true, "instance": true, "ip": false, "job": true, "ver_num": true, "version": true, "wal_level": true }, 
"indexByName": { "Time": 0, "Value": 9, "__name__": 1, "cls": 2, "cluster_id": 3, "ins": 4, "instance": 12, "ip": 10, "job": 5, "listen_port": 11, "ver_num": 8, "version": 6, "wal_level": 7 }, "renameByName": { "cls": "Cluster", "cluster_id": "Cluster ID", "ins": "Leader", "ip": "Host", "listen_port": "Port", "version": "Version", "wal_level": "WAL Level" } } }, { "id": "filterFieldsByName", "options": {} } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 2.5, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#6986a3", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Leader" }, "properties": [ { "id": "links", "value": [ { "title": "PGRDS Leader : ${primary}", "url": "/d/pgrds-instance?var-ins=${primary}&${__url_time_range}" } ] } ] }, { "matcher": { "id": "byName", "options": "Kernel" }, "properties": [ { "id": "mappings", "value": [ { "options": { "pattern": ".*wiltondb.*", "result": { "color": "#d82024e3", "index": 0, "text": "Babelfish" } }, "type": "regex" }, { "options": { "pattern": ".*PolarDB.*", "result": { "color": "#f25009", "index": 1, "text": "PolarDB" } }, "type": "regex" }, { "options": { "pattern": ".*IvorySQL.*", "result": { "color": "#ee9832", "index": 2, "text": "IvorySQL" } }, "type": "regex" }, { "options": { "pattern": "PostgreSQL.*", "result": { "color": "#3e668f", "index": 3, "text": "PostgreSQL" } }, "type": "regex" } ] } ] } ] }, "gridPos": { "h": 6, "w": 6, "x": 0, "y": 7 }, "id": 209, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "/.*/", "values": false }, "showPercentChange": false, "text": { "titleSize": 14, "valueSize": 16 }, "textMode": "value_and_name", "wideLayout": true }, 
"pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_meta_info{ins=\"$ins\"}", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "__auto", "queryType": "measurements", "refId": "A" } ], "transformations": [ { "id": "organize", "options": { "excludeByName": { "Time": true, "Value": true, "__name__": true, "cls": true, "cluster_id": false, "cluster_name": true, "conf_path": false, "data_dir": false, "extensions": true, "hba_path": false, "ins": true, "instance": true, "ip": true, "job": true, "listen_port": true, "primary_conninfo": true, "ver_num": false, "ver_str": false, "version": true }, "includeByName": {}, "indexByName": { "Time": 3, "Value": 10, "__name__": 4, "cls": 5, "cluster_id": 9, "cluster_name": 14, "conf_path": 15, "data_dir": 17, "extensions": 18, "hba_path": 16, "ins": 6, "instance": 13, "ip": 11, "job": 7, "listen_port": 12, "ver_num": 1, "ver_str": 0, "version": 2, "wal_level": 8 }, "renameByName": { "cls": "Cluster", "cluster_id": "Cluster ID", "conf_path": "Config", "data_dir": "Data", "hba_path": "HBA", "ins": "Leader", "ip": "Host", "listen_port": "Port", "ver_num": "Version", "ver_str": "Kernel", "version": "Version", "wal_level": "WAL Level" } } } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGSQL Replication for ${cls}", "url": "/d/pgsql-replication?var-cls=${cls}&${__url_time_range}" } ], "mappings": [], "max": 2.5, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byFrameRefID", "options": "A" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "#f5a673", "index": 0, "text": "Primary" }, "1": { "color": "#346f36cc", "index": 1, "text": 
"Replica" } }, "type": "value" } ] }, { "id": "displayName", "value": "Role" } ] }, { "matcher": { "id": "byFrameRefID", "options": "B" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "#3e668f", "index": 0, "text": "No" }, "1": { "color": "#346f36cc", "index": 1, "text": "Yes" } }, "type": "value" } ] } ] }, { "matcher": { "id": "byFrameRefID", "options": "C" }, "properties": [ { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 1 } ] } } ] }, { "matcher": { "id": "byFrameRefID", "options": "D" }, "properties": [ { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] } }, { "id": "mappings", "value": [ { "options": { "0": { "color": "#f5a673", "index": 0, "text": "Leading" }, "1": { "color": "#3e668f", "index": 1, "text": "Follow" } }, "type": "value" } ] }, { "id": "links", "value": [ { "title": "Node Instance : ${__field.labels.sender_host}", "url": "/d/node-instance?var-id=${__field.labels.sender_host}&${__url_time_range}" } ] } ] }, { "matcher": { "id": "byFrameRefID", "options": "E" }, "properties": [ { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 1 } ] } } ] }, { "matcher": { "id": "byFrameRefID", "options": "F" }, "properties": [ { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 1 } ] } } ] } ] }, "gridPos": { "h": 6, "w": 3, "x": 6, "y": 7 }, "id": 194, "links": [ { "title": "PGSQL Replication : ${cls}", "url": "/d/pgsql-replication?var-cls=${cls}&${__url_time_range}" } ], "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": 
false, "text": { "titleSize": 10, "valueSize": 10 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "pg_in_recovery{ins=\"$ins\"}", "instant": true, "interval": "", "legendFormat": "recovery", "queryType": "measurements", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "(pg_recv_init_lsn{ins=\"$ins\"} > bool 0) or on() vector(0)", "hide": false, "instant": true, "interval": "", "legendFormat": "{{ sender_host }} ", "queryType": "measurements", "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "count(pg_repl_lsn{ins=\"$ins\"}) or on() vector(0)", "hide": false, "instant": true, "interval": "", "legendFormat": "Downstream", "queryType": "measurements", "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_sync_standby_enabled{ins=\"$ins\"}", "hide": false, "instant": true, "interval": "", "legendFormat": "Sync Commit", "queryType": "measurements", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "count by (ins)(pg_pubrel_count{ins=\"$ins\"}) > bool 0 or on() vector(0)", "hide": false, "instant": true, "interval": "", "legendFormat": "Publication", "queryType": "measurements", "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "count by (ins)(pg_sub_id{ins=\"$ins\"}) or on() vector(0)", "hide": false, "instant": true, "interval": "", "legendFormat": "Subscription", "queryType": "measurements", "refId": "F" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": 
{ "mode": "thresholds" }, "mappings": [ { "options": { "match": "null+nan", "result": { "color": "transparent", "index": 1 } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "WAL Paused" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "#346f36cc", "index": 0, "text": "Running" }, "1": { "color": "#f5a673", "index": 1, "text": "Paused" } }, "type": "value" }, { "options": { "match": "null+nan", "result": { "color": "transparent", "index": 2 } }, "type": "special" } ] } ] }, { "matcher": { "id": "byName", "options": "WAL" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "#346f36cc", "index": 0, "text": "Proceed" }, "1": { "color": "#f5a673", "index": 1, "text": "Paused" } }, "type": "value" }, { "options": { "match": "null+nan", "result": { "color": "transparent", "index": 2 } }, "type": "special" } ] } ] }, { "matcher": { "id": "byName", "options": "PG Uptime" }, "properties": [ { "id": "unit", "value": "s" }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#cc4637d9", "value": null }, { "color": "#f5a673", "value": 300 }, { "color": "#fcdb72", "value": 3600 }, { "color": "#346f36cc", "value": 86400 } ] } } ] }, { "matcher": { "id": "byName", "options": "Aliveness" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "#cc4637d9", "index": 0, "text": "Dead" }, "1": { "color": "#346f36cc", "index": 1, "text": "Alive" } }, "type": "value" } ] } ] }, { "matcher": { "id": "byName", "options": "Last Reload" }, "properties": [ { "id": "unit", "value": "s" } ] } ] }, "gridPos": { "h": 6, "w": 3, "x": 9, "y": 7 }, "id": 193, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": 
"", "values": false }, "showPercentChange": false, "text": { "titleSize": 10, "valueSize": 10 }, "textMode": "auto", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_up{ins=\"$ins\"}", "format": "table", "instant": true, "legendFormat": "__auto", "range": false, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_uptime{ins=\"$ins\"}", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_version{ins=\"$ins\"}", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_conf_reload_time{ins=\"$ins\"}", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:timeline{ins=\"$ins\"}", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_is_wal_replay_paused{ins=\"$ins\"}", "format": "table", "hide": false, "instant": true, "legendFormat": "__auto", "range": false, "refId": "F" } ], "transformations": [ { "id": "joinByField", "options": { "byField": "ins", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 10": true, "Time 11": true, "Time 12": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 
6": true, "Time 7": true, "Time 8": true, "Time 9": true, "Value #C": false, "Value #E": false, "Value #H": true, "Value #I": true, "Value #K": true, "__name__ 1": false, "cls": true, "ins": true, "scope": false }, "indexByName": { "Time 1": 6, "Time 2": 7, "Time 3": 8, "Time 4": 9, "Time 5": 10, "Time 6": 11, "Value #A": 0, "Value #B": 2, "Value #C": 5, "Value #D": 3, "Value #E": 4, "Value #F": 1, "__name__ 1": 12, "__name__ 2": 18, "__name__ 3": 23, "__name__ 4": 28, "__name__ 5": 33, "__name__ 6": 38, "cls 1": 14, "cls 2": 19, "cls 3": 24, "cls 4": 29, "cls 5": 34, "cls 6": 39, "ins": 13, "instance 1": 15, "instance 2": 20, "instance 3": 25, "instance 4": 30, "instance 5": 35, "instance 6": 40, "ip 1": 16, "ip 2": 21, "ip 3": 26, "ip 4": 31, "ip 5": 36, "ip 6": 41, "job 1": 17, "job 2": 22, "job 3": 27, "job 4": 32, "job 5": 37, "job 6": 42 }, "renameByName": { "Value #A": "Aliveness", "Value #B": "PG Uptime", "Value #C": "Version", "Value #D": "Last Reload", "Value #E": "Timeline", "Value #F": "WAL Paused", "Value #G": "", "Value #H": "", "Value #I": "", "Value #J": "", "Value #K": "", "Value #L": "Up", "cls": "Cluster", "ins": "Instance", "scope": "Shard" } } } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Database of this cluster ${cls}\n\nLoad: active_time / backends (PG14 only)\n\nQuery RT : max query rt of this database\n\nClient: Pgbouncer Active Clients\n\nServer: Pgbouncer Active Servers\n\nBackend: Postgres num backends\n\nT = is Template?\n\nD = Disabled in Pgbouncer ?\n\nP = Paused in Pgbouncer ?", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "center", "cellOptions": { "mode": "gradient", "type": "color-background" }, "inspect": false }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#e3e3e3e0", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Database" }, "properties": [ { "id": "links", 
"value": [ { "title": "PGSQL Database : ${ins}.${__data.fields.Database}", "url": "/d/pgsql-database?var-ins=${ins}&var-datname=${__data.fields.Database}&${__url_time_range}" } ] }, { "id": "mappings", "value": [ { "options": { "postgres": { "color": "#808080", "index": 2 }, "template0": { "color": "#c0c0c0e0", "index": 0 }, "template1": { "color": "#a8a8a8e0", "index": 1 } }, "type": "value" } ] }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } }, { "id": "filterable" } ] }, { "matcher": { "id": "byName", "options": "Size" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "unit", "value": "decbytes" }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 1000000000 }, { "color": "#fcdb72", "value": 1000000000000 }, { "color": "#f5a673", "value": 10000000000000 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "color", "value": { "mode": "thresholds" } }, { "id": "decimals", "value": 0 } ] }, { "matcher": { "id": "byName", "options": "Age" }, "properties": [ { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 60000000 }, { "color": "#EAB839", "value": 200000000 }, { "color": "#f5a673", "value": 600000000 }, { "color": "red", "value": 1000000000 }, { "color": "#b783af", "value": 2000000000 }, { "color": "text", "value": 2147483647 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "unit", "value": "percentunit" }, { "id": "custom.width", "value": 60 }, { "id": "color", "value": { "mode": "thresholds" } }, { "id": "decimals", "value": 1 } ] }, { "matcher": { "id": "byName", "options": "TPS" }, "properties": [ { "id": "thresholds", "value": { "mode": "absolute",
"steps": [ { "color": "#346f36cc", "value": null }, { "color": "#f5a673", "value": 30000 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "custom.width", "value": 70 }, { "id": "color", "value": { "mode": "thresholds" } }, { "id": "decimals", "value": 1 } ] }, { "matcher": { "id": "byName", "options": "T" }, "properties": [ { "id": "custom.width", "value": 40 }, { "id": "mappings", "value": [ { "options": { "0": { "color": "#346f36cc", "index": 1, "text": "F" }, "1": { "color": "#f5a673", "index": 2, "text": "T" } }, "type": "value" }, { "options": { "match": "null+nan", "result": { "color": "#e3e3e3e0", "index": 0, "text": "N" } }, "type": "special" } ] }, { "id": "custom.cellOptions", "value": { "mode": "gradient", "type": "color-background" } } ] }, { "matcher": { "id": "byName", "options": "Backend" }, "properties": [ { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 4 }, { "color": "#f5a673", "value": 1000 } ] } }, { "id": "color", "value": { "mode": "thresholds" } }, { "id": "custom.width", "value": 75 }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] }, { "matcher": { "id": "byName", "options": "ConnLimit" }, "properties": [ { "id": "custom.width", "value": 85 }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "color" }, { "id": "mappings", "value": [ { "options": { "-1": { "color": "#3e668f", "index": 1, "text": "NO LIMIT" } }, "type": "value" }, { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "N/A" } }, "type": "special" }, { "options": { "from": 0, "result": { "color": "#f5a673", "index": 2 }, "to": 99999999 }, "type": "range" } ] } ] }, { "matcher": { "id": "byRegexp", "options": "Conn%" }, "properties": [ { "id": "unit", "value": "percentunit" }, { "id": "custom.width", "value": 80 }, 
{ "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 0.1 }, { "color": "#fcdb72", "value": 0.3 }, { "color": "#f5a673", "value": 0.5 }, { "color": "red", "value": 0.7 }, { "color": "#b783af", "value": 0.9 }, { "color": "text", "value": 0.99 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "decimals", "value": 1 } ] }, { "matcher": { "id": "byName", "options": "Client" }, "properties": [ { "id": "custom.width", "value": 60 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "#fcdb72", "value": 1000 } ] } }, { "id": "custom.cellOptions", "value": { "type": "color-text" } } ] }, { "matcher": { "id": "byName", "options": "Server" }, "properties": [ { "id": "custom.width", "value": 60 }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 4 }, { "color": "#fcdb72", "value": 1000 } ] } } ] }, { "matcher": { "id": "byName", "options": "RT" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "#fcdb72", "value": 1 }, { "color": "#f5a673", "value": 2 }, { "color": "red", "value": 4 }, { "color": "#b783af", "value": 8 }, { "color": "text", "value": 16 } ] } }, { "id": "mappings", "value": [ { "options": { "match": "null+nan", "result": { "index": 0, "text": "-" } }, "type": "special" } ] }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "unit", "value": "s" } ] }, { "matcher": { "id": "byName", "options": "Load" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "unit", "value": "short" }, { "id": "decimals", "value": 
1 }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 0.1 } ] } }, { "id": "mappings", "value": [ { "options": { "match": "null+nan", "result": { "index": 0, "text": "-" } }, "type": "special" } ] } ] }, { "matcher": { "id": "byName", "options": "PoolSize" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "custom.cellOptions", "value": { "type": "auto" } } ] }, { "matcher": { "id": "byName", "options": "OID" }, "properties": [ { "id": "custom.width", "value": 80 }, { "id": "links", "value": [ { "title": "PGCAT Database : ${ins}.${__data.fields.Database}", "url": "/d/pgcat-database?var-dsn=${ins}.${__data.fields.Database}" } ] }, { "id": "mappings", "value": [ { "options": { "from": 0, "result": { "color": "#bfbfbf4d", "index": 0 }, "to": 10 }, "type": "range" } ] }, { "id": "custom.cellOptions", "value": { "type": "color-text" } }, { "id": "thresholds", "value": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null } ] } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 7 }, "id": 192, "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": true, "displayName": "OID" } ] }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "sum by (datname) (pg_size_bytes{cls=\"$cls\", datname!~\"wal|log\"})", "format": "table", "instant": true, "interval": "", "legendFormat": "Age", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "max by (datname) (pg:db:age{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "Commit", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "max by (datname) 
(pg_db_is_template{cls=\"$cls\", ins=\"$primary\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "isTemplate", "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg_activity_count{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "Backend", "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "sum by (datname) (pg:db:xact_commit_rate1m{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "TPS", "refId": "E" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "max by (datname) (pg_db_conn_limit{ins=\"$primary\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "Conn Limit", "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "max by (datname) (pg:db:conn_usage{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "PG Conn Usage", "refId": "H" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg:db:active_time_rate1m{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "Load", "refId": "L" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (datname) (pg_db_datid{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "OID", "refId": "O" } ], "transformations": [ { "id": "seriesToColumns", "options": { "byField": "datname" } }, { "id": "organize", "options": { "excludeByName": { "Time 1": true, "Time 10": true, "Time 11": true, "Time 12": true, "Time 
13": true, "Time 14": true, "Time 15": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 5": true, "Time 6": true, "Time 7": true, "Time 8": true, "Time 9": true, "Value #G": false, "Value #H": false, "host": true }, "indexByName": { "Time 1": 10, "Time 2": 11, "Time 3": 12, "Time 4": 13, "Time 5": 14, "Time 6": 15, "Time 7": 16, "Time 8": 17, "Time 9": 18, "Value #A": 3, "Value #B": 4, "Value #C": 9, "Value #D": 5, "Value #E": 6, "Value #F": 8, "Value #H": 7, "Value #L": 2, "Value #O": 1, "datname": 0 }, "renameByName": { "Time 1": "", "Time 7": "", "Value #A": "Size", "Value #B": "Age", "Value #C": "T", "Value #D": "Backend", "Value #E": "TPS", "Value #F": "ConnLimit", "Value #G": "RT", "Value #H": "Conn%", "Value #I": "D", "Value #J": "P", "Value #K": "PoolSize", "Value #L": "Load", "Value #M": "Client", "Value #N": "Server", "Value #O": "OID", "datname": "Database", "host": "Target", "real_datname": "Datname" } } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGSQL Instance : TPS by Instance", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=94&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 32 }, { "color": "#f5a673", "value": 32000 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 0, "y": 13 }, "id": 46, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", 
"wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:xact_commit_rate1m{ins=\"$ins\"}", "interval": "", "legendFormat": "Commit", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGSQL Instance : Rollback", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=169&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "#fcdb72", "value": 1 }, { "color": "#f5a673", "value": 4 }, { "color": "red", "value": 8 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 2, "y": 13 }, "id": 57, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:xact_rollback_rate1m{ins=\"$ins\"}", "interval": "", "legendFormat": "Rollback", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGSQL Instance : RT", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=109&${__url_time_range}" } ], "mappings": [ { "options": { "match": 
"null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "#fcdb72", "value": 1 }, { "color": "#f5a673", "value": 2 }, { "color": "red", "value": 4 }, { "color": "#b783af", "value": 8 }, { "color": "text", "value": 16 } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 4, "y": 13 }, "id": 59, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:active_time_rate1m{ins=\"$ins\"} / pg:ins:xact_total_rate1m{ins=\"$ins\"}", "interval": "", "legendFormat": "RT", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGRDS Instance : Conn Usage", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=90&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 0.1 }, { "color": "#fcdb72", "value": 0.3 }, { "color": "#f5a673", "value": 0.5 }, { "color": "red", "value": 0.7 }, { "color": "#b783af", "value": 0.9 }, { "color": "text", "value": 1 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 6, "y": 13 }, "id": 106, "options": { "colorMode": "background", 
"graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max(pg:db:conn_usage{cls=\"$cls\"})", "interval": "", "legendFormat": "Conn%", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGRDS Instance : Backend by Instance", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=96&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 4 }, { "color": "#f5a673", "value": 2000 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 8, "y": 13 }, "id": 64, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "exemplar": false, "expr": "sum by (cls) (pg_db_numbackends{cls=\"$cls\"})", "interval": "", "legendFormat": "Backend", "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, 
"links": [ { "title": "PGRDS Instance : Session by State", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=149&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 4 }, { "color": "#f5a673", "value": 2000 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 10, "y": 13 }, "id": 107, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:active_backends{ins=\"$ins\"}", "interval": "", "legendFormat": "Active Conn", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGRDS Instance : Idle in Transaction Conn", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=159&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "#f5a673", "value": 1 }, { "color": "red", "value": 8 } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 12, "y": 13 }, "id": 203, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": 
"auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max(pg:ins:ixact_backends{ins=\"$ins\"})", "interval": "", "legendFormat": "iXact Conn", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 2, "links": [ { "title": "PGSQL Instance : Row Fetched", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=127&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 32 }, { "color": "#f5a673", "value": 32000 } ] }, "unit": "rowsps" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 14, "y": 13 }, "id": 204, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 12 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:tup_fetched_rate1m{ins=\"$ins\"}", "interval": "", "legendFormat": "Row Fetch", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", 
"fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 2, "links": [ { "title": "PGRDS Instance : Row Changed", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=128&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 32 }, { "color": "#f5a673", "value": 32000 } ] }, "unit": "rowsps" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 16, "y": 13 }, "id": 205, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 12 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:tup_modified_rate1m{ins=\"$ins\"}", "interval": "", "legendFormat": "Row Change", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGSQL Instance : Blocks Read Bandwidth", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=182&${__url_time_range}" } ], "mappings": [], "min": 0, "noValue": "\u2205", "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 10000000 }, { "color": "#fcdb72", "value": 100000000 }, { "color": "#f5a673", "value": 500000000 }, { "color": "red", "value": 1000000000 }, { "color": "#b783af", "value": 2000000000 }, { "color": "text", "value": 4000000000 } ] }, "unit": "Bps" }, 
"overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 18, "y": 13 }, "id": 206, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 14 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum(pg:db:blks_read_1m{ins=\"$ins\"}) * 4096", "interval": "", "legendFormat": "Blks Read", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 2, "links": [ { "title": "PGRDS Instance : Age of Databases", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=208&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "max": 1, "min": 0, "thresholds": { "mode": "percentage", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "#fcdb72", "value": 2 }, { "color": "#f5a673", "value": 10 }, { "color": "red", "value": 30 }, { "color": "#b783af", "value": 90 }, { "color": "text", "value": 100 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 20, "y": 13 }, "id": 63, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", 
"uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:age{ins=\"$ins\"} / 2147483647", "interval": "", "legendFormat": "Age", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "title": "PGRDS Instance : Database Size", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=176&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 1000000000 }, { "color": "#fcdb72", "value": 1000000000000 }, { "color": "#f5a673", "value": 10000000000000 } ] }, "unit": "decbytes" }, "overrides": [] }, "gridPos": { "h": 3, "w": 2, "x": 22, "y": 13 }, "id": 202, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 12, "valueSize": 18 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum(pg_size_bytes{ins=\"$ins\"})", "interval": "", "legendFormat": "Size", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Load, 1 = 100% one cpu core usage.\nCalculated by session active time (PG14+)", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "decimals": 2, "links": [ { "title": "PGRDS Instance : Load", "url": "/d/pgrds-instance?var-ins=${ins}&viewPanel=147&${__url_time_range}" } ], "mappings": [], 
"thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 0.1 } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 0, "y": 16 }, "id": 199, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": {}, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:cls:active_time_rate1m{cls=\"$cls\"}", "interval": "", "legendFormat": "Load", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Load for Instances / Databases in last 5 minutes\nload 1 = 100% usage of 1 core cpu time", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "graph": false, "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineStyle": { "fill": "solid" }, "lineWidth": 1, "pointSize": 3, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGRDS Instance : ${__field.labels.ins}", "url": "/d/pgrds-instance?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 5, "w": 9, "x": 3, 
"y": 16 }, "id": 196, "options": { "legend": { "calcs": [], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:db:active_time_rate1m{ins=\"$ins\", datname!~'template\\\\d'}", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "links": [ { "targetBlank": true, "title": "PGSQL Alert for Cluster ${cls}", "url": "http://a.pigsty/#/alerts?filter=%7Bcls%3D%22${cls}%22%7D" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "#fcdb72", "value": 1 }, { "color": "#f5a673", "value": 2 }, { "color": "red", "value": 3 }, { "color": "#b783af", "value": 4 } ] } }, "overrides": [] }, "gridPos": { "h": 5, "w": 3, "x": 12, "y": 16 }, "id": 198, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "auto", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": {}, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.1.4", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "count(ALERTS{ins=\"$ins\", alertstate=\"firing\"}) or on() vector(0)", "interval": "", "legendFormat": "Alert", "range": true, "refId": "A" } ], "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Alerting event graph. Each stripe represent an alerting Event. 
Gray transparent stripes are pending alerts.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "hidden", "axisSoftMax": 1.2, "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 98, "gradientMode": "none", "hideFrom": { "graph": false, "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "links": [ { "targetBlank": true, "title": "AlertInfo ${__field.labels.alertname} on Instance ${ins}", "url": "http://a.pigsty/#/alerts?filter=%7Bins%3D%22${ins}%22%2C%20alertname%3D%22${__field.labels.alertname}%22%7D" }, { "targetBlank": true, "title": "Silence ${__field.labels.alertname} on Cluster ${ins}", "url": "http://a.pigsty/#/silences/new?filter=%7Bins%3D%22${ins}%22%2C%20alertname%3D%22${__field.labels.alertname}%22%7D" }, { "title": "PGSQL Cluster for ${__field.labels.cls} : Alerts", "url": "/d/pgsql-cluster?var-cls=${__field.labels.cls}&${__url_time_range}" }, { "title": "PGSQL Node for ${__field.labels.ins}", "url": "/d/node-instance?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byFrameRefID", "options": "A" }, "properties": [ { "id": "custom.fillOpacity", "value": 66 }, { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Firing\ud83d\udd25" } }, "type": "value" } ] } ] }, { "matcher": { "id": "byFrameRefID", "options": "B" }, "properties": [ { "id": "custom.fillOpacity", "value": 25 }, { "id": "color", "value": { "fixedColor": 
"rgba(128, 128, 128, 0.5)", "mode": "fixed" } }, { "id": "mappings", "value": [ { "options": { "1": { "color": "gray", "index": 0, "text": "Pend\u23f0" } }, "type": "value" } ] } ] }, { "matcher": { "id": "byType", "options": "time" }, "properties": [ { "id": "custom.axisPlacement", "value": "auto" } ] }, { "matcher": { "id": "byName", "options": "Alert" }, "properties": [ { "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 9, "x": 15, "y": 16 }, "id": 197, "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "ALERTS{ins=\"$ins\", alertstate=\"firing\"}", "interval": "", "legendFormat": "[{{ severity }}\ud83d\udd25] {{alertname}}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "ALERTS{ins=\"$ins\", alertstate=\"pending\"} ", "hide": false, "interval": "", "legendFormat": "[{{ severity }}\u23f0] {{alertname}}", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "exemplar": false, "expr": "0", "hide": false, "interval": "", "legendFormat": "Alert", "refId": "C" } ], "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 21 }, "id": 133, "panels": [], "title": "Activity", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Rate in last 5min of:\n\n\npg_stat_database. xact_commit: Number of transactions in this database that have been committed\n\npg_stat_database. 
xact_rollback: Number of transactions in this database that have been rolled back\n\non this postgres instance.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "hue", "hideFrom": { "graph": false, "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "short" }, "overrides": [ { "matcher": { "id": "byFrameRefID", "options": "A" }, "properties": [ { "id": "color", "value": { "fixedColor": "red", "mode": "fixed" } }, { "id": "custom.axisPlacement", "value": "auto" } ] }, { "matcher": { "id": "byName", "options": "Commits" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Rollbacks" }, "properties": [ { "id": "color", "value": { "fixedColor": "#cc4637d9", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 22 }, "id": 169, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=169&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.0-beta2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (ins) (pg:ins:xact_commit_rate5m{ins=\"$ins\"})", "hide": false, "interval": 
"", "intervalFactor": 1, "legendFormat": "Commits", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (ins) (pg:ins:xact_rollback_rate5m{ins=\"$ins\"})", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "Rollbacks", "range": true, "refId": "A" } ], "title": "Transaction Commits / Rollbacks (rate5m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "pg_stat_database: active_time / (xact_commit + xact_rollback)\n\n\nTime spent executing SQL statements in this instance / Number of transactions in this instance in last 1 minute\n\n\nOnly available on PG14+\n", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "opacity", "hideFrom": { "graph": false, "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "s" }, "overrides": [ { "matcher": { "id": "byName", "options": "Instance" }, "properties": [ { "id": "custom.fillOpacity", "value": 21 }, { "id": "custom.lineWidth", "value": 0 }, { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } }, { "id": "custom.gradientMode", "value": "hue" }, { "id": "links", "value": [ { "title": "PGSQL Instance : ${__field.labels.ins}", "url": "/d/pgsql-instance?var-ins=${__field.labels.ins}\ufeff&\ufeff${__url_time_range}" } ] } ] }, { "matcher": { 
"id": "byFrameRefID", "options": "B" }, "properties": [ { "id": "links", "value": [ { "title": "PGSQL Database : ${__field.labels.ins}.${__field.labels.datname}", "url": "/d/pgsql-database?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}&viewPanel=137" } ] } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 22 }, "id": 109, "links": [ { "title": "PGRDS Cluster Transaction RT: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=109&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.0-beta2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:active_time_rate1m{ins=\"$ins\"} / pg:ins:xact_total_rate1m{ins=\"$ins\"}", "interval": "", "intervalFactor": 1, "legendFormat": "Instance", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:db:active_time_rate1m{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'} / pg:db:xact_total_rate1m{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'}", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{ datname }}", "range": true, "refId": "B" } ], "title": "Transaction RT (1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Transaction Committed rate in last 1 minute, group on database level", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "hue", "hideFrom": { "graph": false, "legend": false, "tooltip": false, "viz": false 
}, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Xacts for ${__field.labels.ins}", "url": "/d/pgsql-xacts?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "short" }, "overrides": [ { "matcher": { "id": "byName", "options": "Instance" }, "properties": [ { "id": "custom.lineWidth", "value": 0 }, { "id": "custom.fillOpacity", "value": 20 } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 28 }, "id": 94, "links": [ { "title": "PGRDS Cluster TPS: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=94&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.0-beta2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:db:xact_commit_rate1m{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'}", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{ datname }}", "range": true, "refId": "B" } ], "title": "TPS by Database (rate1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Postgres Load Index in last 1 minutes\n\n\nrate(pg_stat_database.active_time[5m])\n\n\nload 1 = 100% usage of one cpu core.\n\n\nCalculated by session active time (PG14+) @ database level.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, 
"drawStyle": "line", "fillOpacity": 40, "gradientMode": "none", "hideFrom": { "graph": false, "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineStyle": { "fill": "solid" }, "lineWidth": 1, "pointSize": 3, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGRDS Instance : ${__field.labels.ins}", "url": "/d/pgrds-instance?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 28 }, "id": 147, "links": [ { "title": "PGRDS Cluster Load: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=147&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:db:active_time_rate1m{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'}", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Postgres Load (rate1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Tuples fetched from databases r/s in last minute, group by database on this instance. 
", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 3, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGRDS Instance : ${__field.labels.ins}", "url": "/d/pgrds-instance?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "cps" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 34 }, "id": 127, "links": [ { "title": "PGRDS Cluster Row Fetched: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=127&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (rate(pg_db_tup_fetched{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'}[1m]))", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Row Fetched (rate1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Tuples fetched, inserted, updated, delete r/s in last minute, on this instance", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": 
"text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 50, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 0, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "cps" }, "overrides": [ { "matcher": { "id": "byName", "options": "DELETED" }, "properties": [ { "id": "color", "value": { "fixedColor": "#cc4637d9", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "INSERTED" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "UPDATED" }, "properties": [ { "id": "color", "value": { "fixedColor": "#fcdb72", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 34 }, "id": 128, "links": [ { "title": "PGRDS Cluster Row Modified: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=128&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum(rate(pg_db_tup_inserted{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'}[1m]))", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "INSERTED", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum(rate(pg_db_tup_updated{ins=\"$ins\", 
datname!~'rdsadmin|polardb_admin|template\\\\d'}[1m]))", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "UPDATED", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum(rate(pg_db_tup_deleted{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'}[1m]))", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "DELETED", "range": true, "refId": "D" } ], "title": "Row Modified (rate1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Table lock categorized in to 3 major categories:\nRead locks, Write Locks, and Exclusive locks.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 100, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": true, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "short" }, "overrides": [ { "matcher": { "id": "byName", "options": "Read" }, "properties": [ { "id": "color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Write" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Exclusive" }, "properties": [ { "id": "color", "value": { "fixedColor": "#cc4637d9", "mode": "fixed" } } ] } ] }, 
"gridPos": { "h": 6, "w": 12, "x": 0, "y": 40 }, "id": 180, "links": [ { "title": "PGRDS Cluster Locks by Category: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=180&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Name", "sortDesc": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "8.0.5", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:xlock_count{ins=\"$ins\"}", "interval": "", "legendFormat": "Exclusive", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:wlock_count{ins=\"$ins\"}", "interval": "", "legendFormat": "Write", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:rlock_count{ins=\"$ins\"}", "interval": "", "legendFormat": "Read", "range": true, "refId": "C" } ], "title": "Locks by Category", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Table Locks in 8 different mode, on this instance", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "bars", "fillOpacity": 55, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": true, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { 
"color": "green", "value": null } ] }, "unit": "short" }, "overrides": [ { "matcher": { "id": "byName", "options": "AccessShareLock" }, "properties": [ { "id": "color", "value": { "fixedColor": "#56A64B", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "RowShareLock" }, "properties": [ { "id": "color", "value": { "fixedColor": "#8AB8FF", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "RowExclusiveLock" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3274D9", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "ShareUpdateExclusiveLock" }, "properties": [ { "id": "color", "value": { "fixedColor": "#F2CC0C", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "ShareLock" }, "properties": [ { "id": "color", "value": { "fixedColor": "#CC9D00", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "ShareRowExclusiveLock" }, "properties": [ { "id": "color", "value": { "fixedColor": "#FF780A", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "ExclusiveLock" }, "properties": [ { "id": "color", "value": { "fixedColor": "#E02F44", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "AccessExclusiveLock" }, "properties": [ { "id": "color", "value": { "fixedColor": "#A352CC", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 12, "w": 12, "x": 12, "y": 40 }, "id": 154, "links": [ { "title": "PGRDS Cluster Locks : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=154&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "max", "mean", "last" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Max", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.5", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (mode) (pg_lock_count{ins=\"$ins\"})", "interval": "", "legendFormat": "{{ mode }}", 
"range": true, "refId": "A" } ], "title": "Locks", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Very long transactions, max elapsed time (y) group by session state. On this postgres instance", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 46 }, "id": 152, "links": [ { "title": "PGRDS Cluster SAGE: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=152&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (state) (pg_activity_max_tx_duration{ins=\"$ins\", state!~\"(idle.*|disabled)\"})", "interval": "", "legendFormat": "{{ state }}", "range": true, "refId": "A" } ], "title": "SAGE", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 52 }, "id": 132, "panels": [], "title": "Session", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Ratio of database session / max allowed session per database. 
the limit is min of max_connections and database's connection limit.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "area" } }, "decimals": 1, "links": [ { "title": "PGSQL Session : ${__field.labels.ins}", "url": "/d/pgsql-session?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "green", "value": 0.1 }, { "color": "yellow", "value": 0.3 }, { "color": "orange", "value": 0.5 }, { "color": "red", "value": 0.7 }, { "color": "#b783af", "value": 0.9 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 53 }, "id": 90, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=90&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:db:conn_usage{ins=\"$ins\", datname!~'template\\\\d'}", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Connection Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Idle in Transaction backend number of all databases 
in this instance.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 2, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 71, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "area" } }, "links": [ { "title": "PGSQL Session : ${__field.labels.ins}", "url": "/d/pgsql-session?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 53 }, "id": 159, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=159&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg_activity_count{ins=\"$ins\", state=~\"idle in.*\"})", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Idle in Transaction Backends", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Number of database backend process / session by database.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", 
"barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Session : ${__field.labels.ins}", "url": "/d/pgsql-session?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 59 }, "id": 96, "links": [ { "title": "PGRDS Cluster Backends: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=96&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg_activity_count{ins=\"$ins\"})", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Backends", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Newly established session on each database.\nIncrease number of last 1 minute", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 71, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": 
"linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Session : ${__field.labels.ins}", "url": "/d/pgsql-session?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "super-light-green", "value": null } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 59 }, "id": 153, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=153&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (increase(pg_db_sessions{ins=\"$ins\"}[1m]))", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "New Sessions (increase1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Backend number group by state in this instance.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGRDS Instance : ${__field.labels.ins}", "url": 
"/d/pgrds-instance?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "idle" }, "properties": [ { "id": "color", "value": { "fixedColor": "#808080", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 65 }, "id": 149, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=149&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (state) (pg_activity_count{ins=\"$ins\"})", "interval": "", "legendFormat": "{{ state }}", "range": true, "refId": "A" } ], "title": "Backends by State", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Backend number group by backend type in this instance.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "client backend" 
}, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "autovacuum launcher" }, "properties": [ { "id": "color", "value": { "fixedColor": "#b783af", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "background writer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "checkpointer" }, "properties": [ { "id": "color", "value": { "fixedColor": "super-light-orange", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "logical replication launcher" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "walwriter" }, "properties": [ { "id": "color", "value": { "fixedColor": "#cc4637d9", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "archiver" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "TimescaleDB Background Worker Launcher" }, "properties": [ { "id": "color", "value": { "fixedColor": "#fcdb72", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "NULL" }, "properties": [ { "id": "color", "value": { "fixedColor": "#808080", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 65 }, "id": 150, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=150&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (type) (pg_backend_count{ins=\"$ins\"})", "interval": "", "legendFormat": "{{ type }}", "range": true, "refId": "A" } ], 
"title": "Backends by Type", "transformations": [ { "id": "renameByRegex", "options": { "regex": "(Value)", "renamePattern": "NULL" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Max connection duration group by session time.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "s" }, "overrides": [ { "matcher": { "id": "byName", "options": "idle" }, "properties": [ { "id": "color", "value": { "fixedColor": "#808080", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 71 }, "id": 151, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=151&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (state) (pg_activity_max_conn_duration{ins=\"$ins\", state!~\"idle|disabled\"})", "interval": "", "legendFormat": "{{ state }}", "range": true, "refId": "A" } ], "title": "Max Conn Lifespan", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Backend number 
group by wait event type in this instance.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 71, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "client backend" }, "properties": [ { "id": "color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "autovacuum launcher" }, "properties": [ { "id": "color", "value": { "fixedColor": "#b783af", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "background writer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "checkpointer" }, "properties": [ { "id": "color", "value": { "fixedColor": "super-light-orange", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "logical replication launcher" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "walwriter" }, "properties": [ { "id": "color", "value": { "fixedColor": "super-light-blue", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "idle" }, "properties": [ { "id": "color", "value": { "fixedColor": "rgba(128, 128, 128, 0.5)", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "active" }, "properties": [ { "id": 
"color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "disabled" }, "properties": [ { "id": "color", "value": { "fixedColor": "#fcdb72", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "idle in transaction (aborted)" }, "properties": [ { "id": "color", "value": { "fixedColor": "#b783af", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "idle in transaction" }, "properties": [ { "id": "color", "value": { "fixedColor": "#cc4637d9", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Client" }, "properties": [ { "id": "color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "NULL" }, "properties": [ { "id": "color", "value": { "fixedColor": "#808080", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 71 }, "id": 165, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=165&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (event) (pg_wait_count{ins=\"$ins\"})", "interval": "", "legendFormat": "{{ event }}", "range": true, "refId": "A" } ], "title": "Backends by Wait Event", "transformations": [ { "id": "renameByRegex", "options": { "regex": "(Value)", "renamePattern": "NULL" } } ], "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Time spent executing SQL statements in this database", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, 
"drawStyle": "line", "fillOpacity": 9, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "area" } }, "links": [ { "title": "PGSQL Session : ${__field.labels.ins}", "url": "/d/pgsql-session?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "orange", "value": 0.5 }, { "color": "red", "value": 0.7 }, { "color": "purple", "value": 0.9 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 77 }, "id": 183, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=183&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (rate(pg_db_active_time{ins=\"$ins\"}[1m]))\n/ sum by (datname) (rate(pg_db_session_time{ins=\"$ins\"}[1m]))", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Active% (of Session Time)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Abandon: Number of database sessions to this database that were terminated because connection to the client was lost\n\n\nFatal: Number of database sessions to this database that were terminated by fatal errors\n\n\nKilled: Number of database sessions to this database that were terminated by operator intervention\n\n\navailable on PG 14+", "fieldConfig": { 
"defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 71, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Session : ${__field.labels.ins}", "url": "/d/pgsql-session?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "super-light-green", "value": null } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 77 }, "id": 184, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=184&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Max", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (increase(pg_db_sessions_abandoned{ins=\"$ins\"}[1m]))", "interval": "", "legendFormat": "[Abandoned] : {{ datname }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (increase(pg_db_sessions_fatal{ins=\"$ins\"}[1m]))", "hide": false, "interval": "", "legendFormat": "[Fatal] : {{ datname }}", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) 
(increase(pg_db_sessions_killed{ins=\"$ins\"}[1m]))", "hide": false, "interval": "", "legendFormat": "[Killed] : {{ datname }}", "range": true, "refId": "C" } ], "title": "Sessions Failure in 1m", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 83 }, "id": 171, "panels": [], "title": "Persist", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Rate of lsn in last minute", "fieldConfig": { "defaults": { "color": { "fixedColor": "#f5a673", "mode": "fixed" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 3, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGRDS Instance : ${__field.labels.ins}", "url": "/d/pgrds-instance?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "Bps" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 84 }, "id": 207, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=140&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": false }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:ins:lsn_rate1m{ins=\"$ins\"}", "interval": "", "legendFormat": "{{ ins }}", "range": true, "refId": "A" } ], "title": "LSN Progress 
(rate1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "XID Usage of each database in this instance.", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "area" } }, "links": [ { "title": "PGSQL Database : ${datname}", "url": "/d/pgsql-database?var-ins=${ins}&var-datname=${datname}&${__url_time_range}" } ], "mappings": [ { "options": { "match": "null+nan", "result": { "color": "gray", "index": 0, "text": "\u2205" } }, "type": "special" } ], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "yellow", "value": 2 }, { "color": "orange", "value": 10 }, { "color": "red", "value": 30 }, { "color": "purple", "value": 90 }, { "color": "text", "value": 100 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 84 }, "id": 208, "links": [ { "title": "PGRDS Cluster Age: ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=161&${__url_time_range}" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Max", "sortDesc": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "8.0.3", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (datname) (pg:db:age{cls=\"$cls\"})", "interval": "", "legendFormat": "{{ 
datname }}", "range": true, "refId": "A" } ], "title": "Age Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Persist : ${__field.labels.ins}", "url": "/d/pgsql-persist?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "decbytes" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 90 }, "id": 176, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=176&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg_size_bytes{ins=\"$ins\"})", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Database Cluster Size", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": 
false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 8, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Persist : ${__field.labels.ins}", "url": "/d/pgsql-persist?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "decbytes" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 90 }, "id": 178, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=178&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg_size_bytes{ins=\"$ins\", datname=~\"(log|wal)\"})", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Database WAL/Log Size", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, 
"pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Scheduled" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Requested" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 96 }, "id": 160, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=160&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "single", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_bgwriter_checkpoints_timed{ins=\"$ins\"}", "hide": false, "interval": "", "legendFormat": "Scheduled", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_bgwriter_checkpoints_req{ins=\"$ins\"} ", "hide": false, "interval": "", "legendFormat": "Requested", "range": true, "refId": "B" } ], "title": "Checkpoint Scheduled / Requested", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 30, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, 
"insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Database for ${__field.labels.datname}", "url": "/d/pgsql-database?var-ins=${ins}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "ms" }, "overrides": [ { "matcher": { "id": "byName", "options": "Sync" }, "properties": [ { "id": "color", "value": { "fixedColor": "#cc4637d9", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Write" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 96 }, "id": 174, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=174&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "rate(pg_bgwriter_checkpoint_sync_time{ins=\"$ins\"}[1m])", "interval": "", "legendFormat": "Sync", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "- rate(pg_bgwriter_checkpoint_write_time{ins=\"$ins\"}[1m])", "hide": false, "interval": "", "legendFormat": "Write", "range": true, "refId": "B" } ], "title": "Checkpoint Time : Sync/Write", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, 
"axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 0, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "decbytes" }, "overrides": [ { "matcher": { "id": "byName", "options": "Clean" }, "properties": [ { "id": "color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Backend" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Checkpoint" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 102 }, "id": 172, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=172&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "rate(pg_bgwriter_buffers_clean{ins=\"$ins\"}[1m]) * 8192", "interval": "", "legendFormat": "Clean", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "rate(pg_bgwriter_buffers_checkpoint{ins=\"$ins\"}[1m]) * 8192", "hide": false, "interval": "", "legendFormat": "Checkpoint", "range": true, "refId": "B" }, { "datasource": 
{ "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "rate(pg_bgwriter_buffers_backend{ins=\"$ins\"}[1m]) * 8192", "hide": false, "instant": true, "interval": "", "legendFormat": "Backend", "refId": "C" } ], "title": "BGWriter Buffer Flush", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 0, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "decbytes" }, "overrides": [ { "matcher": { "id": "byName", "options": "Clean" }, "properties": [ { "id": "color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Backend" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Checkpoint" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Alloc" }, "properties": [ { "id": "color", "value": { "fixedColor": "super-light-red", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 102 }, "id": 173, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=173&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "list", 
"placement": "bottom", "showLegend": false }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "rate(pg_bgwriter_buffers_alloc{ins=\"$ins\"}[1m]) * 8192", "interval": "", "legendFormat": "Alloc", "range": true, "refId": "A" } ], "title": "BGWriter Buffer Alloc", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Database 4k blocks access", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 0, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Persist : ${__field.labels.ins}", "url": "/d/pgsql-persist?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "decbytes" }, "overrides": [ { "matcher": { "id": "byName", "options": "Clean" }, "properties": [ { "id": "color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Backend" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Checkpoint" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 108 }, "id": 179, "links": [ { "title": 
"PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=179&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Name", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg:db:blks_access_1m{ins=\"$ins\"}) * 4096", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Blocks Access 1m", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Database 4k blocks hit ratio in last 1 minute", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 2, "links": [ { "title": "PGSQL Persist : ${__field.labels.ins}", "url": "/d/pgsql-persist?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}\n" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 108 }, "id": 170, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=170&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", 
"placement": "right", "showLegend": true, "sortBy": "Max", "sortDesc": false }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:db:blks_hit_ratio1m{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'}", "hide": false, "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Blocks Hit Ratio (1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Database 4k blocks access", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 0, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Persist : ${__field.labels.ins}", "url": "/d/pgsql-persist?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}\n" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "decbytes" }, "overrides": [ { "matcher": { "id": "byName", "options": "Clean" }, "properties": [ { "id": "color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Backend" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "Checkpoint" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] 
}, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 114 }, "id": 182, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=182&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Name", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg:db:blks_read_1m{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'}) * 4096", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Blocks Read (1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": true, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 5, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Persist : ${__field.labels.ins}", "url": "/d/pgsql-persist?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 114 }, "id": 181, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=181&${__url_time_range}\n" } ], "options": { 
"legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg:db:blk_read_time_seconds_rate1m{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'})", "interval": "", "legendFormat": "\u25b2 {{ datname }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "- sum by (datname) (pg:db:blk_write_time_seconds_rate1m{ins=\"$ins\", datname!~'rdsadmin|polardb_admin|template\\\\d'})", "hide": false, "interval": "", "legendFormat": "\u25bc {{ datname }}", "range": true, "refId": "B" } ], "title": "Blocks Read/Write Time Spent Rate1m", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 120 }, "id": 162, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=162&${__url_time_range}\n" } ], "panels": [], "title": "Database", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Database size on cluster primary", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 35, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 0, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": 
"/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "decbytes" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 121 }, "id": 175, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=175&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg_size_bytes{ins=\"$ins\", datname!~'wal|log|template\\\\d'}", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Database Size", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Database Size Change on Cluster Primary in Last 10 minutes", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 35, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 1, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, 
"unit": "decbytes" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 121 }, "id": 185, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=185&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "delta(pg_size_bytes{ins=\"$ins\", datname!~'wal|log|template\\\\d'}[10m])", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Database Size Delta 10m", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Database Transaction Commit Rate in last 1 minute", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 20, "gradientMode": "hue", "hideFrom": { "graph": false, "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "smooth", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}\n" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "short" }, "overrides": [ { "matcher": { "id": "byName", "options": "Instance" }, "properties": [ { "id": "custom.lineWidth", "value": 0 }, { "id": 
"custom.fillOpacity", "value": 20 } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 127 }, "id": 144, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=144&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.0-beta2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg:db:xact_commit_rate1m{ins=\"$ins\", datname!~'template\\\\d'})", "hide": false, "interval": "", "intervalFactor": 1, "legendFormat": "{{ datname }}", "range": true, "refId": "B" } ], "title": "TPS by Database (rate1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Active processes for each database in this cluster", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 60, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}\n" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 127 }, "id": 148, "links": [ { "title": "PGRDS Cluster : 
${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=148&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg_activity_count{ins=\"$ins\"})", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Session by Database", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Database 4k blocks hit ratio in last 1 minute", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 1, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 2, "links": [ { "title": "PGSQL Database : ${__field.labels.ins}.${__field.labels.datname}", "url": "/d/pgsql-database?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 133 }, "id": 155, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=155&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", 
"placement": "right", "showLegend": true, "sortBy": "Max", "sortDesc": false }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:db:blks_hit_ratio1m{ins=\"$ins\", datname!~'template\\\\d'}", "hide": false, "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Database Blocks Hit Ratio (1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "number of dangerous idle in transaction backend in different database among this cluster", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMax": 2, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 71, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "area" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}\n" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 1 } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 133 }, "id": 167, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=167&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Max", "sortDesc": true }, "tooltip": { "mode": 
"multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (pg_activity_count{ins=\"$ins\", state=~\"idle in.*\"})", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Idle in Transaction Backends", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Connection used ratio of min(\n max_connections @ instance level,\n connlimit @ database level\n)", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "axisSoftMin": 0, "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 1, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "area" } }, "decimals": 1, "links": [ { "title": "PGSQL Database : ${__field.labels.ins}.${__field.labels.datname}", "url": "/d/pgsql-database?var-ins=${__field.labels.ins}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null }, { "color": "#346f36cc", "value": 0.1 }, { "color": "#fcdb72", "value": 0.3 }, { "color": "#f5a673", "value": 0.5 }, { "color": "red", "value": 0.7 }, { "color": "#b783af", "value": 0.9 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 139 }, "id": 168, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=168&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "max" ], 
"displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:db:conn_usage{ins=\"$ins\", datname!~'template\\\\d'}", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Connection Usage", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 71, "gradientMode": "hue", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepBefore", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "auto", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Session : ${__field.labels.ins}", "url": "/d/pgsql-session?var-ins=${__field.labels.ins}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "super-light-green", "value": null } ] } }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 139 }, "id": 166, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=166&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "increase(pg_db_sessions{ins=\"$ins\", 
datname!~'template\\\\d'}[1m])", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "New Sessions (incr1m)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Tuples fetched from databases among this cluster", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 3, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}\n" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "cps" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 145 }, "id": 163, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=163&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname) (rate(pg_db_tup_fetched{ins=\"$ins\", datname!~'template\\\\d'}[1m]))", "interval": "", "legendFormat": "{{ datname }}", "range": true, "refId": "A" } ], "title": "Row Fetched", "type": 
"timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Tuples fetched, inserted, updated, delete among this cluster group by database", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 50, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 0, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}\n" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "cps" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 145 }, "id": 164, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=164&${__url_time_range}\n" } ], "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "mode": "multi", "sort": "none" } }, "pluginVersion": "8.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum(rate(pg_db_tup_inserted{ins=\"$ins\", datname!~'template\\\\d'}[1m])) by (datname)", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "INSERT.{{ datname }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": 
"sum(rate(pg_db_tup_updated{ins=\"$ins\", datname!~'template\\\\d'}[1m])) by (datname)", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "UPDATE.{{ datname }}", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum(rate(pg_db_tup_deleted{ins=\"$ins\", datname!~'template\\\\d'}[1m])) by (datname)", "hide": false, "interval": "", "intervalFactor": 2, "legendFormat": "DELETE.{{ datname }}", "range": true, "refId": "D" } ], "title": "Row Modified", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 151 }, "id": 124, "links": [ { "title": "PGRDS Cluster : ${cls}", "url": "/d/pgrds-cluster?var-cls=${cls}&viewPanel=124&${__url_time_range}\n" } ], "panels": [], "title": "Table & Query", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Index Scan + Seq Scan on this table per second", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 7, "w": 12, "x": 0, "y": 152 }, "id": 125, 
"options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "8.0.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "sum by (datname,relname) (\n pg:table:scan_rate1m{ins=\"$ins\", datname!~'template\\\\d'}\n)", "interval": "", "intervalFactor": 2, "legendFormat": "{{ datname }}.{{ relname }}", "range": true, "refId": "A" } ], "title": "Table Scan (scan/s)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "short" }, "overrides": [] }, "gridPos": { "h": 7, "w": 12, "x": 12, "y": 152 }, "id": 126, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "8.0.1", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" 
}, "editorMode": "code", "exemplar": false, "expr": "sum by (datname,relname) (\n rate(pg_table_tup_read{ins=\"$ins\", datname!~'template\\\\d'}[1m])\n)", "interval": "", "intervalFactor": 2, "legendFormat": "{{ datname }}.{{ relname }}", "range": true, "refId": "A" } ], "title": "Tuple Read (rows/s)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "ops" }, "overrides": [] }, "gridPos": { "h": 11, "w": 12, "x": 0, "y": 159 }, "id": 187, "options": { "legend": { "calcs": [ "max", "mean", "last" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Max", "sortDesc": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "8.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "pg:query:call_rate1m{ins=\"$ins\", datname!~'template\\\\d'}", "interval": "", "intervalFactor": 2, "legendFormat": "{{ datname }}.{{ query }}", "range": true, "refId": "B" } ], "title": "Query Call", "type": "timeseries" }, { 
"datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Top Query Time Spent among this cluster", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "drawStyle": "line", "fillOpacity": 40, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 0, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "links": [ { "title": "PGSQL Databases : ${cls}.${__field.labels.datname}", "url": "/d/pgsql-databases?var-cls=${cls}&var-datname=${__field.labels.datname}&${__url_time_range}" } ], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 11, "w": 12, "x": 12, "y": 159 }, "id": 111, "options": { "legend": { "calcs": [ "max", "mean", "last" ], "displayMode": "table", "placement": "bottom", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "mode": "single", "sort": "none" } }, "pluginVersion": "8.0.0", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "rate(pg_query_exec_time{ins=\"$ins\", datname!~'template\\\\d'}[1m])", "interval": "", "intervalFactor": 2, "legendFormat": "{{ datname }}.{{ query }}", "range": true, "refId": "B" } ], "title": "Query Time ", "type": "timeseries" } ], "refresh": "", "revision": 1, "schemaVersion": 39, "tags": [ "Pigsty", "PGSQL", "Instance", "PGRDS" ], "templating": { "list": [ { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_up, ins)", 
"description": "Unique instance identifier (e.g pg-meta-1)", "hide": 0, "includeAll": false, "label": "Instance", "multi": false, "name": "ins", "options": [], "query": { "query": "label_values(pg_up, ins)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_up{ins=\"$ins\"} , ip)", "description": "IP address of this postgres instance", "hide": 2, "includeAll": false, "label": "IP", "multi": false, "name": "ip", "options": [], "query": { "query": "label_values(pg_up{ins=\"$ins\"} , ip)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_up{ins=\"$ins\"}, ins)", "description": "Sequence number of this instance, which is a unique integer among a postgres cluster", "hide": 2, "includeAll": false, "label": "Sequence", "multi": false, "name": "seq", "options": [], "query": { "query": "label_values(pg_up{ins=\"$ins\"}, ins)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "/^[a-zA-Z0-9-_]+-(\\d+)$/", "skipUrlSync": false, "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_up{ins=\"$ins\"}, cls)", "description": "Cluster identifier for this postgres instance, cls should be unique among entire environment. 
such as pg-meta, pg-test", "hide": 2, "includeAll": false, "label": "Cluster", "multi": false, "name": "cls", "options": [], "query": { "query": "label_values(pg_up{ins=\"$ins\"}, cls)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(node_uname_info{ip=\"$ip\"},nodename)", "description": "Node name of current postgres instance", "hide": 2, "includeAll": false, "label": "Node", "multi": false, "name": "node", "options": [], "query": { "query": "label_values(node_uname_info{ip=\"$ip\"},nodename)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_db_age{ins=\"$ins\", datname!~\"template0|template1|postgres|rdsadmin|polardb_admin\"},datname)", "description": "Non-trivial database in this instance", "hide": 2, "includeAll": false, "label": "Database", "multi": false, "name": "datname", "options": [], "query": { "query": "label_values(pg_db_age{ins=\"$ins\", datname!~\"template0|template1|postgres|rdsadmin|polardb_admin\"},datname)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", "skipUrlSync": false, "sort": 1, "type": "query" } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": {}, "timezone": "", "title": "PGRDS Instance", "uid": "pgrds-instance", "version": 1, "weekStart": "" } ================================================ FILE: monitor/pgsql-exporter.json ================================================ { "annotations": { "list": [ { "builtIn": 1, "datasource": { "type": "datasource", "uid": "grafana" }, "enable": true, "hide": true, "iconColor": "rgba(0, 211, 255, 1)", "name": "Annotations & Alerts", "target": { "limit": 100, "matchAny": false, 
"tags": [], "type": "dashboard" }, "type": "dashboard" } ] }, "author": "Ruohang Feng (rh@vonng.com)", "description": "PostgreSQL Instance Dashboard", "editable": true, "fiscalYearStartMonth": 0, "graphTooltip": 0, "id": null, "license": "AGPLv3 @ https://pigsty.io/docs/about/license", "links": [ { "asDropdown": true, "icon": "external link", "includeVars": true, "keepTime": true, "tags": [ "Pigsty", "PGSQL", "Overview" ], "targetBlank": false, "title": "Overview", "tooltip": "", "type": "dashboards", "url": "" }, { "asDropdown": true, "icon": "external link", "includeVars": true, "keepTime": true, "tags": [ "Pigsty", "PGSQL", "Cluster" ], "targetBlank": false, "title": "Cluster", "tooltip": "", "type": "dashboards", "url": "" }, { "asDropdown": true, "icon": "external link", "includeVars": true, "keepTime": true, "tags": [ "Pigsty", "PGSQL", "Instance" ], "targetBlank": false, "title": "Instance", "tooltip": "", "type": "dashboards", "url": "" }, { "asDropdown": true, "icon": "external link", "includeVars": true, "keepTime": true, "tags": [ "Pigsty", "PGSQL", "Database" ], "targetBlank": false, "title": "Database", "tooltip": "", "type": "dashboards", "url": "" } ], "panels": [ { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 0 }, "id": 137, "panels": [], "title": "Overview", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "mappings": [], "max": 2.5, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#3e668f", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byName", "options": "Cluster" }, "properties": [ { "id": "displayName", "value": "${cls}" }, { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Cluster" } }, "type": "value" } ] }, { "id": "links", "value": [ { "title": "PGSQL Cluster : ${cls}", "url": "/d/pgsql-cluster?var-cls=${cls}&${__url_time_range}" } ] } ] }, { 
"matcher": { "id": "byName", "options": "Instance" }, "properties": [ { "id": "displayName", "value": "${ins}" }, { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Instance" } }, "type": "value" } ] }, { "id": "links", "value": [ { "title": "PGCAT Instance : ${ins}", "url": "/d/pgcat-instance?var-dsn=${ins}.${datname}" } ] } ] }, { "matcher": { "id": "byName", "options": "IP" }, "properties": [ { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Node IP" } }, "type": "value" } ] }, { "id": "displayName", "value": "${ip}" }, { "id": "links", "value": [ { "title": "Node Instance : ${ip}", "url": "/d/node-instance?var-id=${ip}&${__url_time_range}" } ] } ] }, { "matcher": { "id": "byName", "options": "Name" }, "properties": [ { "id": "displayName", "value": "${node}" }, { "id": "mappings", "value": [ { "options": { "1": { "index": 0, "text": "Hostname" } }, "type": "value" } ] }, { "id": "links", "value": [ { "title": "Node Instance : ${node}", "url": "/d/node-instance?var-id=${node}&${__url_time_range}" } ] } ] } ] }, "gridPos": { "h": 6, "w": 6, "x": 0, "y": 1 }, "id": 110, "options": { "colorMode": "background", "graphMode": "area", "justifyMode": "auto", "orientation": "horizontal", "percentChangeColorMode": "standard", "reduceOptions": { "calcs": [ "lastNotNull" ], "fields": "", "values": false }, "showPercentChange": false, "text": { "titleSize": 20, "valueSize": 16 }, "textMode": "value_and_name", "wideLayout": true }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "1", "hide": false, "instant": true, "interval": "", "legendFormat": "Instance", "queryType": "measurements", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "1", "hide": false, "instant": true, "interval": "", "legendFormat": "Cluster", "queryType": "measurements", 
"refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "1", "hide": false, "instant": true, "interval": "", "legendFormat": "IP", "queryType": "measurements", "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "1", "hide": false, "instant": true, "interval": "", "legendFormat": "Name", "queryType": "measurements", "refId": "D" } ], "title": "", "type": "stat" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Exporter Aliveness Status in ${cls}\n\nInstance: Goto PGSQL Instance\n\nIP: Goto PGSQL Node\n\nStatus: Goto PGSQL Service\n\nLoad: max(cpu,postgres,pgbouncer)\n\nSpace: Disk space usage max(all device)\n\nProxy: session number, Goto Haproxy Admin Page", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "center", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "max": 1.2, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#e3e3e3e0", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byRegexp", "options": "/postgres|pgbouncer|pgbackrest/" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "#cc4637d9", "index": 0, "text": "DOWN" }, "1": { "color": "#346f36cc", "index": 1, "text": "UP" } }, "type": "value" } ] }, { "id": "custom.cellOptions", "value": { "type": "color-background" } }, { "id": "custom.width", "value": 100 } ] }, { "matcher": { "id": "byRegexp", "options": "/.*up/" }, "properties": [ { "id": "unit", "value": "s" }, { "id": "custom.width", "value": 100 } ] }, { "matcher": { "id": "byName", "options": "Instance" }, "properties": [ { "id": "links", "value": [ { "title": "PGSQL Exporter : ${__data.fields.Instance}", "url": "/d/pgsql-exporter?var-ins=${__data.fields.Instance}&${__url_time_range}" } ] } ] } ] }, "gridPos": { "h": 6, "w": 10, "x": 6, 
"y": 1 }, "id": 144, "links": [ { "title": "PGSQL Instance : ${primary}", "url": "/d/pgsql-instance?var-ins=${primary}&${__url_time_range}" } ], "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": false, "displayName": "pg uptime" } ] }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pg_exporter_agent_up{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pgbouncer_exporter_agent_up{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "__auto", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pgbackrest_exporter_agent_up{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pg_exporter_uptime{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pgbouncer_exporter_uptime{cls=\"$cls\"})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "G" } ], "title": "Exporter Status", "transformations": [ { "id": "seriesToColumns", "options": { "byField": "ins", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": {
"Time": true, "Time 1": true, "Time 10": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 6": true, "Time 8": true, "Time 9": true, "Value #A": false, "Value #B": false, "Value #H": false, "__name__": true, "__name__ 1": true, "__name__ 2": true, "__name__ 3": true, "__name__ 5": true, "__name__ 7": true, "cls": true, "cls 1": true, "cls 2": true, "cls 3": true, "cls 4": true, "cls 5": true, "cls 6": true, "cls 7": true, "instance": true, "instance 1": true, "instance 2": false, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "ip 1": true, "ip 2": true, "ip 3": true, "ip 4": true, "ip 5": true, "ip 6": true, "ip 7": true, "job": true, "job 1": true, "job 2": true, "job 3": true, "job 4": true, "job 5": true, "job 6": true, "job 7": true }, "includeByName": {}, "indexByName": { "Time 1": 5, "Time 2": 6, "Time 3": 7, "Time 4": 8, "Value #A": 1, "Value #B": 2, "Value #C": 3, "Value #D": 4, "ins": 0 }, "renameByName": { "Time 4": "", "Value #A": "postgres", "Value #B": "pgbouncer", "Value #C": "pgbackrest", "Value #D": "pgb uptime", "Value #E": "DB Conn", "Value #F": "pg uptime", "Value #G": "pgb uptime", "Value #H": "LB", "Value #I": "QPS", "Value #J": "LB Clients", "Value #K": "Lag", "cls 1": "", "cls 2": "", "ins": "Instance", "instance": "", "instance 2": "", "ip": "IP", "ip 1": "IP" } } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 12, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode":
"none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "links": [], "mappings": [ { "options": { "0": { "index": 0, "text": "Dead" }, "1": { "index": 1, "text": "UP" } }, "type": "value" } ], "max": 1.2, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "postgres" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 8, "x": 16, "y": 1 }, "id": 145, "options": { "legend": { "calcs": [ "min" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Min", "sortDesc": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_agent_up{ins=\"$ins\"}", "legendFormat": "pg_exporter", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_exporter_agent_up{ins=\"$ins\"}", "legendFormat": "pgbouncer_exporter", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_up{ins=\"$ins\"}", "hide": false, "legendFormat": "postgres", "range": true, "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_up{ins=\"$ins\"}", "hide": false, "legendFormat": "pgbouncer", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "patroni_up{ins=\"$ins\"}", "hide": false, "legendFormat": "patroni", "range": true, "refId": "E" }, { "datasource": { "type": "prometheus", 
"uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbackrest_exporter_agent_up{ins=\"$ins\"}", "hide": false, "legendFormat": "pgbackrest_exporter", "range": true, "refId": "F" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "up{ins=\"$ins\", job=~\"pgsql|pgrds\"}", "hide": true, "legendFormat": "{{ instance }}", "range": true, "refId": "Z" } ], "title": "Aliveness", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 7 }, "id": 142, "panels": [], "title": "Global Status", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "links": [], "mappings": [ { "options": { "0": { "index": 0, "text": "Dead" }, "1": { "index": 1, "text": "UP" } }, "type": "value" } ], "max": 1.2, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "postgres" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 8 }, "id": 139, "options": { "legend": { "calcs": [ 
"min" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Min", "sortDesc": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_agent_up{}", "legendFormat": "{{ instance }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_exporter_agent_up{}", "legendFormat": "{{ instance }}", "range": true, "refId": "B" } ], "title": "Aliveness", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "All instances among pgsql cluster ${cls}\n\nInstance: Goto PGSQL Instance\n\nIP: Goto PGSQL Node\n\nStatus: Goto PGSQL Service\n\nLoad: max(cpu,postgres,pgbouncer)\n\nSpace: Disk space usage max(all device)\n\nProxy: session number, Goto Haproxy Admin Page", "fieldConfig": { "defaults": { "color": { "mode": "thresholds" }, "custom": { "align": "center", "cellOptions": { "type": "auto" }, "inspect": false }, "mappings": [], "max": 1.2, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "#e3e3e3e0", "value": null } ] } }, "overrides": [ { "matcher": { "id": "byRegexp", "options": "/pg_exporter|pgb_exporter/" }, "properties": [ { "id": "mappings", "value": [ { "options": { "0": { "color": "#cc4637d9", "index": 0, "text": "DOWN" }, "1": { "color": "#346f36cc", "index": 1, "text": "UP" } }, "type": "value" } ] }, { "id": "custom.cellOptions", "value": { "type": "color-background" } }, { "id": "custom.width", "value": 110 } ] }, { "matcher": { "id": "byRegexp", "options": "/.*up/" }, "properties": [ { "id": "unit", "value": "s" }, { "id": "custom.width", "value": 100 } ] }, { "matcher": { "id": "byName", "options": "Instance" }, "properties": [ { "id": "links", "value": [ { "title": "PGSQL Exporter : ${__data.fields.Instance}", "url": 
"/d/pgsql-exporter?var-ins=${__data.fields.Instance}&${__url_time_range}" } ] } ] } ] }, "gridPos": { "h": 12, "w": 12, "x": 12, "y": 8 }, "id": 141, "links": [ { "title": "PGSQL Instance : ${primary}", "url": "/d/pgsql-instance?var-ins=${primary}&${__url_time_range}" } ], "options": { "cellHeight": "sm", "footer": { "countRows": false, "fields": "", "reducer": [ "sum" ], "show": false }, "showHeader": true, "sortBy": [ { "desc": false, "displayName": "pg uptime" } ] }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pg_exporter_agent_up{})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pgbouncer_exporter_agent_up{})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pg_exporter_uptime{})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "C" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "exemplar": false, "expr": "max by (ins) (pgbouncer_exporter_uptime{})", "format": "table", "hide": false, "instant": true, "interval": "", "legendFormat": "", "refId": "D" } ], "title": "", "transformations": [ { "id": "seriesToColumns", "options": { "byField": "ins", "mode": "outer" } }, { "id": "organize", "options": { "excludeByName": { "Time": true, "Time 1": true, "Time 10": true, "Time 2": true, "Time 3": true, "Time 4": true, "Time 6": true, "Time 8": true, "Time 9": true, "Value #A": false, "Value #B": false, "Value #H": false, "__name__": true, "__name__ 1": true, "__name__ 2": true, "__name__ 3": 
true, "__name__ 5": true, "__name__ 7": true, "cls": true, "cls 1": true, "cls 2": true, "cls 3": true, "cls 4": true, "cls 5": true, "cls 6": true, "cls 7": true, "instance": true, "instance 1": true, "instance 2": false, "instance 3": true, "instance 4": true, "instance 5": true, "instance 6": true, "ip 1": true, "ip 2": true, "ip 3": true, "ip 4": true, "ip 5": true, "ip 6": true, "ip 7": true, "job": true, "job 1": true, "job 2": true, "job 3": true, "job 4": true, "job 5": true, "job 6": true, "job 7": true }, "indexByName": { "Time 1": 5, "Time 2": 6, "Time 3": 7, "Time 4": 8, "Value #A": 1, "Value #B": 2, "Value #C": 3, "Value #D": 4, "ins": 0 }, "renameByName": { "Time 4": "", "Value #A": "pg_exporter", "Value #B": "pgb_exporter", "Value #C": "pg uptime", "Value #D": "pgb uptime", "Value #E": "DB Conn", "Value #F": "RT", "Value #G": "LB Conn", "Value #H": "LB", "Value #I": "QPS", "Value #J": "LB Clients", "Value #K": "Lag", "cls 1": "", "cls 2": "", "ins": "Instance", "instance": "", "instance 2": "", "ip": "IP", "ip 1": "IP" } } } ], "type": "table" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "points", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 0, "pointSize": 2, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 14 }, 
"id": 138, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_scrape_duration{}", "legendFormat": "{{ ins }}", "range": true, "refId": "A" } ], "title": "Scrape Duration", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "links": [], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "s" }, "overrides": [ { "matcher": { "id": "byName", "options": "postgres" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 20 }, "id": 146, "options": { "legend": { "calcs": [ "last" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Last", "sortDesc": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", 
"targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_uptime{}", "legendFormat": "{{ ins }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_exporter_uptime{}", "hide": false, "legendFormat": "{{ ins }}", "range": true, "refId": "B" } ], "title": "Global Uptime", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Error Increase in last 1 minute", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 12, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "postgres" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 20 }, "id": 147, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Name", "sortDesc": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { 
"type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pg_exporter_scrape_error_count{}[1m])", "legendFormat": "{{ ins }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pgbouncer_exporter_scrape_error_count{}[1m])", "legendFormat": "{{ ins }}", "range": true, "refId": "B" } ], "title": "Global Error Rate", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 26 }, "id": 130, "panels": [], "title": "Metrics", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "links": [], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "s" }, "overrides": [ { "matcher": { "id": "byName", "options": "postgres" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 27 }, "id": 124, "options": { "legend": { "calcs": [], "displayMode": "list", "placement": "bottom", "showLegend": true }, "tooltip": { "hideZeros": false, 
"mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_uptime{ins=\"$ins\"}", "legendFormat": "postgres", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_exporter_uptime{ins=\"$ins\"}", "hide": false, "legendFormat": "pgbouncer", "range": true, "refId": "B" } ], "title": "Up Time", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 12, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "links": [], "mappings": [ { "options": { "0": { "index": 0, "text": "Dead" }, "1": { "index": 1, "text": "UP" } }, "type": "value" } ], "max": 1.2, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "postgres" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 27 }, "id": 128, "options": { "legend": { "calcs": [ "min" ], "displayMode": "table", "placement": "right", 
"showLegend": true, "sortBy": "Min", "sortDesc": false }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_agent_up{ins=\"$ins\"}", "legendFormat": "postgres", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_exporter_agent_up{ins=\"$ins\"}", "legendFormat": "pgbouncer", "range": true, "refId": "B" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbackrest_exporter_agent_up{ins=\"$ins\"}", "hide": false, "legendFormat": "pgbackrest", "range": true, "refId": "D" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "up{ins=\"$ins\", job=~\"pgsql|pgrds\"}", "hide": true, "legendFormat": "{{ instance }}", "range": true, "refId": "C" } ], "title": "Exporter Aliveness", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "s" }, "overrides": [ { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": 
"color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "postgres" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 32 }, "id": 129, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_scrape_duration{ins=\"$ins\"}", "legendFormat": "postgres", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_exporter_scrape_duration{ins=\"$ins\"}", "hide": false, "legendFormat": "pgbouncer", "range": true, "refId": "B" } ], "title": "Scrape Duration", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Error Increase in last 1 minute", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 12, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": { 
"fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "postgres" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 32 }, "id": 136, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Name", "sortDesc": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pg_exporter_scrape_error_count{ins=\"$ins\"}[1m])", "legendFormat": "postgres", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pgbouncer_exporter_scrape_error_count{ins=\"$ins\"}[1m])", "legendFormat": "pgbouncer", "range": true, "refId": "B" } ], "title": "Error Count Per Minute", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "s" }, "overrides": [ { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": {
"fixedColor": "#f5a673", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 12, "x": 0, "y": 37 }, "id": 125, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_server_scrape_duration{ins=\"$ins\"}", "legendFormat": "{{ datname }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_exporter_server_scrape_duration{ins=\"$ins\"}", "hide": false, "legendFormat": "pgbouncer", "range": true, "refId": "B" } ], "title": "Scrape Duration (per Server)", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "Scrape Count", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 12, "gradientMode": "opacity", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 2, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 1, "links": [], "mappings": [], "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byName", "options": "Error" }, "properties": [ { "id": "color", "value": { "fixedColor": "#E02F44", "mode": "fixed" } }, { "id": "unit", "value": "short" }, { "id": "min", "value": 0 }, { "id": "max", "value": 0.1 } ] } ] 
}, "gridPos": { "h": 5, "w": 12, "x": 12, "y": 37 }, "id": 126, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pg_exporter_server_scrape_total_count{ins=\"$ins\"}[1m])", "legendFormat": "{{ datname }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pgbouncer_exporter_server_scrape_total_count{ins=\"$ins\"}[1m])", "legendFormat": "pgbouncer", "range": true, "refId": "B" } ], "title": "Scrape Count Per Minute", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 42 }, "id": 134, "panels": [], "title": "Collectors", "type": "row" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 10, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "line+area" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "transparent", "value": null } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 0, "y": 43 }, "id": 123, "options": { "legend": { "calcs": [ "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Max", 
"sortDesc": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "sum by (query) (increase(pg_exporter_query_scrape_error_count{ins=\"$ins\"}[1m]))", "legendFormat": "{{query}}", "range": true, "refId": "A" } ], "title": "Query Errors", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 2, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "none" }, "overrides": [] }, "gridPos": { "h": 6, "w": 12, "x": 12, "y": 43 }, "id": 132, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "sum by (query) (pg_exporter_query_scrape_metric_count{ins=\"$ins\"})", "legendFormat": "{{query}}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "sum by (query) (pgbouncer_exporter_query_scrape_metric_count{ins=\"$ins\"})", 
"hide": false, "legendFormat": "{{query}}", "range": true, "refId": "B" } ], "title": "Metrics Count", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 3, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "s" }, "overrides": [] }, "gridPos": { "h": 15, "w": 24, "x": 0, "y": 49 }, "id": 131, "options": { "legend": { "calcs": [ "mean", "lastNotNull", "max" ], "displayMode": "table", "placement": "right", "showLegend": true, "sortBy": "Mean", "sortDesc": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pg_exporter_query_scrape_duration{ins=\"$ins\"}[1m]) / \nincrease(pg_exporter_query_scrape_total_count{ins=\"$ins\"}[1m])", "legendFormat": "{{ datname }}.{{ query }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pgbouncer_exporter_query_scrape_duration{ins=\"$ins\"}[1m]) / \nincrease(pgbouncer_exporter_query_scrape_total_count{ins=\"$ins\"}[1m])", "hide": false, "legendFormat": "pgbouncer.{{ query }}", "range": true, "refId": "B" } ], "title": 
"Query Duration", "type": "timeseries" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 0, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "links": [], "mappings": [], "max": 1, "min": 0, "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null }, { "color": "red", "value": 80 } ] }, "unit": "percentunit" }, "overrides": [] }, "gridPos": { "h": 9, "w": 12, "x": 0, "y": 64 }, "id": 133, "options": { "legend": { "calcs": [ "mean", "lastNotNull", "max" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pg_exporter_query_scrape_hit_count{ins=\"$ins\"}[5m]) / \nincrease(pg_exporter_query_scrape_total_count{ins=\"$ins\"}[5m])", "legendFormat": "{{ datname }}.{{query}}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "increase(pgbouncer_exporter_query_scrape_hit_count{ins=\"$ins\"}[5m]) / \nincrease(pgbouncer_exporter_query_scrape_total_count{ins=\"$ins\"}[5m])", "hide": false, "legendFormat": "{{ datname }}.{{query}}", "range": true, "refId": "B" } ], "title": "Query Cache Hit Rate", "type": "timeseries" }, { 
"datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "description": "", "fieldConfig": { "defaults": { "color": { "mode": "palette-classic" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "line", "fillOpacity": 1, "gradientMode": "none", "hideFrom": { "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "stepAfter", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": false, "stacking": { "group": "A", "mode": "none" }, "thresholdsStyle": { "mode": "off" } }, "links": [], "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "green", "value": null } ] }, "unit": "s" }, "overrides": [ { "matcher": { "id": "byName", "options": "pgbouncer" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 9, "w": 12, "x": 12, "y": 64 }, "id": 135, "options": { "legend": { "calcs": [ "mean" ], "displayMode": "table", "placement": "right", "showLegend": true }, "tooltip": { "hideZeros": false, "mode": "multi", "sort": "desc" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pg_exporter_query_cache_ttl{ins=\"$ins\"}", "legendFormat": "{{ datname }}.{{ query }}", "range": true, "refId": "A" }, { "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "editorMode": "code", "expr": "pgbouncer_exporter_query_cache_ttl{ins=\"$ins\"}", "hide": false, "legendFormat": "pgbouncer. 
{{ query }}", "range": true, "refId": "B" } ], "title": "Cache TTL", "type": "timeseries" }, { "collapsed": false, "gridPos": { "h": 1, "w": 24, "x": 0, "y": 73 }, "id": 98, "panels": [], "title": "PG Exporter Logs: ${ins}", "type": "row" }, { "datasource": { "type": "loki", "uid": "ds-loki" }, "description": "pg_exporter logs count", "fieldConfig": { "defaults": { "color": { "fixedColor": "#346f36cc", "mode": "fixed" }, "custom": { "axisBorderShow": false, "axisCenteredZero": false, "axisColorMode": "text", "axisLabel": "", "axisPlacement": "auto", "barAlignment": 0, "barWidthFactor": 0.6, "drawStyle": "bars", "fillOpacity": 100, "gradientMode": "none", "hideFrom": { "graph": false, "legend": false, "tooltip": false, "viz": false }, "insertNulls": false, "lineInterpolation": "linear", "lineWidth": 1, "pointSize": 5, "scaleDistribution": { "type": "linear" }, "showPoints": "never", "spanNulls": true, "stacking": { "group": "A", "mode": "normal" }, "thresholdsStyle": { "mode": "off" } }, "decimals": 0, "mappings": [], "thresholds": { "mode": "absolute", "steps": [ { "color": "#346f36cc", "value": null } ] }, "unit": "none" }, "overrides": [ { "matcher": { "id": "byFrameRefID", "options": "A" }, "properties": [ { "id": "color", "value": { "fixedColor": "#346f36cc", "mode": "fixed" } } ] }, { "matcher": { "id": "byFrameRefID", "options": "B" }, "properties": [ { "id": "color", "value": { "fixedColor": "#3e668f", "mode": "fixed" } } ] }, { "matcher": { "id": "byFrameRefID", "options": "C" }, "properties": [ { "id": "color", "value": { "fixedColor": "#f5a673", "mode": "fixed" } } ] }, { "matcher": { "id": "byName", "options": "pgbackrest" }, "properties": [ { "id": "color", "value": { "fixedColor": "light-red", "mode": "fixed" } } ] } ] }, "gridPos": { "h": 5, "w": 24, "x": 0, "y": 74 }, "id": 100, "interval": "1s", "links": [ { "title": "PG Exporter Logs for ${ins}", "url": "/d/logs-instance?var-ins=$ins&var-src=syslog&var-search=exporter&${__url_time_range}" } ], 
"options": { "legend": { "calcs": [ "sum" ], "displayMode": "table", "placement": "right", "showLegend": false }, "tooltip": { "hideZeros": false, "mode": "single", "sort": "none" } }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "loki", "uid": "ds-loki" }, "editorMode": "code", "expr": "count_over_time(({ip=\"$ip\", src=\"syslog\"} |~ \"pg_exporter\")[$__interval])", "legendFormat": "", "queryType": "range", "refId": "A" } ], "title": "Logs per $__interval", "type": "timeseries" }, { "datasource": { "type": "loki", "uid": "ds-loki" }, "description": "Recent logs for pgbouncer & patroni & postgres", "fieldConfig": { "defaults": {}, "overrides": [] }, "gridPos": { "h": 12, "w": 24, "x": 0, "y": 79 }, "id": 102, "options": { "dedupStrategy": "none", "enableInfiniteScrolling": false, "enableLogDetails": true, "prettifyLogMessage": false, "showCommonLabels": false, "showLabels": false, "showTime": true, "sortOrder": "Descending", "wrapLogMessage": false }, "pluginVersion": "11.5.2", "targets": [ { "datasource": { "type": "loki", "uid": "ds-loki" }, "editorMode": "code", "expr": "{ip=\"$ip\"} |~ \"pg_exporter\"", "instant": false, "queryType": "range", "range": true, "refId": "A" } ], "title": "Recent Logs", "type": "logs" } ], "preload": false, "refresh": "", "schemaVersion": 40, "tags": [ "Pigsty", "PGSQL", "Instance" ], "templating": { "list": [ { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_up, ins)", "description": "Unique instance identifier (e.g pg-meta-1)", "includeAll": false, "label": "Instance", "name": "ins", "options": [], "query": { "query": "label_values(pg_up, ins)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_up{ins=\"$ins\"} , ip)", "description": "IP address of this postgres instance", "hide": 2, 
"includeAll": false, "label": "IP", "name": "ip", "options": [], "query": { "query": "label_values(pg_up{ins=\"$ins\"} , ip)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_up{ins=\"$ins\"}, ins)", "description": "Sequence number of this instance, which is an unique integer among a postgres cluster", "hide": 2, "includeAll": false, "label": "Sequence", "name": "seq", "options": [], "query": { "query": "label_values(pg_up{ins=\"$ins\"}, ins)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "/^[a-zA-Z0-9-_]+-(\\d+)$/", "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_up{ins=\"$ins\"}, cls)", "description": "Cluster identifier for this postgres instance, cls should be unique among entire environment. such as pg-meta, pg-test", "hide": 2, "includeAll": false, "label": "Cluster", "name": "cls", "options": [], "query": { "query": "label_values(pg_up{ins=\"$ins\"}, cls)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(node_uname_info{ip=\"$ip\"},nodename)", "description": "Node name of current postgres instance", "hide": 2, "includeAll": false, "label": "Node", "name": "node", "options": [], "query": { "query": "label_values(node_uname_info{ip=\"$ip\"},nodename)", "refId": "PrometheusVariableQueryEditor-VariableQuery" }, "refresh": 1, "regex": "", "sort": 1, "type": "query" }, { "current": {}, "datasource": { "type": "prometheus", "uid": "ds-prometheus" }, "definition": "label_values(pg_db_age{ins=\"$ins\",datname!~'postgres|template0|template1'}, datname)", "description": "Non-trivial database in this instance", "hide": 2, "includeAll": false, "label": 
"Database", "name": "datname", "options": [], "query": { "query": "label_values(pg_db_age{ins=\"$ins\",datname!~'postgres|template0|template1'}, datname)", "refId": "StandardVariableQuery" }, "refresh": 1, "regex": "", "sort": 1, "type": "query" } ] }, "time": { "from": "now-1h", "to": "now" }, "timepicker": {}, "timezone": "Asia/Shanghai", "title": "PGSQL Exporter", "uid": "pgsql-exporter", "version": 1, "weekStart": "" } ================================================ FILE: package/nfpm-amd64-deb.yaml ================================================ name: "pg-exporter" arch: "amd64" platform: "linux" version: "v1.2.2" release: "1" version_schema: semver maintainer: Ruohang Feng description: | Prometheus exporter for PostgreSQL / Pgbouncer server metrics. Supported version: Postgres9.x - 18+ & Pgbouncer 1.8 - 1.25+ Part of Project Pigsty -- Battery Included PostgreSQL Distribution with ultimate observability support: https://pigsty.io/docs/pg_exporter vendor: "PGSTY" homepage: "https://pigsty.io/docs/pg_exporter" license: "Apache-2.0 License" rpm: compression: gzip prefixes: - /usr/bin contents: - src: pg_exporter dst: /usr/bin/pg_exporter file_info: mode: 0755 - src: pg_exporter.yml dst: /etc/pg_exporter.yml type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.default dst: /etc/default/pg_exporter type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.service dst: /lib/systemd/system/pg_exporter.service type: config - src: LICENSE dst: /usr/share/doc/pg_exporter/LICENSE file_info: mode: 0644 scripts: preinstall: package/preinstall.sh ================================================ FILE: package/nfpm-amd64-rpm.yaml ================================================ name: "pg_exporter" arch: "amd64" platform: "linux" version: "v1.2.2" release: "1" version_schema: semver maintainer: Ruohang Feng description: | Prometheus exporter for PostgreSQL / Pgbouncer 
server metrics. Supported version: Postgres9.x - 18+ & Pgbouncer 1.8 - 1.25+ Part of Project Pigsty -- Battery Included PostgreSQL Distribution with ultimate observability support: https://pigsty.io/docs/pg_exporter vendor: "PGSTY" homepage: "https://pigsty.io/docs/pg_exporter" license: "Apache-2.0 License" rpm: compression: gzip prefixes: - /usr/bin contents: - src: pg_exporter dst: /usr/bin/pg_exporter file_info: mode: 0755 - src: pg_exporter.yml dst: /etc/pg_exporter.yml type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.default dst: /etc/default/pg_exporter type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.service dst: /usr/lib/systemd/system/pg_exporter.service type: config - src: LICENSE dst: /usr/share/doc/pg_exporter/LICENSE file_info: mode: 0644 scripts: preinstall: package/preinstall.sh ================================================ FILE: package/nfpm-arm64-deb.yaml ================================================ name: "pg-exporter" arch: "arm64" platform: "linux" version: "v1.2.2" release: "1" version_schema: semver maintainer: Ruohang Feng description: | Prometheus exporter for PostgreSQL / Pgbouncer server metrics. 
Supported version: Postgres9.x - 18+ & Pgbouncer 1.8 - 1.25+ Part of Project Pigsty -- Battery Included PostgreSQL Distribution with ultimate observability support: https://pigsty.io/docs/pg_exporter vendor: "PGSTY" homepage: "https://pigsty.io/docs/pg_exporter" license: "Apache-2.0 License" rpm: compression: gzip prefixes: - /usr/bin contents: - src: pg_exporter dst: /usr/bin/pg_exporter file_info: mode: 0755 - src: pg_exporter.yml dst: /etc/pg_exporter.yml type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.default dst: /etc/default/pg_exporter type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.service dst: /lib/systemd/system/pg_exporter.service type: config - src: LICENSE dst: /usr/share/doc/pg_exporter/LICENSE file_info: mode: 0644 scripts: preinstall: package/preinstall.sh ================================================ FILE: package/nfpm-arm64-rpm.yaml ================================================ name: "pg_exporter" arch: "arm64" platform: "linux" version: "v1.2.2" release: "1" version_schema: semver maintainer: Ruohang Feng description: | Prometheus exporter for PostgreSQL / Pgbouncer server metrics. 
Supported version: Postgres9.x - 18+ & Pgbouncer 1.8 - 1.25+ Part of Project Pigsty -- Battery Included PostgreSQL Distribution with ultimate observability support: https://pigsty.io/docs/pg_exporter vendor: "PGSTY" homepage: "https://pigsty.io/docs/pg_exporter" license: "Apache-2.0 License" rpm: compression: gzip prefixes: - /usr/bin contents: - src: pg_exporter dst: /usr/bin/pg_exporter file_info: mode: 0755 - src: pg_exporter.yml dst: /etc/pg_exporter.yml type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.default dst: /etc/default/pg_exporter type: config|noreplace file_info: mode: 0700 owner: prometheus group: prometheus - src: package/pg_exporter.service dst: /usr/lib/systemd/system/pg_exporter.service type: config - src: LICENSE dst: /usr/share/doc/pg_exporter/LICENSE file_info: mode: 0644 scripts: preinstall: package/preinstall.sh ================================================ FILE: package/pg_exporter.default ================================================ PG_EXPORTER_URL='postgres://:5432/?sslmode=disable' PG_EXPORTER_CONFIG=/etc/pg_exporter.yml PG_EXPORTER_LABEL="" PG_EXPORTER_TAG="" PG_EXPORTER_DISABLE_CACHE=false PG_EXPORTER_AUTO_DISCOVERY=true PG_EXPORTER_EXCLUDE_DATABASE="template0,template1,postgres" PG_EXPORTER_INCLUDE_DATABASE="" PG_EXPORTER_NAMESPACE="pg" PG_EXPORTER_FAIL_FAST=false PG_EXPORTER_CONNECT_TIMEOUT=100 PG_EXPORTER_TELEMETRY_PATH="/metrics" PG_EXPORTER_OPTS='--log.level=info' ================================================ FILE: package/pg_exporter.service ================================================ [Unit] Description=Prometheus exporter for PostgreSQL/Pgbouncer server metrics Documentation=https://pigsty.io/docs/pg_exporter After=network.target [Service] EnvironmentFile=-/etc/default/pg_exporter User=prometheus ExecStart=/usr/bin/pg_exporter $PG_EXPORTER_OPTS Restart=on-failure [Install] WantedBy=multi-user.target ================================================ FILE: 
package/preinstall.sh ================================================ #!/bin/bash # create a group & user named prometheus if not exists getent group prometheus >/dev/null || groupadd -r prometheus ; /bin/true getent passwd prometheus >/dev/null || useradd -r -g prometheus -s /sbin/nologin -c "Prometheus services" prometheus exit 0 ================================================ FILE: pg_exporter.yml ================================================ #==============================================================# # Desc : pg_exporter metrics collector definition # Ver : PostgreSQL 10 ~ 18+ and pgbouncer 1.9~1.25+ # Ctime : 2019-12-09 # Mtime : 2026-03-21 # Homepage : https://pigsty.io # Author : Ruohang Feng (rh@vonng.com) # License : Apache-2.0 @ https://github.com/pgsty/pg_exporter # Copyright : 2018-2026 Ruohang Feng / Vonng (rh@vonng.com) #==============================================================# #==============================================================# # 1. Config File #==============================================================# # The configuration file for pg_exporter is a YAML file. # Default configurations are retrieved via following precedence: # 1. command line args: --config= # 2. environment variables: PG_EXPORTER_CONFIG= # 3. pg_exporter.yml (Current directory) # 4. /etc/pg_exporter.yml (config file) # 5. /etc/pg_exporter (config dir) #==============================================================# # 2. Config Format #==============================================================# # pg_exporter config could be a single YAML file, or a directory containing a series of separated YAML files. # Each YAML config file consists of one or more metrics Collector definition, which are top-level objects. # If a directory is provided, all YAML in that directory will be merged in alphabetic order. # Collector definition examples are shown below. #==============================================================# # 3. 
Collector Example
#==============================================================#
#
# Here is an example of a metrics collector definition
#
# pg_primary_only:       # Collector branch name. Must be UNIQUE among the entire configuration
#   name: pg             # Collector namespace, used as METRIC PREFIX, set to branch name by default, can be overridden
#                        # the same namespace may contain multiple collector branches. It's the user's responsibility
#                        # to make sure that AT MOST ONE collector is picked for each namespace.
#
#   desc: PostgreSQL basic information (on primary)   # Collector description
#   query: |             # Metrics Query SQL
#
#     SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp,
#            pg_current_wal_lsn() - '0/0' AS lsn,
#            pg_current_wal_insert_lsn() - '0/0' AS insert_lsn,
#            pg_current_wal_lsn() - '0/0' AS write_lsn,
#            pg_current_wal_flush_lsn() - '0/0' AS flush_lsn,
#            extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime,
#            extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time,
#            pg_is_in_backup() AS is_in_backup,
#            extract(EPOCH FROM now() - pg_backup_start_time()) AS backup_time;
#
#   # [OPTIONAL] metadata fields, control collector behavior
#   ttl: 10              # Cache TTL: in seconds, how long will pg_exporter cache this collector's query result.
#   timeout: 0.1         # Query Timeout: in seconds, queries that exceed this limit will be canceled.
#   min_version: 100000  # minimal supported version, boundary IS included. In server version number format.
#   max_version: 130000  # maximal supported version, boundary NOT included. In server version number format.
#   fatal: false         # If a collector marked `fatal` fails, the entire scrape will abort immediately and be marked as failed
#   skip: false          # Collector marked `skip` will not be installed during the planning procedure
#
#   tags: [cluster, primary] # Collector tags, used for planning and scheduling
#
#   # tags are a list of strings, which could be:
#   # * `cluster` marks this query as cluster level, so it will only execute once for the same PostgreSQL Server
#   # * `primary` or `master` mark this query can only run on a primary instance (WILL NOT execute if pg_is_in_recovery())
#   # * `standby` or `replica` mark this query can only run on a replica instance (WILL execute if pg_is_in_recovery())
#   # some special tag prefixes have special interpretation:
#   # * `dbname:<dbname>` means this query will ONLY be executed on database with name `<dbname>`
#   # * `username:<user>` means this query will only be executed when connect with user `<user>`
#   # * `extension:<extname>` means this query will only be executed when extension `<extname>` is installed
#   # * `schema:<schema>` means this query will only be executed when schema `<schema>` exist
#   # * `not:<negtag>` means this query WILL NOT be executed when exporter is tagged with `<negtag>`
#   # * `<tag>` means this query WILL be executed when exporter is tagged with `<tag>`
#   #   (`<tag>` could not be cluster,primary,standby,master,replica,etc...)
#
#   # One or more "predicate queries" may be defined for a metric query. These
#   # are run before the main metric query (after any cache hit check). If all
#   # of them, when run sequentially, return a single row with a single column
#   # boolean true result, the main metric query is executed. If any of them
#   # return false or return zero rows, the main query is skipped. If any
#   # predicate query returns more than one row, a non-boolean result, or fails
#   # with an error, the whole query is marked failed.
Predicate queries can be # # used to check for the presence of specific functions, tables, extensions, # # settings, and vendor-specific pg features before running the main query. # # predicate_queries: # - name: predicate query name # predicate_query: | # SELECT EXISTS (SELECT 1 FROM information_schema.routines WHERE routine_schema = 'pg_catalog' AND routine_name = 'pg_backup_start_time'); # # metrics: # List of returned columns, each column must have a `name` and `usage`, `rename` and `description` are optional # - timestamp: # Column name, should be exactly the same as returned column name # usage: GAUGE # Metric type, `usage` could be # * DISCARD: completely ignoring this field # * LABEL: use columnName=columnValue as a label in metric # * GAUGE: Mark column as a gauge metric, full name will be `_` # * COUNTER: Same as above, except it is a counter rather than a gauge. # rename: ts # [OPTIONAL] Alias, optional, the alias will be used instead of the column name # description: xxxx # [OPTIONAL] Description of the column, will be used as a metric description # default: 0 # [OPTIONAL] Default value, will be used when column is NULL # scale: 1000 # [OPTIONAL] Scale the value by this factor # - lsn: # usage: COUNTER # description: log sequence number, current write location (on primary) # - insert_lsn: # usage: COUNTER # description: primary only, location of current wal inserting # - write_lsn: # usage: COUNTER # description: primary only, location of current wal writing # - flush_lsn: # usage: COUNTER # description: primary only, location of current wal syncing # - uptime: # usage: GAUGE # description: seconds since postmaster start # - conf_reload_time: # usage: GAUGE # description: seconds since last configuration reload # - is_in_backup: # usage: GAUGE # description: 1 if backup is in progress # - backup_time: # usage: GAUGE # description: seconds since the current backup start. null if don`t have one # # .... 
# you can also use rename & scale to customize the metric name and value: # - checkpoint_write_time: # rename: write_time # usage: COUNTER # scale: 1e-3 # description: Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds #==============================================================# # 4. Collector Presets #==============================================================# # pg_exporter is shipped with a series of preset collectors (already numbered and ordered by filename) # # 1xx Basic metrics: basic info, metadata, settings # 2xx Replication metrics: replication, walreceiver, downstream, sync standby, slots, subscription # 3xx Persist metrics: size, wal, background writer, checkpointer, ssl, checkpoint, recovery, slru cache, shmem usage # 4xx Activity metrics: backend count group by state, wait event, locks, xacts, queries # 5xx Progress metrics: clustering, vacuuming, indexing, basebackup, copy # 6xx Database metrics: pg_database, publication, subscription # 7xx Object metrics: pg_class, table, index, function, sequence, default partition # 8xx Optional metrics: optional metrics collector (disable by default, slow queries) # 9xx Pgbouncer metrics: metrics from pgbouncer admin database `pgbouncer` # # 100-599 Metrics for entire database cluster (scrape once) # 600-899 Metrics for single database instance (scrape for each database ,except for pg_db itself) #==============================================================# # 5. Cache TTL #==============================================================# # Cache can be used for reducing query overhead, it can be enabled by setting a non-zero value for `ttl` # It is highly recommended to use cache to avoid duplicate scrapes. Especially when you got multiple Prometheus # scraping the same instance with slow monitoring queries. 
Setting `ttl` to zero or leaving blank will disable
# result caching, which is the default behavior
#
# TTL has to be smaller than your scrape interval. 15s scrape interval and 10s TTL is a good start for
# production environments. Some expensive monitoring queries (such as size/bloat check) will have a longer `ttl`,
# which can also be used as a mechanism to achieve `different scrape frequency`

#==============================================================#
# 6. Query Timeout
#==============================================================#
# Collectors can be configured with an optional Timeout. If the collector's query takes longer than that
# timeout, it will be canceled immediately. Setting the `timeout` to 0 or leaving blank will reset it to
# the default timeout 0.1 (100ms). Setting it to any negative number will disable the query timeout feature.
# All queries have a default timeout of 100ms; if exceeded, the query will be canceled immediately to avoid
# avalanche. You can explicitly overwrite that option, but beware: in some extreme cases, if all your
# timeouts sum up to more than your scrape/cache interval (usually 15s), the queries may still be jammed.
# Or, you can just disable potential slow queries.

#==============================================================#
# 7. Version Compatibility
#==============================================================#
# Each collector has two optional version compatibility parameters: `min_version` and `max_version`.
# These two parameters specify the version compatibility of the collector. If target postgres/pgbouncer's
# version is less than `min_version`, or higher than `max_version`, the collector will not be installed.
# These two parameters are using PostgreSQL server version number format, which is a 6-digit integer
# format as <major><minor><release>, with two digits for each part.
# For example, 090600 stands for 9.6, and 120100 stands for 12.1
# And beware that the version compatibility range is left-inclusive, right-exclusive: [min, max); setting to zero or
# leaving blank will be treated as -inf or +inf

#==============================================================#
# 8. Fatality
#==============================================================#
# If a collector marked with `fatal` fails, the entire scrape operation will be marked as failed and key metrics
# `pg_up` / `pgbouncer_up` will be reset to 0. It is always a good practice to set up AT LEAST ONE fatal
# collector for pg_exporter. `pg.pg_primary_only` and `pgbouncer_list` are the default fatal collectors.
#
# If a collector without the `fatal` flag fails, it will increase global fail counters. But the scrape operation
# will carry on. The entire scrape result will not be marked as failed, thus will not affect the `_up` metric.

#==============================================================#
# 9. Skip
#==============================================================#
# A collector with the `skip` flag set to true will NOT be installed.
# This could be a handy option to disable collectors

#==============================================================#
# 10. Tags and Planning
#==============================================================#
# Tags are designed for collector planning & scheduling. It can be handy to customize which queries run
# on which instances. And thus you can use one single monolith config for multiple environments
#
# Tags are a list of strings, each string could be:
# Pre-defined special tags
#  * `cluster` marks this collector as cluster level, so it will ONLY BE EXECUTED ONCE for the same PostgreSQL Server
#  * `primary` or `master` mark this collector as primary-only, so it WILL NOT work if pg_is_in_recovery()
#  * `standby` or `replica` mark this collector as replica-only, so it WILL ONLY work if pg_is_in_recovery()
# Special tag prefixes which have different interpretation:
#  * `dbname:<dbname>` means this collector will ONLY work on database with name `<dbname>`
#  * `username:<user>` means this collector will ONLY work when connect with user `<user>`
#  * `extension:<extname>` means this collector will ONLY work when extension `<extname>` is installed
#  * `schema:<schema>` means this collector will only work when schema `<schema>` exists
# Customized positive tags (filter) and negative tags (taint)
#  * `not:<negtag>` means this collector WILL NOT work when exporter is tagged with `<negtag>`
#  * `<tag>` means this query WILL work if exporter is tagged with `<tag>` (special tags not included)
#
# pg_exporter will trigger the Planning procedure after connecting to the target. It will gather database facts
# and match them with tags and other metadata (such as supported version range). A collector will
# be installed if and only if it is compatible with the target server.
#==============================================================# # 0110 pg #==============================================================# pg_primary_only: name: pg desc: PostgreSQL basic information (on primary) query: |- SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp, extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime, extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time, pg_current_wal_lsn() - '0/0' AS lsn, pg_current_wal_insert_lsn() - '0/0' AS insert_lsn, pg_current_wal_lsn() - '0/0' AS write_lsn, pg_current_wal_flush_lsn() - '0/0' AS flush_lsn, NULL::BIGINT AS receive_lsn, NULL::BIGINT AS replay_lsn, extract(EPOCH FROM pg_conf_load_time()) AS reload_time, extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time, NULL::FLOAT AS last_replay_time, 0::FLOAT AS lag, pg_is_in_recovery() AS is_in_recovery, FALSE AS is_wal_replay_paused; tags: [ cluster, primary ] ttl: 1 min_version: 100000 fatal: true skip: false metrics: - timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" } - uptime: { usage: GAUGE ,description: "seconds since postmaster start" } - boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" } - lsn: { usage: COUNTER ,description: "log sequence number, current write location" } - insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" } - write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" } - flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" } - receive_lsn: { usage: COUNTER ,description: "replica only, location of wal synced to disk" } - replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" } - reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" } - conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" } - last_replay_time: { usage: GAUGE 
,description: "time when last transaction been replayed" } - lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" } - is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" } - is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" } pg_replica_only: name: pg desc: PostgreSQL basic information (on replica) query: |- SELECT extract(EPOCH FROM CURRENT_TIMESTAMP) AS timestamp, extract(EPOCH FROM now() - pg_postmaster_start_time()) AS uptime, extract(EPOCH FROM pg_postmaster_start_time()) AS boot_time, pg_last_wal_replay_lsn() - '0/0' AS lsn, NULL::BIGINT AS insert_lsn, NULL::BIGINT AS write_lsn, NULL::BIGINT AS flush_lsn, pg_last_wal_receive_lsn() - '0/0' AS receive_lsn, pg_last_wal_replay_lsn() - '0/0' AS replay_lsn, extract(EPOCH FROM pg_conf_load_time()) AS reload_time, extract(EPOCH FROM now() - pg_conf_load_time()) AS conf_reload_time, extract(EPOCH FROM pg_last_xact_replay_timestamp()) AS last_replay_time, CASE WHEN pg_last_wal_receive_lsn() = pg_last_wal_replay_lsn() THEN 0 ELSE EXTRACT(EPOCH FROM now() - pg_last_xact_replay_timestamp()) END AS lag, pg_is_in_recovery() AS is_in_recovery, pg_is_wal_replay_paused() AS is_wal_replay_paused; tags: [ cluster, replica ] ttl: 1 min_version: 100000 fatal: true skip: false metrics: - timestamp: { usage: GAUGE ,description: "current database timestamp in unix epoch" } - uptime: { usage: GAUGE ,description: "seconds since postmaster start" } - boot_time: { usage: GAUGE ,description: "postmaster boot timestamp in unix epoch" } - lsn: { usage: COUNTER ,description: "log sequence number, current write location" } - insert_lsn: { usage: COUNTER ,description: "primary only, location of current wal inserting" } - write_lsn: { usage: COUNTER ,description: "primary only, location of current wal writing" } - flush_lsn: { usage: COUNTER ,description: "primary only, location of current wal syncing" } - receive_lsn: { usage: COUNTER ,description: "replica only, location of 
wal synced to disk" } - replay_lsn: { usage: COUNTER ,description: "replica only, location of wal applied" } - reload_time: { usage: GAUGE ,description: "time when configuration was last reloaded" } - conf_reload_time: { usage: GAUGE ,description: "seconds since last configuration reload" } - last_replay_time: { usage: GAUGE ,description: "time when last transaction been replayed" } - lag: { usage: GAUGE ,description: "replica only, replication lag in seconds" } - is_in_recovery: { usage: GAUGE ,description: "1 if in recovery mode" } - is_wal_replay_paused: { usage: GAUGE ,description: "1 if wal play is paused" } #==============================================================# # 0120 pg_meta #==============================================================# pg_meta_13: name: pg_meta desc: PostgreSQL meta info for pg 13+, with extra primary conninfo query: | SELECT (SELECT system_identifier FROM pg_control_system()) AS cluster_id, current_setting('cluster_name') AS cluster_name, current_setting('port') AS listen_port, current_setting('data_directory', true) AS data_dir, current_setting('config_file', true) AS conf_path, current_setting('hba_file', true) AS hba_path, current_setting('wal_level') AS wal_level, current_setting('server_encoding') AS encoding, current_setting('server_version') AS version, current_setting('server_version_num') AS ver_num, version() AS ver_str, current_setting('shared_preload_libraries', true) AS extensions, current_setting('primary_conninfo', true) AS primary_conninfo, 1 AS info ttl: 10 min_version: 130000 tags: [ cluster ] metrics: - cluster_id: { usage: LABEL ,description: "cluster system identifier" } - cluster_name: { usage: LABEL ,description: "cluster name" } - listen_port: { usage: LABEL ,description: "listen port" } - data_dir: { usage: LABEL ,description: "path to data directory" } - conf_path: { usage: LABEL ,description: "path to postgresql.conf" } - hba_path: { usage: LABEL ,description: "path to pg_hba.conf" } - wal_level: { 
usage: LABEL ,description: "wal level" } - encoding: { usage: LABEL ,description: "server encoding" } - version: { usage: LABEL ,description: "server version in human-readable format" } - ver_num: { usage: LABEL ,description: "server version number in machine-readable format" } - ver_str: { usage: LABEL ,description: "complete version string" } - extensions: { usage: LABEL ,description: "server installed preload libraries" } - primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" } - info: { usage: GAUGE ,description: "constant 1" } pg_meta_10: name: pg_meta desc: PostgreSQL meta info query: | SELECT (SELECT system_identifier FROM pg_control_system()) AS cluster_id, current_setting('cluster_name') AS cluster_name, current_setting('port') AS listen_port, current_setting('data_directory', true) AS data_dir, current_setting('config_file', true) AS conf_path, current_setting('hba_file', true) AS hba_path, current_setting('wal_level') AS wal_level, current_setting('server_encoding') AS encoding, current_setting('server_version') AS version, current_setting('server_version_num') AS ver_num, version() AS ver_str, current_setting('shared_preload_libraries', true) AS extensions, 'N/A' AS primary_conninfo, 1 AS info ttl: 10 min_version: 090600 max_version: 130000 tags: [ cluster ] metrics: - cluster_id: { usage: LABEL ,description: "cluster system identifier" } - cluster_name: { usage: LABEL ,description: "cluster name" } - listen_port: { usage: LABEL ,description: "listen port" } - data_dir: { usage: LABEL ,description: "path to data directory" } - conf_path: { usage: LABEL ,description: "path to postgresql.conf" } - hba_path: { usage: LABEL ,description: "path to pg_hba.conf" } - wal_level: { usage: LABEL ,description: "wal level" } - encoding: { usage: LABEL ,description: "server encoding" } - version: { usage: LABEL ,description: "server version in human-readable format" } - ver_num: { usage: LABEL ,description: "server 
version number in machine-readable format" } - ver_str: { usage: LABEL ,description: "complete version string" } - extensions: { usage: LABEL ,description: "server installed preload libraries" } - primary_conninfo: { usage: LABEL ,description: "connection string to upstream (do not set password here)" } - info: { usage: GAUGE ,description: "constant 1" } #==============================================================# # 0130 pg_setting #==============================================================# # Key PostgreSQL configuration parameters # All parameters use current_setting(name, missing_ok) for version safety # Parameters introduced after PG10 use missing_ok=true to return NULL on older versions pg_setting: name: pg_setting desc: PostgreSQL shared configuration parameters (shared across all databases) query: | SELECT current_setting('max_connections')::int AS max_connections, current_setting('max_prepared_transactions')::int AS max_prepared_transactions, current_setting('max_locks_per_transaction')::int AS max_locks_per_transaction, current_setting('max_worker_processes')::int AS max_worker_processes, current_setting('max_parallel_workers')::int AS max_parallel_workers, current_setting('max_parallel_workers_per_gather')::int AS max_parallel_workers_per_gather, current_setting('max_parallel_maintenance_workers', true)::int AS max_parallel_maintenance_workers, current_setting('max_replication_slots')::int AS max_replication_slots, current_setting('max_wal_senders')::int AS max_wal_senders, current_setting('block_size')::int AS block_size, current_setting('wal_block_size')::int AS wal_block_size, pg_size_bytes(current_setting('segment_size')) AS segment_size, pg_size_bytes(current_setting('wal_segment_size')) AS wal_segment_size, CASE current_setting('data_checksums') WHEN 'on' THEN 1 ELSE 0 END AS data_checksums, CASE current_setting('wal_log_hints') WHEN 'on' THEN 1 ELSE 0 END AS wal_log_hints, CASE current_setting('fsync') WHEN 'on' THEN 1 ELSE 0 END AS fsync, 
CASE current_setting('full_page_writes') WHEN 'on' THEN 1 ELSE 0 END AS full_page_writes, CASE current_setting('wal_level') WHEN 'logical' THEN 3 WHEN 'replica' THEN 2 WHEN 'minimal' THEN 1 ELSE 0 END AS wal_level, pg_size_bytes(current_setting('min_wal_size')) AS min_wal_size, pg_size_bytes(current_setting('max_wal_size')) AS max_wal_size, pg_size_bytes(current_setting('max_slot_wal_keep_size', true)) AS max_slot_wal_keep_size, pg_size_bytes(current_setting('shared_buffers')) AS shared_buffers, pg_size_bytes(current_setting('work_mem')) AS work_mem, pg_size_bytes(current_setting('maintenance_work_mem')) AS maintenance_work_mem, pg_size_bytes(current_setting('effective_cache_size')) AS effective_cache_size, pg_size_bytes(current_setting('shared_memory_size', true)) AS shared_memory_size, CASE current_setting('huge_pages_status', true) WHEN 'on' THEN 1 WHEN 'off' THEN 0 WHEN 'unknown' THEN -1 ELSE NULL END AS hugepage_status, current_setting('shared_memory_size_in_huge_pages', true)::int AS hugepage_count, CASE current_setting('archive_mode') WHEN 'off' THEN 0 WHEN 'on' THEN 1 WHEN 'always' THEN 2 ELSE -1 END AS archive_mode, CASE current_setting('autovacuum') WHEN 'on' THEN 1 ELSE 0 END AS autovacuum, current_setting('autovacuum_max_workers')::int AS autovacuum_max_workers, extract(epoch from current_setting('checkpoint_timeout')::interval)::int AS checkpoint_timeout, current_setting('checkpoint_completion_target')::float AS checkpoint_completion_target, CASE current_setting('hot_standby') WHEN 'on' THEN 1 ELSE 0 END AS hot_standby, CASE current_setting('synchronous_commit') WHEN 'off' THEN 0 WHEN 'local' THEN 1 WHEN 'remote_write' THEN 2 WHEN 'on' THEN 3 WHEN 'remote_apply' THEN 4 ELSE -1 END AS synchronous_commit, CASE current_setting('io_method', true) WHEN 'sync' THEN 0 WHEN 'worker' THEN 1 WHEN 'io_uring' THEN 2 ELSE NULL END AS io_method; ttl: 10 min_version: 100000 tags: [ cluster ] metrics: - max_connections: { usage: GAUGE ,description: "maximum number of 
concurrent connections to the database server" } - max_prepared_transactions: { usage: GAUGE ,description: "maximum number of transactions that can be in the prepared state simultaneously" } - max_locks_per_transaction: { usage: GAUGE ,description: "maximum number of locks per transaction" } - max_worker_processes: { usage: GAUGE ,description: "maximum number of background processes" } - max_parallel_workers: { usage: GAUGE ,description: "maximum number of parallel workers that can be active at one time" } - max_parallel_workers_per_gather: { usage: GAUGE ,description: "maximum number of parallel workers per Gather node" } - max_parallel_maintenance_workers: { usage: GAUGE ,description: "maximum number of parallel maintenance workers (PG11+, NULL on older)" } - max_replication_slots: { usage: GAUGE ,description: "maximum number of replication slots" } - max_wal_senders: { usage: GAUGE ,description: "maximum number of concurrent WAL sender connections" } - block_size: { usage: GAUGE ,description: "database block size in bytes (default 8192)" } - wal_block_size: { usage: GAUGE ,description: "WAL block size in bytes" } - segment_size: { usage: GAUGE ,description: "database file segment size in bytes" } - wal_segment_size: { usage: GAUGE ,description: "WAL segment size in bytes" } - data_checksums: { usage: GAUGE ,description: "data checksums enabled, 1=on 0=off" } - wal_log_hints: { usage: GAUGE ,description: "WAL log hints enabled, 1=on 0=off" } - fsync: { usage: GAUGE ,description: "fsync enabled (CRITICAL for data safety), 1=on 0=off" } - full_page_writes: { usage: GAUGE ,description: "full page writes enabled, 1=on 0=off" } - wal_level: { usage: GAUGE ,description: "WAL level, 1=minimal 2=replica 3=logical" } - min_wal_size: { usage: GAUGE ,description: "minimum WAL size in bytes" } - max_wal_size: { usage: GAUGE ,description: "maximum WAL size in bytes" } - max_slot_wal_keep_size: { usage: GAUGE ,description: "maximum WAL size retained by replication slots in 
bytes (PG13+, NULL on older)" } - shared_buffers: { usage: GAUGE ,description: "shared buffer size in bytes" } - work_mem: { usage: GAUGE ,description: "work memory size in bytes" } - maintenance_work_mem: { usage: GAUGE ,description: "maintenance work memory size in bytes" } - effective_cache_size: { usage: GAUGE ,description: "planner's assumption about effective OS cache size in bytes" } - shared_memory_size: { usage: GAUGE ,description: "total shared memory size in bytes (PG13+, NULL on older)" } - hugepage_status: { usage: GAUGE ,description: "huge pages status, 1=on 0=off -1=unknown NULL=unavailable (PG14+)" } - hugepage_count: { usage: GAUGE ,description: "number of huge pages needed for shared memory (PG14+, NULL on older)" } - archive_mode: { usage: GAUGE ,description: "archive mode, 0=off 1=on 2=always" } - autovacuum: { usage: GAUGE ,description: "autovacuum enabled, 1=on 0=off" } - autovacuum_max_workers: { usage: GAUGE ,description: "maximum number of autovacuum worker processes" } - checkpoint_timeout: { usage: GAUGE ,description: "checkpoint timeout in seconds" } - checkpoint_completion_target: { usage: GAUGE ,description: "checkpoint completion target (0.0-1.0)" } - hot_standby: { usage: GAUGE ,description: "hot standby mode enabled, 1=on 0=off" } - synchronous_commit: { usage: GAUGE ,description: "synchronous commit level, 0=off 1=local 2=remote_write 3=on 4=remote_apply" } - io_method: { usage: GAUGE ,description: "I/O method (PG18+), 0=sync 1=worker 2=io_uring NULL=unavailable" } #==============================================================# # 0210 pg_repl #==============================================================# pg_repl_12: name: pg_repl desc: PostgreSQL replication stat metrics 12+ query: | SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port, CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE 
-1 END AS state, CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state, sync_priority, backend_xmin::TEXT::BIGINT AS backend_xmin, current.lsn - '0/0' AS lsn, current.lsn - sent_lsn AS sent_diff, current.lsn - write_lsn AS write_diff, current.lsn - flush_lsn AS flush_diff, current.lsn - replay_lsn AS replay_diff, sent_lsn - '0/0' AS sent_lsn, write_lsn - '0/0' AS write_lsn, flush_lsn - '0/0' AS flush_lsn, replay_lsn - '0/0' AS replay_lsn, coalesce(extract(EPOCH FROM write_lag), 0) AS write_lag, coalesce(extract(EPOCH FROM flush_lag), 0) AS flush_lag, coalesce(extract(EPOCH FROM replay_lag), 0) AS replay_lag, extract(EPOCH FROM current_timestamp) AS "time", extract(EPOCH FROM backend_start) AS launch_time, extract(EPOCH FROM reply_time) AS reply_time FROM pg_stat_replication, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END AS lsn) current; ttl: 10 min_version: 120000 tags: [ cluster ] metrics: - appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" } - usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" } - address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" } - pid: { usage: LABEL ,description: "Process ID of the WAL sender process" } - client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" } - state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for streaming|startup|catchup|backup|stopping" } - sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" } - sync_priority: { usage: GAUGE ,description: "Priority of this standby server for being chosen as the synchronous standby" } - backend_xmin: { 
usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback." } - lsn: { usage: COUNTER ,description: "Current log position on this server" } - sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" } - write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" } - flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" } - replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" } - sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" } - write_lsn: { usage: COUNTER ,description: "Last write-ahead log location written to disk by this standby server" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" } - replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" } - write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it" } - flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it" } - replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it" } - time: { usage: COUNTER ,description: "Current timestamp in unix epoch" } - launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client connected to this WAL sender" } - reply_time: { usage: GAUGE ,description: "Send time of last reply message received from standby server" } pg_repl_10: name: 
pg_repl desc: PostgreSQL replication stat metrics v10 v11 query: | SELECT application_name AS appname, usename, coalesce(client_addr::TEXT,'localhost') AS address, pid::TEXT, client_port, CASE state WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, CASE sync_state WHEN 'async' THEN 0 WHEN 'potential' THEN 1 WHEN 'sync' THEN 2 WHEN 'quorum' THEN 3 ELSE -1 END AS sync_state, sync_priority, backend_xmin::TEXT::BIGINT AS backend_xmin, current.lsn - '0/0' AS lsn, current.lsn - sent_lsn AS sent_diff, current.lsn - write_lsn AS write_diff, current.lsn - flush_lsn AS flush_diff, current.lsn - replay_lsn AS replay_diff, sent_lsn - '0/0' AS sent_lsn, write_lsn - '0/0' AS write_lsn, flush_lsn - '0/0' AS flush_lsn, replay_lsn - '0/0' AS replay_lsn, coalesce(extract(EPOCH FROM write_lag), 0) AS write_lag, coalesce(extract(EPOCH FROM flush_lag), 0) AS flush_lag, coalesce(extract(EPOCH FROM replay_lag), 0) AS replay_lag, extract(EPOCH FROM current_timestamp) AS "time", extract(EPOCH FROM backend_start) AS launch_time FROM pg_stat_replication, (SELECT CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END AS lsn) current; ttl: 10 min_version: 100000 max_version: 120000 tags: [ cluster ] metrics: - appname: { usage: LABEL ,description: "Name of the application that is connected to this WAL sender" } - usename: { usage: LABEL ,description: "Name of the user logged into this WAL sender process" } - address: { usage: LABEL ,description: "IP address of the client connected to this WAL sender, localhost for unix socket" } - pid: { usage: LABEL ,description: "Process ID of the WAL sender process" } - client_port: { usage: GAUGE ,description: "TCP port number that the client is using for communication with this WAL sender, or -1 if a Unix socket is used" } - state: { usage: GAUGE ,description: "Current WAL sender encoded state 0-4 for 
streaming|startup|catchup|backup|stopping" } - sync_state: { usage: GAUGE ,description: "Encoded synchronous state of this standby server, 0-3 for async|potential|sync|quorum" } - sync_priority: { usage: GAUGE ,description: "Priority of this standby server for being chosen as the synchronous standby" } - backend_xmin: { usage: COUNTER ,description: "This standby's xmin horizon reported by hot_standby_feedback." } - lsn: { usage: COUNTER ,description: "Current log position on this server" } - sent_diff: { usage: GAUGE ,description: "Last log position sent to this standby server diff with current lsn" } - write_diff: { usage: GAUGE ,description: "Last log position written to disk by this standby server diff with current lsn" } - flush_diff: { usage: GAUGE ,description: "Last log position flushed to disk by this standby server diff with current lsn" } - replay_diff: { usage: GAUGE ,description: "Last log position replayed into the database on this standby server diff with current lsn" } - sent_lsn: { usage: COUNTER ,description: "Last write-ahead log location sent on this connection" } - write_lsn: { usage: COUNTER ,description: "Last write-ahead log location written to disk by this standby server" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location flushed to disk by this standby server" } - replay_lsn: { usage: COUNTER ,description: "Last write-ahead log location replayed into the database on this standby server" } - write_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written it" } - flush_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written and flushed it" } - replay_lag: { usage: GAUGE ,description: "Time elapsed between flushing recent WAL locally and receiving notification that this standby server has written, flushed and applied it" } - time: { usage: 
COUNTER ,description: "Current timestamp in unix epoch" } - launch_time: { usage: COUNTER ,description: "Time when this process was started, i.e., when the client connected to this WAL sender" } #==============================================================# # 0220 pg_sync_standby #==============================================================# pg_sync_standby: name: pg_sync_standby desc: PostgreSQL synchronous standby status and names query: | SELECT CASE WHEN names <> '' THEN names ELSE '' END AS names, CASE WHEN names <> '' THEN 1 ELSE 0 END AS enabled FROM (SELECT current_setting('synchronous_standby_names') AS names) n; ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - names: { usage: LABEL ,description: "List of standby servers that can support synchronous replication, if not enabled" } - enabled: { usage: GAUGE ,description: "Synchronous commit enabled, 1 if enabled, 0 if disabled" } #==============================================================# # 0230 pg_downstream #==============================================================# pg_downstream: name: pg_downstream desc: PostgreSQL replication client count group by state query: | SELECT l.state, coalesce(count, 0 ) AS count FROM unnest(ARRAY ['streaming','startup','catchup', 'backup', 'stopping']) l(state) LEFT JOIN (SELECT state, count(*) AS count FROM pg_stat_replication GROUP BY state)r ON l.state = r.state; ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - state: { usage: LABEL ,description: "Replication client state, could be one of startup|catchup|streaming|backup|stopping" } - count: { usage: GAUGE ,description: "Count of corresponding state" } #==============================================================# # 0240 pg_slot #==============================================================# pg_slot_17: name: pg_slot desc: PostgreSQL replication slot metrics v17, slot also exists on standby query: |- SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid, 
active,temporary,two_phase,conflicting,failover,synced, xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes, safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status, spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time, extract(EPOCH FROM inactive_since) AS inactive_since, CASE invalidation_reason WHEN 'wal_removed' THEN 1 WHEN 'rows_removed' THEN 2 WHEN 'wal_level_insufficient' THEN 3 ELSE 0 END AS invalidation_reason FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name; ttl: 10 min_version: 170000 tags: [ cluster ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." } - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" } - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." 
} - two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." } - conflicting: { usage: GAUGE ,description: "True(1) if this logical slot conflicted with recovery. Always NULL for physical slots." } - failover: { usage: GAUGE ,description: "True(1) if this is a logical slot enabled to be synced to the standbys" } - synced: { usage: GAUGE ,description: "True(1) if this is a logical slot that was synced from a primary server" } - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." } - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." } - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." 
} - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" } - safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" } - wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" } - spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" } - spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" } - spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" } - stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" } - total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" } - total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } - invalidation_reason: { usage: GAUGE ,description: "ok=0, wal_removed=1, rows_removed=2, wal_level_insufficient=3" } - inactive_since: { usage: GAUGE ,description: "The time when the slot became inactive" } pg_slot_16: name: pg_slot desc: PostgreSQL replication slot metrics v16 with conflicting, now slot also exists on standby query: |- SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid, active,temporary,two_phase,conflicting,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN 
pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes, safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status, spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name; ttl: 10 min_version: 160000 max_version: 170000 tags: [ cluster ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." } - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" } - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." } - two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." } - conflicting: { usage: GAUGE ,description: "True if this logical slot conflicted with recovery. Always NULL for physical slots." } - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." 
} - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." } - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." } - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" } - safe_wal_size: { usage: GAUGE ,description: "bytes that can be written to WAL which will not make slot into lost" } - wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" } - spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" } - spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" } - spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" } - stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" } - total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" } - total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } pg_slot_14: name: pg_slot desc: PostgreSQL replication slot metrics v14 with pg_stat_replication_slots metrics query: |- SELECT s.slot_name, s.slot_type, plugin, database AS datname,datoid,active_pid, 
active,temporary,two_phase,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes, safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status, spill_txns,spill_count,spill_bytes,stream_txns,stream_count,stream_bytes,total_txns,total_bytes,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_replication_slots s LEFT OUTER JOIN pg_stat_replication_slots ss ON s.slot_name = ss.slot_name; ttl: 10 min_version: 140000 max_version: 160000 tags: [ cluster, primary ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." } - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" } - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." } - two_phase: { usage: GAUGE ,description: "True(1) if the slot is enabled for decoding prepared transactions. Always false for physical slots." } - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." 
} - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." } - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." } - retained_bytes: { usage: GAUGE ,description: "Number of bytes retained for this slot" } - safe_wal_size: { usage: GAUGE ,description: "Number of bytes that can be written to WAL before this slot becomes lost" } - wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" } - spill_txns: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding (subtrans included)" } - spill_count: { usage: COUNTER ,description: "Xacts that spilled to disk due to logical decode mem exceeding" } - spill_bytes: { usage: COUNTER ,description: "Bytes that spilled to disk due to logical decode mem exceeding" } - stream_txns: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_count: { usage: COUNTER ,description: "Xacts that streamed to decoding output plugin after mem exceed" } - stream_bytes: { usage: COUNTER ,description: "Bytes that streamed to decoding output plugin after mem exceed" } - total_txns: { usage: COUNTER ,description: "Number of decoded xacts sent to the decoding output plugin for this slot" } - total_bytes: { usage: COUNTER ,description: "Number of decoded bytes sent to the decoding output plugin for this slot" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } pg_slot_13: name: pg_slot desc: PostgreSQL replication slot metrics v13 (wal safe size and status) query: |- SELECT slot_name, slot_type, plugin, database AS datname,datoid,active_pid,
active,temporary,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes, safe_wal_size, CASE wal_status WHEN 'reserved' THEN 0 WHEN 'extended' THEN 1 WHEN 'unreserved' THEN 2 WHEN 'lost' THEN 3 ELSE -1 END AS wal_status FROM pg_replication_slots; ttl: 10 min_version: 130000 max_version: 140000 tags: [ cluster, primary ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive." } - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" } - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." } - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." } - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." 
} - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." } - retained_bytes: { usage: GAUGE ,description: "Number of bytes retained for this slot" } - safe_wal_size: { usage: GAUGE ,description: "Number of bytes that can be written to WAL before this slot becomes lost" } - wal_status: { usage: GAUGE ,description: "WAL reserve status 0-3 means reserved,extended,unreserved,lost, -1 means other" } pg_slot_10: name: pg_slot desc: PostgreSQL replication slot metrics 10 ~ 12 query: |- SELECT slot_name, slot_type, plugin, database AS datname,datoid,active_pid, active,temporary,xmin::TEXT::BIGINT AS xmin,catalog_xmin::TEXT::BIGINT AS catalog_xmin, restart_lsn - '0/0' AS restart_lsn, confirmed_flush_lsn - '0/0' AS confirm_lsn, CASE WHEN pg_is_in_recovery() THEN pg_last_wal_replay_lsn() ELSE pg_current_wal_lsn() END - restart_lsn AS retained_bytes FROM pg_replication_slots; ttl: 10 min_version: 100000 max_version: 130000 tags: [ cluster, primary ] metrics: - slot_name: { usage: LABEL ,description: "A unique, cluster-wide identifier for the replication slot" } - slot_type: { usage: LABEL ,description: "The slot type, physical or logical" } - plugin: { usage: LABEL ,description: "The base name of the shared object containing the output plugin this logical slot is using, or null for physical slots." } - datname: { usage: LABEL ,description: "The name of the database this slot is associated with, logical slots only, null for physical slot" } - datoid: { usage: GAUGE ,description: "The OID of the database this slot is associated with, logical slots only, null for physical slot" } - active_pid: { usage: GAUGE ,description: "The process ID of the session streaming data for this slot. NULL if inactive."
} - active: { usage: GAUGE ,description: "True(1) if this slot is currently actively being used" } - temporary: { usage: GAUGE ,description: "True(1) if this is a temporary replication slot." } - xmin: { usage: COUNTER ,description: "The oldest transaction that this slot needs the database to retain." } - catalog_xmin: { usage: COUNTER ,description: "The oldest transaction affecting the system catalogs that this slot needs the database to retain." } - restart_lsn: { usage: COUNTER ,description: "The address (LSN) of oldest WAL which still might be required by the consumer of this slot" } - confirm_lsn: { usage: COUNTER ,description: "The address (LSN) up to which the logical slot's consumer has confirmed receiving data." } - retained_bytes: { usage: GAUGE ,description: "Size of bytes that retained for this slot" } #==============================================================# # 0250 pg_recv #==============================================================# pg_recv_13: name: pg_recv desc: PostgreSQL walreceiver metrics 13+ query: |- SELECT coalesce(sender_host, (regexp_match(conninfo, '.*host=(\S+).*'))[1]) AS sender_host, coalesce(sender_port::TEXT, (regexp_match(conninfo, '.*port=(\S+).*'))[1]) AS sender_port, coalesce(slot_name, 'NULL') AS slot_name, pid, CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, receive_start_lsn - '0/0' AS init_lsn,receive_start_tli AS init_tli, flushed_lsn - '0/0' AS flush_lsn,written_lsn - '0/0' AS write_lsn, received_tli AS flush_tli, latest_end_lsn - '0/0' AS reported_lsn, last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time FROM pg_stat_wal_receiver; ttl: 10 min_version: 130000 tags: [ cluster, replica ] metrics: - sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" } - sender_port: { usage: LABEL ,description: 
"Port number of the PostgreSQL instance this WAL receiver is connected to." } - slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" } - pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" } - state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" } - init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" } - init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" } - write_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and written to disk, but not flushed." } - flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } - time: { usage: GAUGE ,description: "Time of current snapshot" } pg_recv_11: name: pg_recv desc: PostgreSQL walreceiver metrics (11-12) query: |- SELECT coalesce(sender_host, (regexp_match(conninfo, '.*host=(\S+).*'))[1]) AS sender_host, coalesce(sender_port::TEXT, (regexp_match(conninfo, '.*port=(\S+).*'))[1]) AS sender_port, coalesce(slot_name, 'NULL') AS slot_name, pid, CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, receive_start_lsn - '0/0' AS 
init_lsn,receive_start_tli AS init_tli, received_lsn - '0/0' AS flush_lsn, received_tli AS flush_tli, latest_end_lsn - '0/0' AS reported_lsn, last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time FROM pg_stat_wal_receiver; ttl: 10 tags: [ cluster, replica ] min_version: 110000 max_version: 130000 metrics: - sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" } - sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." } - slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" } - pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" } - state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" } - init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" } - init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" } - flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } - time: { usage: GAUGE ,description: "Time of current snapshot" } pg_recv_10: name: pg_recv desc: PostgreSQL walreceiver metrics (10) query: |- SELECT 
(regexp_match(conninfo, '.*host=(\S+).*'))[1] AS sender_host, (regexp_match(conninfo, '.*port=(\S+).*'))[1] AS sender_port, coalesce(slot_name, 'NULL') AS slot_name, pid, CASE status WHEN 'streaming' THEN 0 WHEN 'startup' THEN 1 WHEN 'catchup' THEN 2 WHEN 'backup' THEN 3 WHEN 'stopping' THEN 4 ELSE -1 END AS state, receive_start_lsn - '0/0' AS init_lsn,receive_start_tli AS init_tli, received_lsn - '0/0' AS flush_lsn, received_tli AS flush_tli, latest_end_lsn - '0/0' AS reported_lsn, last_msg_send_time AS msg_send_time,last_msg_receipt_time AS msg_recv_time,latest_end_time AS reported_time,now() AS time FROM pg_stat_wal_receiver; ttl: 10 tags: [ cluster, replica ] min_version: 100000 max_version: 110000 metrics: - sender_host: { usage: LABEL ,description: "Host of the PostgreSQL instance this WAL receiver is connected to" } - sender_port: { usage: LABEL ,description: "Port number of the PostgreSQL instance this WAL receiver is connected to." } - slot_name: { usage: LABEL ,description: "Replication slot name used by this WAL receiver" } - pid: { usage: GAUGE ,description: "Process ID of the WAL receiver process" } - state: { usage: GAUGE ,description: "Encoded activity status of the WAL receiver process 0-4 for streaming|startup|catchup|backup|stopping" } - init_lsn: { usage: COUNTER ,description: "First write-ahead log location used when WAL receiver is started" } - init_tli: { usage: COUNTER ,description: "First timeline number used when WAL receiver is started" } - flush_lsn: { usage: COUNTER ,description: "Last write-ahead log location already received and flushed to disk" } - flush_tli: { usage: COUNTER ,description: "Timeline number of last write-ahead log location received and flushed to disk" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: 
"Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } - time: { usage: GAUGE ,description: "Time of current snapshot" } #==============================================================# # 0260 pg_sub #==============================================================# pg_sub_16: name: pg_sub desc: PostgreSQL subscription statistics (16+) query: |- SELECT s1.subname, subid AS id, pid, received_lsn, reported_lsn, msg_send_time, msg_recv_time, reported_time, apply_error_count, sync_error_count FROM (SELECT subname, subid, pid, received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn, extract(epoch from last_msg_send_time) AS msg_send_time, extract(epoch from last_msg_receipt_time) AS msg_recv_time, extract(epoch from latest_end_time) AS reported_time FROM pg_stat_subscription WHERE relid IS NULL AND leader_pid IS NULL) s1 LEFT OUTER JOIN pg_stat_subscription_stats s2 USING(subid); ttl: 10 min_version: 160000 tags: [ cluster ] metrics: - subname: { usage: LABEL ,description: "Name of this subscription" } - id: { usage: GAUGE ,description: "OID of the subscription" } - pid: { usage: GAUGE ,description: "Process ID of the subscription leader apply worker" } - received_lsn: { usage: COUNTER ,description: "Last write-ahead log location received" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } - apply_error_count: { usage: COUNTER ,description: "Number of times an error occurred while applying changes" } - sync_error_count: { usage: 
COUNTER ,description: "Number of times an error occurred during the initial table synchronization" } pg_sub_15: name: pg_sub desc: PostgreSQL subscription statistics (15) query: |- SELECT s1.subname, subid AS id, pid, received_lsn, reported_lsn, msg_send_time, msg_recv_time, reported_time, apply_error_count, sync_error_count FROM (SELECT subname, subid, pid, received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn, extract(epoch from last_msg_send_time) AS msg_send_time, extract(epoch from last_msg_receipt_time) AS msg_recv_time, extract(epoch from latest_end_time) AS reported_time FROM pg_stat_subscription WHERE relid ISNULL) s1 LEFT OUTER JOIN pg_stat_subscription_stats s2 USING(subid); ttl: 10 min_version: 150000 max_version: 160000 tags: [ cluster ] metrics: - subname: { usage: LABEL ,description: "Name of this subscription" } - id: { usage: GAUGE ,description: "OID of the subscription" } - pid: { usage: GAUGE ,description: "Process ID of the subscription main apply worker process" } - received_lsn: { usage: COUNTER ,description: "Last write-ahead log location received" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } - apply_error_count: { usage: COUNTER ,description: "Number of times an error occurred while applying changes." 
} - sync_error_count: { usage: COUNTER ,description: "Number of times an error occurred during the initial table synchronization" } pg_sub_10: name: pg_sub desc: PostgreSQL subscription statistics (10-14) query: |- SELECT subname, subid AS id, pid, received_lsn - '0/0' AS received_lsn, latest_end_lsn - '0/0' AS reported_lsn, extract(epoch from last_msg_send_time) AS msg_send_time, extract(epoch from last_msg_receipt_time) AS msg_recv_time, extract(epoch from latest_end_time) AS reported_time FROM pg_stat_subscription WHERE relid ISNULL; ttl: 10 min_version: 100000 max_version: 150000 tags: [ cluster ] metrics: - subname: { usage: LABEL ,description: "Name of this subscription" } - id: { usage: GAUGE ,description: "OID of the subscription" } - pid: { usage: GAUGE ,description: "Process ID of the subscription main apply worker process" } - received_lsn: { usage: COUNTER ,description: "Last write-ahead log location received" } - reported_lsn: { usage: COUNTER ,description: "Last write-ahead log location reported to origin WAL sender" } - msg_send_time: { usage: GAUGE ,description: "Send time of last message received from origin WAL sender" } - msg_recv_time: { usage: GAUGE ,description: "Receipt time of last message received from origin WAL sender" } - reported_time: { usage: GAUGE ,description: "Time of last write-ahead log location reported to origin WAL sender" } #==============================================================# # 0270 pg_origin #==============================================================# # skip by default, require additional privilege setup # GRANT SELECT ON pg_replication_origin, pg_replication_origin_status TO pg_monitor; pg_origin: name: pg_origin desc: PostgreSQL replay state (approximate) for a certain origin query: SELECT roname, remote_lsn - '0/0' AS remote_lsn, local_lsn - '0/0' AS local_lsn FROM pg_replication_origin o LEFT JOIN pg_replication_origin_status os ON o.roident = os.local_id; ttl: 10 min_version: 090500 skip: true tags: [ 
cluster ] metrics: - roname: { usage: LABEL ,description: "The external, user defined, name of a replication origin." } - remote_lsn: { usage: COUNTER ,description: "The origin node's LSN up to which data has been replicated." } - local_lsn: { usage: COUNTER ,description: "This node's LSN at which remote_lsn has been replicated." } #==============================================================# # 0300 pg_io #==============================================================# pg_io_18: name: pg_io desc: PostgreSQL I/O stats since v18 query: |- SELECT backend_type AS "type",object,context,reads,read_bytes,read_time,writes,write_bytes,write_time,writebacks,writeback_time, extends,extend_bytes,extend_time,hits,evictions,reuses,fsyncs,fsync_time,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_io; ttl: 10 timeout: 1 min_version: 180000 tags: [ cluster ] metrics: - type: { usage: LABEL ,description: "Type of backend" } - object: { usage: LABEL ,description: "Target object of an I/O operation, relation or temp" } - context: { usage: LABEL ,description: "The context of an I/O operation. normal,vacuum,bulkread,bulkwrite" } - reads: { usage: COUNTER ,default: 0 ,description: "Number of read operations, each of the size specified in op_bytes." } - read_bytes: { usage: COUNTER ,default: 0 ,description: "Number of read bytes" } - read_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in read operations in seconds" } - writes: { usage: COUNTER ,default: 0 ,description: "Number of write operations, each of the size specified in op_bytes." } - write_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in write operations in seconds" } - write_bytes: { usage: COUNTER ,default: 0 ,description: "Number of write bytes" } - writebacks: { usage: COUNTER ,default: 0 ,description: "Number of units of size op_bytes which the process requested the kernel write out to permanent storage." 
} - writeback_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in writeback operations in seconds" } - extends: { usage: COUNTER ,default: 0 ,description: "Number of relation extend operations, each of the size specified in op_bytes." } - extend_bytes: { usage: COUNTER ,default: 0 ,description: "Number of extend bytes" } - extend_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in extend operations in seconds" } - hits: { usage: COUNTER ,default: 0 ,description: "The number of times a desired block was found in a shared buffer." } - evictions: { usage: COUNTER ,default: 0 ,description: "Number of times a block has been written out from a shared or local buffer" } - reuses: { usage: COUNTER ,default: 0 ,description: "The number of times an existing buffer is reused" } - fsyncs: { usage: COUNTER ,default: 0 ,description: "Number of fsync calls. These are only tracked in context normal" } - fsync_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in fsync operations in seconds" } - reset_time: { usage: GAUGE ,description: "Timestamp at which these statistics were last reset" } pg_io_16: name: pg_io desc: PostgreSQL I/O stats query: |- SELECT backend_type AS "type", object, context, reads, read_time,writes,write_time,writebacks,writeback_time,extends, extend_time,hits,evictions,reuses,fsyncs,fsync_time,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_io; ttl: 10 timeout: 1 min_version: 160000 max_version: 180000 tags: [ cluster ] metrics: - type: { usage: LABEL ,description: "Type of backend" } - object: { usage: LABEL ,description: "Target object of an I/O operation, relation or temp" } - context: { usage: LABEL ,description: "The context of an I/O operation. normal,vacuum,bulkread,bulkwrite" } - reads: { usage: COUNTER ,default: 0 ,description: "Number of read operations, each of the size specified in op_bytes." 
} - read_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in read operations in seconds" } - writes: { usage: COUNTER ,default: 0 ,description: "Number of write operations, each of the size specified in op_bytes." } - write_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in write operations in seconds" } - writebacks: { usage: COUNTER ,default: 0 ,description: "Number of units of size op_bytes which the process requested the kernel write out to permanent storage." } - writeback_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in writeback operations in seconds" } - extends: { usage: COUNTER ,default: 0 ,description: "Number of relation extend operations, each of the size specified in op_bytes." } - extend_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in extend operations in seconds" } - hits: { usage: COUNTER ,default: 0 ,description: "The number of times a desired block was found in a shared buffer." } - evictions: { usage: COUNTER ,default: 0 ,description: "Number of times a block has been written out from a shared or local buffer" } - reuses: { usage: COUNTER ,default: 0 ,description: "The number of times an existing buffer is reused" } - fsyncs: { usage: COUNTER ,default: 0 ,description: "Number of fsync calls. 
These are only tracked in context normal" } - fsync_time: { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Time spent in fsync operations in seconds" } - reset_time: { usage: GAUGE ,description: "Timestamp at which these statistics were last reset" } #==============================================================# # 0310 pg_size #==============================================================# pg_size: name: pg_size desc: PostgreSQL Database, WAL, Log size since v10 query: |- SELECT datname, pg_database_size(oid) AS bytes FROM pg_database UNION ALL SELECT 'log', CASE WHEN current_setting('logging_collector') = 'on' THEN COALESCE((SELECT SUM(size) FROM pg_catalog.pg_ls_logdir()), 0) ELSE 0 END UNION ALL SELECT 'wal', COALESCE((SELECT SUM(size) FROM pg_catalog.pg_ls_waldir()), 0); ttl: 60 timeout: 1 min_version: 100000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Database name, or special category wal, or log" } - bytes: { usage: GAUGE ,description: "File size in bytes" } #==============================================================# # 0320 pg_archiver #==============================================================# pg_archiver: name: pg_archiver desc: PostgreSQL archiver process statistics query: |- SELECT archived_count AS finish_count,failed_count, extract(epoch FROM last_archived_time) AS finish_time, extract(epoch FROM last_failed_time) AS failed_time, extract(epoch FROM stats_reset) AS reset_time FROM pg_stat_archiver; ttl: 60 min_version: 090400 tags: [ cluster ] metrics: - finish_count: { usage: COUNTER ,description: "Number of WAL files that have been successfully archived" } - failed_count: { usage: COUNTER ,description: "Number of failed attempts for archiving WAL files" } - finish_time: { usage: GAUGE ,description: "Time of the last successful archive operation" } - failed_time: { usage: GAUGE ,description: "Time of the last failed archival operation" } - reset_time: { usage: GAUGE ,description: "Time at which archive 
statistics were last reset" } #==============================================================# # 0330 pg_bgwriter #==============================================================# # https://pgpedia.info/p/pg_stat_bgwriter.html pg_bgwriter_17: name: pg_bgwriter desc: "PostgreSQL background writer metrics PG 17+" query: SELECT buffers_clean, maxwritten_clean, buffers_alloc, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 170000 tags: [ cluster ] metrics: - buffers_clean: { usage: COUNTER ,description: "Number of buffers written by the background writer" } - maxwritten_clean: { usage: COUNTER ,description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" } - buffers_alloc: { usage: COUNTER ,description: "Number of buffers allocated" } - reset_time: { usage: GAUGE ,description: "Time at which bgwriter statistics were last reset" } pg_bgwriter_10: name: pg_bgwriter desc: "PostgreSQL background writer metrics (PG 9.4-16)" query: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, buffers_clean, buffers_backend, maxwritten_clean, buffers_backend_fsync, buffers_alloc, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 090400 max_version: 170000 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - checkpoint_sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - 
buffers_checkpoint: { usage: COUNTER ,description: "Number of buffers written during checkpoints" } - buffers_clean: { usage: COUNTER ,description: "Number of buffers written by the background writer" } - buffers_backend: { usage: COUNTER ,description: "Number of buffers written directly by a backend" } - maxwritten_clean: { usage: COUNTER ,description: "Number of times the background writer stopped a cleaning scan because it had written too many buffers" } - buffers_backend_fsync: { usage: COUNTER ,description: "Number of times a backend had to execute its own fsync call" } - buffers_alloc: { usage: COUNTER ,description: "Number of buffers allocated" } - reset_time: { usage: GAUGE ,description: "Time at which bgwriter statistics were last reset" } #==============================================================# # 0331 pg_checkpointer #==============================================================# pg_checkpointer_18: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 18+" query: SELECT num_timed, num_requested, num_done, restartpoints_timed, restartpoints_req, restartpoints_done, write_time, sync_time, buffers_written, slru_written, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_checkpointer; ttl: 10 min_version: 180000 tags: [ cluster ] metrics: - num_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - num_requested: { usage: COUNTER ,rename: req ,description: "Number of requested checkpoints that have been performed" } - num_done: { usage: COUNTER ,rename: done ,description: "Number of checkpoints that have been performed" } - restartpoints_timed: { usage: COUNTER ,description: "Number of scheduled restartpoints due to timeout or after a failed attempt to perform it" } - restartpoints_req: { usage: COUNTER ,description: "Number of requested restartpoints" } - restartpoints_done: { usage: COUNTER ,description: "Number of restartpoints that have been performed" } - 
write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - buffers_written: { usage: COUNTER ,description: "Number of buffers written during checkpoints and restartpoints" } - slru_written: { usage: COUNTER ,description: "Number of SLRU buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } pg_checkpointer_17: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 17" query: SELECT num_timed, num_requested, restartpoints_timed, restartpoints_req, restartpoints_done, write_time, sync_time, buffers_written, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_checkpointer; ttl: 10 min_version: 170000 max_version: 180000 tags: [ cluster ] metrics: - num_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - num_requested: { usage: COUNTER ,rename: req ,description: "Number of requested checkpoints that have been performed" } - restartpoints_timed: { usage: COUNTER ,description: "Number of scheduled restartpoints due to timeout or after a failed attempt to perform it" } - restartpoints_req: { usage: COUNTER ,description: "Number of requested restartpoints" } - restartpoints_done: { usage: COUNTER ,description: "Number of restartpoints that have been performed" } - write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint 
processing where files are synchronized to disk, in seconds" } - buffers_written: { usage: COUNTER ,description: "Number of buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } pg_checkpointer_10: name: pg_checkpointer desc: "PostgreSQL checkpointer stat metrics for pg 9.4-16" query: SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, checkpoint_sync_time, buffers_checkpoint, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_bgwriter; ttl: 10 min_version: 090400 max_version: 170000 tags: [ cluster ] metrics: - checkpoints_timed: { usage: COUNTER ,rename: timed ,description: "Number of scheduled checkpoints that have been performed" } - checkpoints_req: { usage: COUNTER ,rename: req ,description: "Number of requested checkpoints that have been performed" } - checkpoint_write_time: { usage: COUNTER ,rename: write_time ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are written to disk, in seconds" } - checkpoint_sync_time: { usage: COUNTER ,rename: sync_time ,scale: 1e-3 ,description: "Total amount of time that has been spent in the portion of checkpoint processing where files are synchronized to disk, in seconds" } - buffers_checkpoint: { usage: COUNTER ,rename: buffers_written ,description: "Number of buffers written during checkpoints and restartpoints" } - reset_time: { usage: GAUGE ,description: "Time at which checkpointer statistics were last reset" } #==============================================================# # 0340 pg_ssl #==============================================================# pg_ssl: name: pg_ssl desc: PostgreSQL SSL client connection count query: | SELECT count(*) FILTER (WHERE ssl) AS enabled, count(*) FILTER ( WHERE NOT ssl) AS disabled FROM pg_stat_ssl; ttl: 10 min_version: 090500 tags: [ cluster ] metrics: - enabled: { usage: GAUGE 
,description: "Number of client connections that use ssl" } - disabled: { usage: GAUGE ,description: "Number of client connections that do not use ssl" } #==============================================================# # 0350 pg_checkpoint #==============================================================# pg_checkpoint: name: pg_checkpoint desc: checkpoint information from pg_control_checkpoint since 10 query: |- SELECT checkpoint_lsn - '0/0' AS checkpoint_lsn, redo_lsn - '0/0' AS redo_lsn, timeline_id AS tli, prev_timeline_id AS prev_tli, full_page_writes, split_part(next_xid, ':', 1) AS next_xid_epoch, split_part(next_xid, ':', 2) AS next_xid, next_oid::BIGINT, next_multixact_id::text::BIGINT, next_multi_offset::text::BIGINT, oldest_xid::text::BIGINT, oldest_xid_dbid::text::BIGINT, oldest_active_xid::text::BIGINT, oldest_multi_xid::text::BIGINT, oldest_multi_dbid::BIGINT, oldest_commit_ts_xid::text::BIGINT, newest_commit_ts_xid::text::BIGINT, checkpoint_time AS time, extract(epoch from now() - checkpoint_time) AS elapse FROM pg_control_checkpoint(); ttl: 60 min_version: 100000 tags: [ cluster ] metrics: - checkpoint_lsn: { usage: COUNTER ,description: "Latest checkpoint location" } - redo_lsn: { usage: COUNTER ,description: "Latest checkpoint's REDO location" } - tli: { usage: COUNTER ,description: "Latest checkpoint's TimeLineID" } - prev_tli: { usage: COUNTER ,description: "Latest checkpoint's PrevTimeLineID" } - full_page_writes: { usage: GAUGE ,description: "Latest checkpoint's full_page_writes enabled" } - next_xid_epoch: { usage: COUNTER ,description: "Latest checkpoint's NextXID epoch" } - next_xid: { usage: COUNTER ,description: "Latest checkpoint's NextXID xid" } - next_oid: { usage: COUNTER ,description: "Latest checkpoint's NextOID" } - next_multixact_id: { usage: COUNTER ,description: "Latest checkpoint's NextMultiXactId" } - next_multi_offset: { usage: COUNTER ,description: "Latest checkpoint's NextMultiOffset" } - oldest_xid: { usage: COUNTER
,description: "Latest checkpoint's oldestXID" } - oldest_xid_dbid: { usage: GAUGE ,description: "Latest checkpoint's oldestXID's DB OID" } - oldest_active_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestActiveXID" } - oldest_multi_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestMultiXid" } - oldest_multi_dbid: { usage: GAUGE ,description: "Latest checkpoint's oldestMulti's DB OID" } - oldest_commit_ts_xid: { usage: COUNTER ,description: "Latest checkpoint's oldestCommitTsXid" } - newest_commit_ts_xid: { usage: COUNTER ,description: "Latest checkpoint's newestCommitTsXid" } - time: { usage: COUNTER ,description: "Time of latest checkpoint" } - elapse: { usage: GAUGE ,description: "Seconds elapsed since latest checkpoint in seconds" } #==============================================================# # 0355 pg_timeline #==============================================================# pg_timeline: name: pg_timeline desc: Current timeline ID from primary or replica query: | SELECT COALESCE( (SELECT received_tli FROM pg_stat_wal_receiver), (SELECT timeline_id FROM pg_control_checkpoint()) ) AS id; ttl: 10 min_version: 100000 tags: [ cluster ] metrics: - id: { usage: GAUGE ,description: "Current timeline ID" } #==============================================================# # 0360 pg_recovery #==============================================================# pg_recovery: name: pg_recovery desc: PostgreSQL control recovery metrics (9.6+) query: | SELECT min_recovery_end_timeline AS min_timeline, min_recovery_end_lsn - '0/0' AS min_lsn, backup_start_lsn - '0/0' AS backup_start_lsn, backup_end_lsn - '0/0' AS backup_end_lsn, end_of_backup_record_required AS require_record FROM pg_control_recovery(); ttl: 10 min_version: 090600 tags: [ cluster, replica ] metrics: - min_timeline: { usage: COUNTER ,description: "Min recovery ending loc's timeline" } - min_lsn: { usage: COUNTER ,description: "Minimum recovery ending location" } - backup_start_lsn: { 
usage: COUNTER ,description: "Backup start location" } - backup_end_lsn: { usage: COUNTER ,description: "Backup end location" } - require_record: { usage: GAUGE ,description: "End-of-backup record required" } pg_recovery_prefetch: name: pg_recovery_prefetch desc: PostgreSQL recovery prefetch metrics (15+) query: SELECT prefetch,hit,skip_init,skip_new,skip_fpw,skip_rep,wal_distance,block_distance,io_depth,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_recovery_prefetch; ttl: 10 min_version: 150000 tags: [ cluster, replica ] metrics: - prefetch: { usage: COUNTER ,description: "Number of blocks prefetched because they were not in the buffer pool" } - hit: { usage: COUNTER ,description: "Number of blocks not prefetched because they were already in the buffer pool" } - skip_init: { usage: COUNTER ,description: "Number of blocks not prefetched because they would be zero-initialized" } - skip_new: { usage: COUNTER ,description: "Number of blocks not prefetched because they didn't exist yet" } - skip_fpw: { usage: COUNTER ,description: "Number of blocks not prefetched because a full page image was included in the WAL" } - skip_rep: { usage: COUNTER ,description: "Number of blocks not prefetched because they were already recently prefetched" } - wal_distance: { usage: GAUGE ,description: "How many bytes ahead the prefetcher is looking" } - block_distance: { usage: GAUGE ,description: "How many blocks ahead the prefetcher is looking" } - io_depth: { usage: GAUGE ,description: "How many prefetches have been initiated but are not yet known to have completed" } - reset_time: { usage: GAUGE ,description: "Time at which these recovery prefetch statistics were last reset" } #==============================================================# # 0370 pg_slru #==============================================================# pg_slru_13: name: pg_slru desc: PostgreSQL simple-least-recently-used (SLRU) cache statistics v13 query: SELECT name, blks_zeroed, blks_hit, blks_read, 
blks_written, blks_exists, flushes, truncates, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_slru; ttl: 60 min_version: 130000 tags: [ cluster ] metrics: - name: { usage: LABEL ,description: "Name of the SLRU" } - blks_zeroed: { usage: COUNTER ,description: "Number of blocks zeroed during initializations" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the SLRU, so that a read was not necessary" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read for this SLRU" } - blks_written: { usage: COUNTER ,description: "Number of disk blocks written for this SLRU" } - blks_exists: { usage: COUNTER ,description: "Number of blocks checked for existence for this SLRU" } - flushes: { usage: COUNTER ,description: "Number of flushes of dirty data for this SLRU" } - truncates: { usage: COUNTER ,description: "Number of truncates for this SLRU" } - reset_time: { usage: GAUGE ,description: "Time at which these statistics were last reset" } #==============================================================# # 0380 pg_shmem #==============================================================# # pg_shmem require su privilege to work. 
Disable it or create auxiliary function with su before use: # CREATE OR REPLACE FUNCTION monitor.pg_shmem() RETURNS SETOF pg_shmem_allocations AS $$ SELECT * FROM pg_shmem_allocations;$$ LANGUAGE SQL SECURITY DEFINER; pg_shmem: name: pg_shmem desc: Allocations made from the server's main shared memory segment query: SELECT coalesce(name, 'Free') AS name, off AS offset, size, allocated_size FROM monitor.pg_shmem(); ttl: 60 min_version: 130000 skip: true # disable it by default tags: [cluster, "schema:monitor" ] metrics: - name: { usage: LABEL ,description: "Name of the shared memory allocation" } - offset: { usage: GAUGE ,description: "The offset at which the allocation starts" } - size: { usage: GAUGE ,description: "Size of the allocation" } - allocated_size: { usage: GAUGE ,description: "Size of the allocation including padding" } #==============================================================# # 0390 pg_wal #==============================================================# pg_wal_18: name: pg_wal desc: PostgreSQL WAL statistics since v18 with some col removed query: SELECT wal_records AS records, wal_fpi AS fpi, wal_bytes AS bytes, wal_buffers_full AS buffers_full,extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_wal; ttl: 10 tags: [ cluster ] min_version: 180000 metrics: - records: { usage: COUNTER ,description: "Total number of WAL records generated" } - fpi: { usage: COUNTER ,description: "Total number of WAL full page images generated" } - bytes: { usage: COUNTER ,description: "Total amount of WAL generated in bytes" } - buffers_full: { usage: COUNTER ,description: "Number of times WAL data was written to disk because WAL buffers became full" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } pg_wal_14: name: pg_wal desc: PostgreSQL WAL statistics since v14 query: SELECT wal_records AS records, wal_fpi AS fpi, wal_bytes AS bytes, wal_buffers_full AS buffers_full, wal_write AS write, wal_sync AS sync, wal_write_time AS 
write_time, wal_sync_time AS sync_time, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_stat_wal; ttl: 10 tags: [ cluster ] min_version: 140000 max_version: 180000 metrics: - records: { usage: COUNTER ,description: "Total number of WAL records generated" } - fpi: { usage: COUNTER ,description: "Total number of WAL full page images generated" } - bytes: { usage: COUNTER ,description: "Total amount of WAL generated in bytes" } - buffers_full: { usage: COUNTER ,description: "Number of times WAL data was written to disk because WAL buffers became full" } - write: { usage: COUNTER ,description: "Number of times WAL buffers were written out to disk via XLogWrite request." } - sync: { usage: COUNTER ,description: "Number of times WAL files were synced to disk via issue_xlog_fsync request" } - write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time spent writing WAL buffers to disk via XLogWrite request in seconds" } - sync_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total amount of time spent syncing WAL files to disk via issue_xlog_fsync request, in seconds" } - reset_time: { usage: GAUGE ,description: "When statistics were last reset" } #==============================================================# # 0410 pg_activity #==============================================================# pg_activity: name: pg_activity desc: PostgreSQL backend activity group by database and state query: |- SELECT datname, state, coalesce(count, 0) AS count, coalesce(max_duration, 0) AS max_duration, coalesce(max_tx_duration, 0) AS max_tx_duration, coalesce(max_conn_duration, 0) AS max_conn_duration FROM (SELECT d.datname, a.state FROM pg_database d, unnest(ARRAY ['active','idle','idle in transaction','idle in transaction (aborted)','fastpath function call','disabled']) a(state) WHERE d.datallowconn AND NOT d.datistemplate) base LEFT JOIN (SELECT datname, state, count(*) AS count, max(extract(epoch from now() - state_change)) AS max_duration, 
max(extract(epoch from now() - xact_start)) AS max_tx_duration, max(extract(epoch from now() - backend_start)) AS max_conn_duration FROM pg_stat_activity WHERE pid <> pg_backend_pid() GROUP BY 1,2) data USING (datname,state); ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" } - state: { usage: LABEL ,description: "Current overall state of this backend." } - count: { usage: GAUGE ,description: "Count of connection among (datname,state)" } - max_duration: { usage: GAUGE ,description: "Max duration since last state change among (datname, state)" } - max_tx_duration: { usage: GAUGE ,description: "Max transaction duration since state change among (datname, state)" } - max_conn_duration: { usage: GAUGE ,description: "Max backend session duration since state change among (datname, state)" } #==============================================================# # 0420 pg_wait #==============================================================# pg_wait: name: pg_wait desc: PostgreSQL backend client count group by wait event type since 9.6 query: | SELECT coalesce(datname, '_system') AS datname, coalesce(wait_event_type, 'Running') AS event, count(*) AS count FROM pg_stat_activity GROUP BY 1, 2; ttl: 10 min_version: 090600 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database, _system for global process" } - event: { usage: LABEL ,description: "Wait event type" } - count: { usage: GAUGE ,description: "Count of WaitEvent on target database" } #==============================================================# # 0430 pg_backend #==============================================================# pg_backend: name: pg_backend desc: PostgreSQL backend client count group by backend type since 10 query: SELECT backend_type AS "type", count(*) AS count FROM pg_stat_activity GROUP BY backend_type; ttl: 10 min_version: 100000 tags: [ cluster ] metrics: - type: {
usage: LABEL ,description: "Database backend process type" } - count: { usage: GAUGE ,description: "Database backend process count by backend_type" } #==============================================================# # 0440 pg_xact #==============================================================# pg_xact: name: pg_xact desc: PostgreSQL transaction identifier metrics query: WITH snap(v) AS (SELECT txid_current_snapshot()), xset(v) AS (SELECT txid_snapshot_xip(v) FROM snap), xnum(v) AS (SELECT count(*) from xset), xmin(v) AS (SELECT txid_snapshot_xmin(v) FROM snap), xmax(v) AS (SELECT txid_snapshot_xmax(v) FROM snap) SELECT xmin.v AS xmin, xmax.v AS xmax, xnum.v AS xnum FROM xmin, xmax, xnum; ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - xmin: { usage: COUNTER ,description: "Earliest txid that is still active" } - xmax: { usage: COUNTER ,description: "First as-yet-unassigned txid. txid >= this are invisible." } - xnum: { usage: GAUGE ,description: "Current active transaction count" } #==============================================================# # 0450 pg_lock #==============================================================# pg_lock: name: pg_lock desc: PostgreSQL lock distribution by mode and database query: | SELECT datname, mode, coalesce(count, 0) AS count FROM (SELECT d.oid AS database, d.datname, l.mode FROM pg_database d, unnest(ARRAY ['AccessShareLock','RowShareLock','RowExclusiveLock','ShareUpdateExclusiveLock', 'ShareLock','ShareRowExclusiveLock','ExclusiveLock','AccessExclusiveLock']) l(mode) WHERE d.datallowconn AND NOT d.datistemplate) base LEFT JOIN (SELECT database, mode, count(*) AS count FROM pg_locks WHERE database IS NOT NULL GROUP BY 1, 2) cnt USING (database, mode); ttl: 10 min_version: 090400 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database this backend is connected to" } - mode: { usage: LABEL ,description: "Name of the lock mode held or desired by this process" } - count: { usage: GAUGE 
,description: "Number of locks of corresponding mode and database" } #==============================================================# # 0460 pg_query #==============================================================# pg_query_17: name: pg_query desc: PostgreSQL Query metrics, require pg_stat_statements installed, 17+ query: |- SELECT datname, queryid AS query, sum(calls) AS calls, sum(rows) AS rows, sum(total_exec_time) AS exec_time, sum(shared_blk_read_time) + sum(shared_blk_write_time) AS io_time, sum(wal_bytes) AS wal_bytes ,sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read, sum(shared_blks_dirtied) AS sblk_dirtied, sum(shared_blks_written) AS sblk_written FROM pg_stat_statements(false) s JOIN pg_database d ON s.dbid = d.oid WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128; ttl: 10 timeout: 2 min_version: 170000 tags: [ cluster, "extension:pg_stat_statements" ] metrics: - datname: { usage: LABEL ,description: "Name of database" } - query: { usage: LABEL ,description: "QueryID generated from internal hash code, computed from the statement's parse tree" } - calls: { usage: COUNTER ,description: "Number of times the statement was executed" } - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" } - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" } - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds" } - wal_bytes: { usage: COUNTER ,description: "Total amount of WAL bytes generated by the statement" } - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" } - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" } - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement" } - sblk_written: { usage: COUNTER 
,description: "Total number of shared blocks written by the statement" } pg_query_13: name: pg_query desc: PostgreSQL Query metrics, require pg_stat_statements installed, 13 - 16 query: |- SELECT datname, queryid AS query, sum(calls) AS calls, sum(rows) AS rows, sum(total_exec_time) AS exec_time, sum(blk_read_time) + sum(blk_write_time) AS io_time, sum(wal_bytes) AS wal_bytes ,sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read, sum(shared_blks_dirtied) AS sblk_dirtied, sum(shared_blks_written) AS sblk_written FROM pg_stat_statements(false) s JOIN pg_database d ON s.dbid = d.oid WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128; ttl: 10 timeout: 2 min_version: 130000 max_version: 170000 tags: [ cluster, "extension:pg_stat_statements" ] metrics: - datname: { usage: LABEL ,description: "Name of database" } - query: { usage: LABEL ,description: "QueryID generated from internal hash code, computed from the statement's parse tree" } - calls: { usage: COUNTER ,description: "Number of times the statement was executed" } - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" } - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" } - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds" } - wal_bytes: { usage: COUNTER ,description: "Total amount of WAL bytes generated by the statement" } - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" } - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" } - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement" } - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" } pg_query_10: name: pg_query desc: PostgreSQL query statement 
metrics, require pg_stat_statements installed, 9.4 ~ 12 query: |- SELECT datname, queryid AS query, sum(calls) AS calls, sum(rows) AS rows, sum(total_time) AS exec_time, sum(blk_read_time) + sum(blk_write_time) AS io_time, sum(shared_blks_hit) AS sblk_hit, sum(shared_blks_read) AS sblk_read, sum(shared_blks_dirtied) AS sblk_dirtied, sum(shared_blks_written) AS sblk_written FROM pg_stat_statements(false) s JOIN pg_database d ON s.dbid = d.oid WHERE userid != 10 AND calls > 4 GROUP BY 1, 2 ORDER BY 3 DESC LIMIT 128; ttl: 10 timeout: 2 min_version: 090400 max_version: 130000 tags: [ cluster, "extension:pg_stat_statements" ] metrics: - datname: { usage: LABEL ,description: "Name of database" } - query: { usage: LABEL ,description: "QueryID generated from internal hash code, computed from the statement's parse tree" } - calls: { usage: COUNTER ,description: "Number of times the statement was executed" } - rows: { usage: COUNTER ,description: "Total number of rows retrieved or affected by the statement" } - exec_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent executing the statement, in seconds" } - io_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time the statement spent reading and writing blocks, in seconds" } - sblk_hit: { usage: COUNTER ,description: "Total number of shared block cache hits by the statement" } - sblk_read: { usage: COUNTER ,description: "Total number of shared blocks read by the statement" } - sblk_dirtied: { usage: COUNTER ,description: "Total number of shared blocks dirtied by the statement" } - sblk_written: { usage: COUNTER ,description: "Total number of shared blocks written by the statement" } #==============================================================# # 0510 pg_vacuuming #==============================================================# pg_vacuuming_18: name: pg_vacuuming desc: PostgreSQL vacuum progress 18+ query: |- SELECT datname, pid, relid::RegClass AS relname, CASE phase WHEN 'scanning heap' THEN (CASE 
WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_scanned / heap_blks_total ELSE 0.0 END) WHEN 'vacuuming heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_vacuumed / heap_blks_total ELSE 0 END) ELSE NULL END AS progress, indexes_total, indexes_processed, dead_tuple_bytes, delay_time FROM pg_stat_progress_vacuum; ttl: 10 min_version: 180000 tags: [ cluster, primary ] metrics: - datname: { usage: LABEL ,description: "database name" } - pid: { usage: LABEL ,description: "process id of vacuum worker" } - relname: { usage: LABEL ,description: "relation name of vacuuming table" } - progress: { usage: GAUGE ,description: "vacuum progress ratio (0-1) based on heap blocks scanned/vacuumed" } - indexes_total: { usage: GAUGE ,description: "total number of indexes that will be vacuumed or cleaned up" } - indexes_processed: { usage: GAUGE ,description: "number of indexes that have been vacuumed or cleaned up" } - dead_tuple_bytes: { usage: GAUGE ,description: "total size of dead tuples collected since the beginning of vacuum in bytes" } - delay_time: { usage: COUNTER ,scale: 1e-3 ,description: "total time spent sleeping due to cost-based delay in seconds" } pg_vacuuming_17: name: pg_vacuuming desc: PostgreSQL vacuum progress 17 (with index progress tracking) query: |- SELECT datname, pid, relid::RegClass AS relname, CASE phase WHEN 'scanning heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_scanned / heap_blks_total ELSE 0.0 END) WHEN 'vacuuming heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_vacuumed / heap_blks_total ELSE 0 END) ELSE NULL END AS progress, indexes_total, indexes_processed, dead_tuple_bytes FROM pg_stat_progress_vacuum; ttl: 10 min_version: 170000 max_version: 180000 tags: [ cluster, primary ] metrics: - datname: { usage: LABEL ,description: "database name" } - pid: { usage: LABEL ,description: "process id of vacuum worker" } - relname: { usage: LABEL ,description: "relation name of vacuuming table" } - progress: { usage: 
GAUGE ,description: "vacuum progress ratio (0-1) based on heap blocks scanned/vacuumed" } - indexes_total: { usage: GAUGE ,description: "total number of indexes that will be vacuumed or cleaned up" } - indexes_processed: { usage: GAUGE ,description: "number of indexes that have been vacuumed or cleaned up" } - dead_tuple_bytes: { usage: GAUGE ,description: "total size of dead tuples collected since the beginning of vacuum in bytes" } pg_vacuuming_12: name: pg_vacuuming desc: PostgreSQL vacuum progress 12-16 query: |- SELECT datname, pid, relid::RegClass AS relname, CASE phase WHEN 'scanning heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_scanned / heap_blks_total ELSE 0.0 END) WHEN 'vacuuming heap' THEN (CASE WHEN heap_blks_total > 0 THEN 1.0 * heap_blks_vacuumed / heap_blks_total ELSE 0 END) ELSE NULL END AS progress FROM pg_stat_progress_vacuum; ttl: 10 min_version: 120000 max_version: 170000 tags: [ cluster, primary ] metrics: - datname: { usage: LABEL ,description: "database name" } - pid: { usage: LABEL ,description: "process id of vacuum worker" } - relname: { usage: LABEL ,description: "relation name of vacuuming table" } - progress: { usage: GAUGE ,description: "vacuum progress ratio (0-1) based on heap blocks scanned/vacuumed" } #==============================================================# # 0520 pg_indexing #==============================================================# pg_indexing: name: pg_indexing desc: PostgreSQL index creating progress (v12+) query: |- SELECT datname, pid, relid::RegClass AS relname, (CASE WHEN blocks_total > 0 THEN 1.0 * blocks_done / blocks_total ELSE NULL END) AS blocks, (CASE WHEN tuples_total > 0 THEN 1.0 * tuples_done / tuples_total ELSE NULL END) AS tuples, (CASE WHEN partitions_total > 0 THEN 1.0 * partitions_done / partitions_total ELSE NULL END) AS partitions, (CASE WHEN lockers_total > 0 THEN 1.0 * lockers_done / lockers_total ELSE NULL END) AS lockers FROM pg_stat_progress_create_index pspci; ttl: 10 
min_version: 120000 tags: [ cluster, primary ] metrics: - datname: { usage: LABEL ,description: "Name of the database" } - pid: { usage: LABEL ,description: "Process id of indexing table" } - relname: { usage: LABEL ,description: "Relation name of indexed table" } - blocks: { usage: GAUGE ,description: "Percent of blocks been proceeded" } - tuples: { usage: GAUGE ,description: "Percent of tuples been proceeded" } - partitions: { usage: GAUGE ,description: "Percent of partitions been proceeded" } - lockers: { usage: GAUGE ,description: "Percent of lockers been proceeded" } #==============================================================# # 0530 pg_clustering #==============================================================# pg_clustering: name: pg_clustering desc: PostgreSQL cluster or vacuum full progress (v12+) query: SELECT datname, pid, relid::RegClass AS relname, param4 AS tup_scan, CASE WHEN param6 > 0 THEN 1.0 * param7 / param6 ELSE 0 END AS progress FROM pg_stat_get_progress_info('cluster') s LEFT JOIN pg_database d ON s.datid = d.oid; ttl: 10 min_version: 120000 tags: [ cluster, primary ] metrics: - datname: { usage: LABEL ,description: "Name of database been clustering" } - pid: { usage: LABEL ,description: "Process id of indexing table" } - relname: { usage: LABEL ,description: "Relation name of indexed table" } - tup_scan: { usage: GAUGE ,description: "How much tuple been scanned" } - progress: { usage: GAUGE ,description: "Progress of heap been processed" } #==============================================================# # 0540 pg_backup #==============================================================# pg_backup: name: pg_backup desc: PostgreSQL basebackup progress since 13 query: SELECT pid, param1 AS phase, CASE param2 WHEN -1::integer THEN NULL::bigint ELSE param2 END AS total_bytes, param3 AS sent_bytes FROM pg_stat_get_progress_info('BASEBACKUP'); ttl: 10 min_version: 130000 tags: [ cluster ] metrics: - pid: { usage: LABEL ,description: "process id of 
basebackup sender" } - phase: { usage: GAUGE ,description: "Phase encoded in 0~5 initial, wait checkpoint, estimate, streaming, waiting archive, transfer archive" } - total_bytes: { usage: GAUGE ,description: "Total amount of data that will be streamed" } - sent_bytes: { usage: GAUGE ,description: "Amount of data streamed" } #==============================================================# # 0610 pg_db #==============================================================# pg_db_18: name: pg_db desc: PostgreSQL database stats from pg_stat_database v18 query: |- SELECT d.datname, datid,age(datfrozenxid) AS age, datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid, numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified, conflicts,temp_files,temp_bytes,deadlocks,coalesce(checksum_failures, -1) AS cks_fails, checksum_last_failure AS cks_fail_time,blk_read_time,blk_write_time, session_time,active_time,idle_in_transaction_time AS ixact_time,sessions,sessions_abandoned,sessions_fatal,sessions_killed,parallel_workers_to_launch,parallel_workers_launched, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid; ttl: 10 min_version: 180000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database" } - datid: { usage: GAUGE ,description: "OID of the database" } - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" } - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" } - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." 
} - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." } - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" } - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" } - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" } - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" } - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" } - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" } - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" } - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" } - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" } - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" } - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" } - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" } - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" } - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database" } - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." 
}
    - deadlocks:           { usage: COUNTER ,description: "Number of deadlocks detected in this database" }
    - cks_fails:           { usage: COUNTER ,description: "Number of data page checksum failures detected in this database, -1 for not enabled" }
    - cks_fail_time:       { usage: GAUGE   ,description: "Time at which the last data page checksum failure was detected in this database" }
    - blk_read_time:       { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" }
    - blk_write_time:      { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" }
    - session_time:        { usage: COUNTER ,scale: 1e-3 ,description: "Time spent by database sessions in this database, in seconds" }
    - active_time:         { usage: COUNTER ,scale: 1e-3 ,description: "Time spent executing SQL statements in this database, in seconds" }
    - ixact_time:          { usage: COUNTER ,scale: 1e-3 ,description: "Time spent idling while in a transaction in this database, in seconds" }
    - sessions:            { usage: COUNTER ,description: "Total number of sessions established to this database" }
    - sessions_abandoned:  { usage: COUNTER ,description: "Number of database sessions to this database that were terminated because connection to the client was lost" }
    - sessions_fatal:      { usage: COUNTER ,description: "Number of database sessions to this database that were terminated by fatal errors" }
    - sessions_killed:     { usage: COUNTER ,description: "Number of database sessions to this database that were terminated by operator intervention" }
    - parallel_workers_to_launch: { usage: COUNTER ,description: "Number of parallel workers planned to be launched by queries on this database" }
    - parallel_workers_launched:  { usage: COUNTER ,description: "Number of parallel workers launched by queries on this database" }
    - reset_time:          { usage: GAUGE   ,description: "Time at which database statistics were last reset" }

# pg_stat_database for PostgreSQL 14-17 (version gated by min/max_version below)
pg_db_14:
  name: pg_db
  desc: PostgreSQL database stats from pg_stat_database v14 (with 7 new time & session metrics)
  query: |-
    SELECT d.datname, datid,age(datfrozenxid) AS age, datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid,
    numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified,
    conflicts,temp_files,temp_bytes,deadlocks,coalesce(checksum_failures, -1) AS cks_fails, checksum_last_failure AS cks_fail_time,blk_read_time,blk_write_time,
    session_time,active_time,idle_in_transaction_time AS ixact_time,sessions,sessions_abandoned,sessions_fatal,sessions_killed,extract(EPOCH FROM stats_reset) AS reset_time
    FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid;
  ttl: 10
  min_version: 140000
  max_version: 180000
  tags: [ cluster ]
  metrics:
    - datname:             { usage: LABEL   ,description: "Name of the database" }
    - datid:               { usage: GAUGE   ,description: "OID of the database" }
    - age:                 { usage: GAUGE   ,description: "Age of database calculated from datfrozenxid" }
    - is_template:         { usage: GAUGE   ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" }
    - allow_conn:          { usage: GAUGE   ,description: "If false(0) then no one can connect to this database." }
    - conn_limit:          { usage: GAUGE   ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." }
    - frozen_xid:          { usage: GAUGE   ,description: "All transaction IDs before this one have been frozen" }
    - numbackends:         { usage: GAUGE   ,description: "Number of backends currently connected to this database" }
    - xact_commit:         { usage: COUNTER ,description: "Number of transactions in this database that have been committed" }
    - xact_rollback:       { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" }
    - xact_total:          { usage: COUNTER ,description: "Number of transactions in this database" }
    - blks_read:           { usage: COUNTER ,description: "Number of disk blocks read in this database" }
    - blks_hit:            { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" }
    - blks_access:         { usage: COUNTER ,description: "Number of times disk blocks were accessed, i.e. read + hit" }
    - tup_returned:        { usage: COUNTER ,description: "Number of rows returned by queries in this database" }
    - tup_fetched:         { usage: COUNTER ,description: "Number of rows fetched by queries in this database" }
    - tup_inserted:        { usage: COUNTER ,description: "Number of rows inserted by queries in this database" }
    - tup_updated:         { usage: COUNTER ,description: "Number of rows updated by queries in this database" }
    - tup_deleted:         { usage: COUNTER ,description: "Number of rows deleted by queries in this database" }
    - tup_modified:        { usage: COUNTER ,description: "Number of rows modified by queries in this database" }
    - conflicts:           { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" }
    - temp_files:          { usage: COUNTER ,description: "Number of temporary files created by queries in this database" }
    - temp_bytes:          { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." }
    - deadlocks:           { usage: COUNTER ,description: "Number of deadlocks detected in this database" }
    - cks_fails:           { usage: COUNTER ,description: "Number of data page checksum failures detected in this database, -1 for not enabled" }
    - cks_fail_time:       { usage: GAUGE   ,description: "Time at which the last data page checksum failure was detected in this database" }
    - blk_read_time:       { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" }
    - blk_write_time:      { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" }
    - session_time:        { usage: COUNTER ,scale: 1e-3 ,description: "Time spent by database sessions in this database, in seconds" }
    - active_time:         { usage: COUNTER ,scale: 1e-3 ,description: "Time spent executing SQL statements in this database, in seconds" }
    - ixact_time:          { usage: COUNTER ,scale: 1e-3 ,description: "Time spent idling while in a transaction in this database, in seconds" }
    - sessions:            { usage: COUNTER ,description: "Total number of sessions established to this database" }
    - sessions_abandoned:  { usage: COUNTER ,description: "Number of database sessions to this database that were terminated because connection to the client was lost" }
    - sessions_fatal:      { usage: COUNTER ,description: "Number of database sessions to this database that were terminated by fatal errors" }
    - sessions_killed:     { usage: COUNTER ,description: "Number of database sessions to this database that were terminated by operator intervention" }
    - reset_time:          { usage: GAUGE   ,description: "Time at which database statistics were last reset" }

# pg_stat_database for PostgreSQL 12-13 (adds checksum_failures / checksum_last_failure)
pg_db_12:
  name: pg_db
  desc: PostgreSQL database stats from pg_stat_database v12 v13 (with 2 new checksum metrics)
  query: |-
    SELECT d.datname, datid,age(datfrozenxid) AS age, datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid,
numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total,blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified, conflicts,temp_files,temp_bytes,deadlocks,coalesce(checksum_failures, -1) AS cks_fails, checksum_last_failure AS cks_fail_time,blk_read_time,blk_write_time, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid; ttl: 10 min_version: 120000 max_version: 140000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database" } - datid: { usage: GAUGE ,description: "OID of the database" } - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" } - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" } - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." } - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." 
} - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" } - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" } - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" } - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" } - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" } - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" } - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" } - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" } - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" } - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" } - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" } - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" } - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" } - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database" } - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." 
} - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database" } - cks_fails: { usage: COUNTER ,description: "Number of data page checksum failures detected in this database, -1 for not enabled" } - cks_fail_time: { usage: GAUGE ,description: "Time at which the last data page checksum failure was detected in this database" } - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" } - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" } - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" } pg_db_10: name: pg_db desc: PostgreSQL database stats from pg_stat_database v10 v11 (actually since 9.2) query: |- SELECT d.datname, datid,age(datfrozenxid) AS age, datistemplate AS is_template, datallowconn AS allow_conn, datconnlimit AS conn_limit, datfrozenxid::TEXT::BIGINT as frozen_xid, numbackends,xact_commit,xact_rollback,xact_rollback + xact_commit AS xact_total, blks_read,blks_hit,blks_read + blks_hit AS blks_access,tup_returned,tup_fetched,tup_inserted,tup_updated,tup_deleted,tup_inserted + tup_updated + tup_deleted AS tup_modified, conflicts,temp_files,temp_bytes,deadlocks,blk_read_time,blk_write_time, extract(EPOCH FROM stats_reset) AS reset_time FROM pg_database d JOIN pg_stat_database sd ON d.oid = sd.datid; ttl: 10 min_version: 090200 max_version: 120000 tags: [ cluster ] metrics: - datname: { usage: LABEL ,description: "Name of the database" } - datid: { usage: GAUGE ,description: "OID of the database" } - age: { usage: GAUGE ,description: "Age of database calculated from datfrozenxid" } - is_template: { usage: GAUGE ,description: "If true(1), then this database can be cloned by any user with CREATEDB privileges" } - allow_conn: { usage: GAUGE ,description: "If false(0) then no one can connect to this database." 
} - conn_limit: { usage: GAUGE ,description: "Sets maximum number of concurrent connections that can be made to this database. -1 means no limit." } - frozen_xid: { usage: GAUGE ,description: "All transaction IDs before this one have been frozen" } - numbackends: { usage: GAUGE ,description: "Number of backends currently connected to this database" } - xact_commit: { usage: COUNTER ,description: "Number of transactions in this database that have been committed" } - xact_rollback: { usage: COUNTER ,description: "Number of transactions in this database that have been rolled back" } - xact_total: { usage: COUNTER ,description: "Number of transactions in this database" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read in this database" } - blks_hit: { usage: COUNTER ,description: "Number of times disk blocks were found already in the buffer cache" } - blks_access: { usage: COUNTER ,description: "Number of times disk blocks that accessed read+hit" } - tup_returned: { usage: COUNTER ,description: "Number of rows returned by queries in this database" } - tup_fetched: { usage: COUNTER ,description: "Number of rows fetched by queries in this database" } - tup_inserted: { usage: COUNTER ,description: "Number of rows inserted by queries in this database" } - tup_updated: { usage: COUNTER ,description: "Number of rows updated by queries in this database" } - tup_deleted: { usage: COUNTER ,description: "Number of rows deleted by queries in this database" } - tup_modified: { usage: COUNTER ,description: "Number of rows modified by queries in this database" } - conflicts: { usage: COUNTER ,description: "Number of queries canceled due to conflicts with recovery in this database" } - temp_files: { usage: COUNTER ,description: "Number of temporary files created by queries in this database" } - temp_bytes: { usage: COUNTER ,description: "Total amount of data written to temporary files by queries in this database." 
} - deadlocks: { usage: COUNTER ,description: "Number of deadlocks detected in this database" } - blk_read_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent reading data file blocks by backends in this database, in seconds" } - blk_write_time: { usage: COUNTER ,scale: 1e-3 ,description: "Time spent writing data file blocks by backends in this database, in seconds" } - reset_time: { usage: GAUGE ,description: "Time at which database statistics were last reset" } #==============================================================# # 0620 pg_db_confl #==============================================================# # https://pgpedia.info/p/pg_stat_database_conflicts.html pg_db_confl_16: name: pg_db_confl desc: PostgreSQL database conflicts metrics for PG16+ query: SELECT * FROM pg_stat_database_conflicts; ttl: 10 min_version: 160000 tags: [ cluster, replica ] metrics: - datid: { usage: DISCARD } - datname: { usage: LABEL ,description: "Name of this database" } - confl_tablespace: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to dropped tablespaces" } - confl_lock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to lock timeouts" } - confl_snapshot: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to old snapshots" } - confl_bufferpin: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to pinned buffers" } - confl_deadlock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to deadlocks" } - confl_active_logicalslot: { usage: COUNTER ,description: "Number of uses of logical slots in this database that have been canceled due to old snapshots or too low a wal_level on the primary" } pg_db_confl_15: name: pg_db_confl desc: PostgreSQL database conflicts metrics for pg 9.1 - 15 query: SELECT * FROM pg_stat_database_conflicts; ttl: 10 
min_version: 90100 max_version: 160000 tags: [ cluster, replica ] metrics: - datid: { usage: DISCARD } - datname: { usage: LABEL ,description: "Name of this database" } - confl_tablespace: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to dropped tablespaces" } - confl_lock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to lock timeouts" } - confl_snapshot: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to old snapshots" } - confl_bufferpin: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to pinned buffers" } - confl_deadlock: { usage: COUNTER ,description: "Number of queries in this database that have been canceled due to deadlocks" } #==============================================================# # 0640 pg_pubrel #==============================================================# pg_pubrel: name: pg_pubrel desc: PostgreSQL publication and relation count query: SELECT CURRENT_CATALOG AS datname, pubname, count(*) AS count FROM pg_publication p, LATERAL pg_get_publication_tables(pubname) GROUP BY pubname; ttl: 10 min_version: 100000 metrics: - datname: { usage: LABEL ,description: "Name of the database which publication belonged" } - pubname: { usage: LABEL ,description: "Name of the publication" } - count: { usage: GAUGE ,description: "Count of relation in the publication" } #==============================================================# # 0650 pg_subrel #==============================================================# pg_subrel: name: pg_subrel desc: PostgreSQL subscripted relation group by state query: SELECT CURRENT_CATALOG AS datname, subname, srsubstate::TEXT AS state, count(*) AS count FROM pg_subscription_rel sr LEFT JOIN pg_stat_subscription ss ON sr.srsubid = ss.subid GROUP BY 2, 3; ttl: 10 min_version: 100000 metrics: - datname: { usage: LABEL ,description: "Name of 
the database which publication belonged" } - subname: { usage: LABEL ,description: "Name of the subscription" } - state: { usage: LABEL ,description: "State of table in subscription, i=initialize, d=data copy, s=sync, r=ready" } - count: { usage: GAUGE ,description: "Count of relation in this subscription and corresponding state" } #==============================================================# # 0700 pg_table #==============================================================# pg_table_18: name: pg_table desc: PostgreSQL table metrics v18+ query: |- SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind, c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols, psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read, psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_tup_newpage_upd,psut.n_live_tup,psut.n_dead_tup, psut.n_mod_since_analyze,psut.n_ins_since_vacuum,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,psut.last_seq_scan, psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count, psut.total_vacuum_time AS vacuum_time,psut.total_autovacuum_time AS autovacuum_time,psut.total_analyze_time AS analyze_time,psut.total_autoanalyze_time AS autoanalyze_time, psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit FROM pg_class c JOIN pg_namespace nsp ON c.relnamespace = nsp.oid LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND 
nsp.nspname !~ '^columnar' AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor') AND c.relkind = ANY (ARRAY ['r','m','t','p'])
    ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 180000
  metrics:
    - datname:             { usage: LABEL   ,description: "Database name of this table" }
    - relname:             { usage: LABEL   ,description: "Relation name of this table" }
    - relid:               { usage: GAUGE   ,description: "Relation oid of this table" }
    - kind:                { usage: GAUGE   ,description: "Relation kind r/table/114,m/mview/109,t/toast/116,p/partitioned/112" }
    - pages:               { usage: GAUGE   ,description: "Size of the on-disk representation of this table in pages" }
    - tuples:              { usage: GAUGE   ,description: "Estimated number of rows in this table" }
    - frozenxid:           { usage: GAUGE   ,description: "All txid before this have been frozen on this table" }
    - age:                 { usage: GAUGE   ,description: "Age of this table in vacuum cycles" }
    - ncols:               { usage: GAUGE   ,description: "Number of columns in the table" }
    - seq_scan:            { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" }
    - seq_tup_read:        { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" }
    - idx_scan:            { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" }
    - idx_tup_fetch:       { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" }
    - tbl_scan:            { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" }
    - tup_read:            { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" }
    - n_tup_ins:           { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" }
    - n_tup_upd:           { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del:           { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" }
    - n_tup_mod:           { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd:       { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_tup_newpage_upd:   { usage: COUNTER ,default: 0 ,description: "Number of rows updated where the successor version goes onto a new heap page" }
    - n_live_tup:          { usage: GAUGE   ,description: "Estimated number of live rows" }
    - n_dead_tup:          { usage: GAUGE   ,description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE   ,description: "Estimated number of rows modified since this table was last analyzed" }
    - n_ins_since_vacuum:  { usage: GAUGE   ,description: "Estimated number of rows inserted since this table was last vacuumed" }
    - last_vacuum:         { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum:     { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze:        { usage: DISCARD ,description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze:    { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - last_seq_scan:       { usage: DISCARD ,description: "The timestamp of the last seq scan on this table" }
    - vacuum_count:        { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count:    { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count:       { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count:   { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - vacuum_time:         { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Total time this table has been manually vacuumed, in seconds" }
    - autovacuum_time:     { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Total time this table has been vacuumed by the autovacuum daemon, in seconds" }
    - analyze_time:        { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Total time this table has been manually analyzed, in seconds" }
    - autoanalyze_time:    { usage: COUNTER ,default: 0 ,scale: 1e-3 ,description: "Total time this table has been analyzed by the autovacuum daemon, in seconds" }
    - heap_blks_read:      { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" }
    - heap_blks_hit:       { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" }
    - idx_blks_read:       { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit:        { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read:     { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit:      { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read:      { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit:       { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" }

pg_table_16:
  name: pg_table
  desc: PostgreSQL table metrics 16-17
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.'
|| c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind, c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols,
    psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read,
    psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_tup_newpage_upd,psut.n_live_tup,psut.n_dead_tup,
    psut.n_mod_since_analyze,psut.n_ins_since_vacuum,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,psut.last_seq_scan,
    psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count,
    psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit
    FROM pg_class c JOIN pg_namespace nsp ON c.relnamespace = nsp.oid LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid
    WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar' AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor') AND c.relkind = ANY (ARRAY ['r','m','t','p'])
    ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 160000
  max_version: 180000
  metrics:
    - datname:             { usage: LABEL   ,description: "Database name of this table" }
    - relname:             { usage: LABEL   ,description: "Relation name of this table" }
    - relid:               { usage: GAUGE   ,description: "Relation oid of this table" }
    - kind:                { usage: GAUGE   ,description: "Relation kind r/table/114,m/mview/109,t/toast/116,p/partitioned/112" }
    - pages:               { usage: GAUGE   ,description: "Size of the on-disk representation of this table in pages" }
    - tuples:              { usage: GAUGE   ,description: "Estimated number of rows in this table" }
    - frozenxid:           { usage: GAUGE   ,description: "All txid before this have been frozen on this table" }
    - age:                 { usage: GAUGE   ,description: "Age of this table in vacuum cycles" }
    - ncols:               { usage: GAUGE   ,description: "Number of columns in the table" }
    - seq_scan:            { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" }
    - seq_tup_read:        { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" }
    - idx_scan:            { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" }
    - idx_tup_fetch:       { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" }
    - tbl_scan:            { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" }
    - tup_read:            { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" }
    - n_tup_ins:           { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" }
    - n_tup_upd:           { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del:           { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" }
    - n_tup_mod:           { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd:       { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_tup_newpage_upd:   { usage: COUNTER ,default: 0 ,description: "Number of rows updated where the successor version goes onto a new heap page" }
    - n_live_tup:          { usage: GAUGE   ,description: "Estimated number of live rows" }
    - n_dead_tup:          { usage: GAUGE   ,description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE   ,description: "Estimated number of rows modified since this table was last analyzed" }
    - n_ins_since_vacuum:  { usage: GAUGE   ,description: "Estimated number of rows inserted since this table was last vacuumed" }
    - last_vacuum:         { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum:     { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze:        { usage: DISCARD ,description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze:    { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - last_seq_scan:       { usage: DISCARD ,description: "The timestamp of the last seq scan on this table" }
    - vacuum_count:        { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count:    { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count:       { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count:   { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - heap_blks_read:      { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" }
    - heap_blks_hit:       { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" }
    - idx_blks_read:       { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit:        { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read:     { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit:      { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read:      { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit:       { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this
table's TOAST table indexes (if any)" }

pg_table_13:
  name: pg_table
  desc: PostgreSQL table metrics 13-15 (with n_ins_since_vacuum)
  query: |-
    SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind, c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols,
    psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read,
    psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_live_tup,psut.n_dead_tup,
    psut.n_mod_since_analyze,psut.n_ins_since_vacuum,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze,
    psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count,
    psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit
    FROM pg_class c JOIN pg_namespace nsp ON c.relnamespace = nsp.oid LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid
    WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar' AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor') AND c.relkind = ANY (ARRAY ['r','m','t','p'])
    ORDER BY c.relpages DESC LIMIT 256;
  ttl: 10
  timeout: 2
  min_version: 130000
  max_version: 160000
  metrics:
    - datname:             { usage: LABEL   ,description: "Database name of this table" }
    - relname:             { usage: LABEL   ,description: "Relation name of this table" }
    - relid:               { usage: GAUGE   ,description: "Relation oid of this table" }
    - kind:                { usage: GAUGE   ,description: "Relation kind r/table/114,m/mview/109,t/toast/116,p/partitioned/112" }
    - pages:               { usage: GAUGE   ,description: "Size of the on-disk representation of this table in pages" }
    - tuples:              { usage: GAUGE   ,description: "Estimated number of rows in this table" }
    - frozenxid:           { usage: GAUGE   ,description: "All txid before this have been frozen on this table" }
    - age:                 { usage: GAUGE   ,description: "Age of this table in vacuum cycles" }
    - ncols:               { usage: GAUGE   ,description: "Number of columns in the table" }
    - seq_scan:            { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" }
    - seq_tup_read:        { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" }
    - idx_scan:            { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" }
    - idx_tup_fetch:       { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" }
    - tbl_scan:            { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" }
    - tup_read:            { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" }
    - n_tup_ins:           { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" }
    - n_tup_upd:           { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" }
    - n_tup_del:           { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" }
    - n_tup_mod:           { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" }
    - n_tup_hot_upd:       { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" }
    - n_live_tup:          { usage: GAUGE   ,description: "Estimated number of live rows" }
    - n_dead_tup:          { usage: GAUGE   ,description: "Estimated number of dead rows" }
    - n_mod_since_analyze: { usage: GAUGE   ,description: "Estimated number of rows modified since this table was last analyzed" }
    - n_ins_since_vacuum:  { usage: GAUGE   ,description: "Estimated number of rows inserted since this table was last vacuumed" }
    - last_vacuum:         { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" }
    - last_autovacuum:     { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" }
    - last_analyze:        { usage: DISCARD ,description: "Last time at which this table was manually analyzed" }
    - last_autoanalyze:    { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" }
    - vacuum_count:        { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" }
    - autovacuum_count:    { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" }
    - analyze_count:       { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" }
    - autoanalyze_count:   { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" }
    - heap_blks_read:      { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" }
    - heap_blks_hit:       { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" }
    - idx_blks_read:       { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" }
    - idx_blks_hit:        { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" }
    - toast_blks_read:     { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" }
    - toast_blks_hit:      { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" }
    - tidx_blks_read:      { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" }
    - tidx_blks_hit:       { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" }

pg_table_10:
  name: pg_table
  desc: PostgreSQL table metrics 9.4-12
query: |- SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' || c.relname AS relname, c.oid AS relid, ascii(c.relkind) AS kind, c.relpages AS pages, c.reltuples AS tuples, c.relfrozenxid AS frozenxid, age(c.relfrozenxid) AS age, c.relnatts AS ncols, psut.seq_scan,psut.seq_tup_read,psut.idx_scan,psut.idx_tup_fetch,psut.seq_scan + psut.idx_scan AS tbl_scan, psut.seq_tup_read + psut.idx_tup_fetch AS tup_read, psut.n_tup_ins,psut.n_tup_upd,psut.n_tup_del,(psut.n_tup_ins + psut.n_tup_upd + psut.n_tup_del) AS n_tup_mod,psut.n_tup_hot_upd,psut.n_live_tup,psut.n_dead_tup, psut.n_mod_since_analyze,psut.last_vacuum,psut.last_autovacuum,psut.last_analyze,psut.last_autoanalyze, psut.vacuum_count,psut.autovacuum_count,psut.analyze_count,psut.autoanalyze_count, psio.heap_blks_read,psio.heap_blks_hit,psio.idx_blks_read,psio.idx_blks_hit,psio.toast_blks_read,psio.toast_blks_hit,psio.tidx_blks_read,psio.tidx_blks_hit FROM pg_class c JOIN pg_namespace nsp ON c.relnamespace = nsp.oid LEFT JOIN pg_stat_user_tables psut ON psut.relid = c.oid LEFT JOIN pg_statio_user_tables psio ON psio.relid = c.oid WHERE nsp.nspname !~ '^pg_' AND nsp.nspname !~ '^_' AND nsp.nspname !~ '^timescaledb' AND nsp.nspname !~ '^citus' AND nsp.nspname !~ '^columnar' AND nsp.nspname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor') AND c.relkind = ANY (ARRAY ['r','m','t','p']) ORDER BY c.relpages DESC LIMIT 256; ttl: 10 timeout: 2 min_version: 090400 max_version: 130000 metrics: - datname: { usage: LABEL ,description: "Database name of this table" } - relname: { usage: LABEL ,description: "Relation name of this table" } - relid: { usage: GAUGE ,description: "Relation oid of this table" } - kind: { usage: GAUGE ,description: "Relation kind r/table/114,m/mview/109,t/toast/116,p/partitioned/112" } - pages: { usage: GAUGE ,description: "Size of the on-disk representation of this table in pages" } - tuples: { usage: GAUGE ,description: "Estimated number of rows in this table" } - frozenxid: 
{ usage: GAUGE ,description: "All txid before this have been frozen on this table" } - age: { usage: GAUGE ,description: "Age of this table in vacuum cycles" } - ncols: { usage: GAUGE ,description: "Number of columns in the table" } - seq_scan: { usage: COUNTER ,default: 0 ,description: "Number of sequential scans initiated on this table" } - seq_tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by sequential scans" } - idx_scan: { usage: COUNTER ,default: 0 ,description: "Number of index scans initiated on this table" } - idx_tup_fetch: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by index scans" } - tbl_scan: { usage: COUNTER ,default: 0 ,description: "Number of scans initiated on this table" } - tup_read: { usage: COUNTER ,default: 0 ,description: "Number of live rows fetched by scans" } - n_tup_ins: { usage: COUNTER ,default: 0 ,description: "Number of rows inserted" } - n_tup_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows updated (includes HOT updated rows)" } - n_tup_del: { usage: COUNTER ,default: 0 ,description: "Number of rows deleted" } - n_tup_mod: { usage: COUNTER ,default: 0 ,description: "Number of rows modified (insert + update + delete)" } - n_tup_hot_upd: { usage: COUNTER ,default: 0 ,description: "Number of rows HOT updated (i.e with no separate index update required)" } - n_live_tup: { usage: GAUGE ,description: "Estimated number of live rows" } - n_dead_tup: { usage: GAUGE ,description: "Estimated number of dead rows" } - n_mod_since_analyze: { usage: GAUGE ,description: "Estimated number of rows modified since this table was last analyzed" } - last_vacuum: { usage: DISCARD ,description: "Last time at which this table was manually vacuumed (not counting VACUUM FULL)" } - last_autovacuum: { usage: DISCARD ,description: "Last time at which this table was vacuumed by the autovacuum daemon" } - last_analyze: { usage: DISCARD ,description: "Last time at which this table was 
manually analyzed" } - last_autoanalyze: { usage: DISCARD ,description: "Last time at which this table was analyzed by the autovacuum daemon" } - vacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually vacuumed (not counting VACUUM FULL)" } - autovacuum_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been vacuumed by the autovacuum daemon" } - analyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been manually analyzed" } - autoanalyze_count: { usage: COUNTER ,default: 0 ,description: "Number of times this table has been analyzed by the autovacuum daemon" } - heap_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from this table" } - heap_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in this table" } - idx_blks_read: { usage: COUNTER ,default: 0 ,description: "Number of disk blocks read from all indexes on this table" } - idx_blks_hit: { usage: COUNTER ,default: 0 ,description: "Number of buffer hits in all indexes on this table" } - toast_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table (if any)" } - toast_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table (if any)" } - tidx_blks_read: { usage: DISCARD ,default: 0 ,description: "Number of disk blocks read from this table's TOAST table indexes (if any)" } - tidx_blks_hit: { usage: DISCARD ,default: 0 ,description: "Number of buffer hits in this table's TOAST table indexes (if any)" } #==============================================================# # 0710 pg_index #==============================================================# pg_index: name: pg_index desc: PostgreSQL index metrics query: |- SELECT CURRENT_CATALOG AS datname, schemaname || '.' || indexrelname AS idxname, schemaname || '.' 
|| relname AS relname ,indexrelid AS relid, relpages, reltuples, idx_scan, idx_tup_read, idx_tup_fetch, idx_blks_read, idx_blks_hit FROM pg_stat_user_indexes psui, LATERAL (SELECT idx_blks_read, idx_blks_hit FROM pg_statio_user_indexes psio WHERE psio.indexrelid = psui.indexrelid LIMIT 1) p2, LATERAL (SELECT relpages,reltuples FROM pg_class c WHERE c.oid = psui.indexrelid LIMIT 1) p3 WHERE schemaname !~ '^pg_' AND schemaname !~ '^_' AND schemaname !~ '^timescaledb' AND schemaname !~ '^citus' AND schemaname !~ '^columnar' AND schemaname NOT IN ('pg_catalog','information_schema','pg_toast','repack','monitor') ORDER BY idx_tup_read DESC LIMIT 512; ttl: 10 timeout: 1 min_version: 090400 metrics: - datname: { usage: LABEL ,description: "Database name of this index" } - idxname: { usage: LABEL ,description: "Name of this index (full-qualified schema name)" } - relname: { usage: LABEL ,description: "Name of the table for this index (full-qualified schema name)" } - relid: { usage: LABEL ,description: "Relation oid of this index" } - relpages: { usage: GAUGE ,description: "Size of the on-disk representation of this index in pages" } - reltuples: { usage: GAUGE ,description: "Estimate relation tuples" } - idx_scan: { usage: COUNTER ,description: "Number of index scans initiated on this index" } - idx_tup_read: { usage: COUNTER ,description: "Number of index entries returned by scans on this index" } - idx_tup_fetch: { usage: COUNTER ,description: "Number of live table rows fetched by simple index scans using this index" } - idx_blks_read: { usage: COUNTER ,description: "Number of disk blocks read from this index" } - idx_blks_hit: { usage: COUNTER ,description: "Number of buffer hits in this index" } #==============================================================# # 0720 pg_func #==============================================================# pg_func: desc: PostgreSQL function metrics query: SELECT CURRENT_CATALOG AS datname, schemaname || '.' 
|| funcname AS funcname, sum(calls) AS calls, sum(total_time) AS total_time, sum(self_time) AS self_time FROM pg_stat_user_functions GROUP BY 2 ORDER BY 4 DESC LIMIT 128; ttl: 10 min_version: 090400 metrics: - datname: { usage: LABEL ,description: "Name of belonged database" } - funcname: { usage: LABEL ,description: "Name of this function, may have multiple override" } - calls: { usage: COUNTER ,description: "Number of times this function has been called" } - total_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent in this function and all other functions called by it, in seconds" } - self_time: { usage: COUNTER ,scale: 1e-3 ,description: "Total time spent in this function itself, not including other functions called by it, in seconds" } #==============================================================# # 0730 pg_seq #==============================================================# pg_seq: desc: PostgreSQL sequence metrics query: SELECT CURRENT_CATALOG AS datname, schemaname || '.' || sequencename AS seqname, last_value, blks_read, blks_hit FROM pg_sequences s, LATERAL (SELECT relid, blks_read, blks_hit FROM pg_statio_all_sequences sio WHERE s.schemaname = sio.schemaname AND s.sequencename = sio.relname LIMIT 1) d LIMIT 128; ttl: 10 min_version: 100000 metrics: - datname: { usage: LABEL ,description: "Database name of this sequence" } - seqname: { usage: LABEL ,description: "Fully schema qualified sequence name" } - last_value: { usage: COUNTER ,description: "The last sequence value written to disk" } - blks_read: { usage: COUNTER ,description: "Number of disk blocks read from this sequence" } - blks_hit: { usage: COUNTER ,description: "Number of buffer hits in this sequence" } #==============================================================# # 0740 pg_relkind #==============================================================# pg_relkind: name: pg_relkind desc: Postgres relation count by kind (category, r,i,m,t,...) 
query: SELECT CURRENT_CATALOG AS datname, relkind, count(*) AS count FROM pg_class GROUP BY relkind; ttl: 60 timeout: 1 min_version: 090400 metrics: - datname: { usage: LABEL ,description: "Name of database" } - relkind: { usage: LABEL ,description: "Kind of this relation, could be r,i,S,t,v,m,c,f,p,I" } - count: { usage: GAUGE ,description: "Number of relations of corresponding relkind" } #==============================================================# # 0750 pg_defpart #==============================================================# pg_defpart: name: pg_defpart desc: PostgreSQL default partition tuples query: SELECT CURRENT_CATALOG AS datname, relnamespace::RegNamespace || '.' || relname AS relname, reltuples AS tuples FROM pg_class WHERE relpartbound IS NOT NULL AND pg_catalog.pg_get_expr(relpartbound, oid) = 'DEFAULT' ORDER BY reltuples DESC LIMIT 64; ttl: 60 timeout: 1 min_version: 110000 metrics: - datname: { usage: LABEL ,description: "Database name of this default partition" } - relname: { usage: LABEL ,description: "Schema qualified default partition relation name" } - tuples: { usage: GAUGE ,description: "Number of tuples in this default partition" } #==============================================================# # 0810 pg_table_size #==============================================================# pg_table_size: desc: PostgreSQL table size metrics, quite slow query: |- SELECT CURRENT_CATALOG AS datname, nsp.nspname || '.' 
|| rel.relname AS relname, pg_total_relation_size(rel.oid) AS bytes, pg_relation_size(rel.oid) AS relsize, pg_indexes_size(rel.oid) AS indexsize, pg_total_relation_size(reltoastrelid) AS toastsize FROM pg_namespace nsp JOIN pg_class rel ON nsp.oid = rel.relnamespace WHERE nspname <> ALL(ARRAY['pg_catalog', 'information_schema']) AND rel.relkind = 'r' ORDER BY 3 DESC NULLS LAST LIMIT 256; ttl: 300 timeout: 2 min_version: 100000 metrics: - datname: { usage: LABEL ,description: "Database name of this table" } - relname: { usage: LABEL ,description: "Schema qualified table name" } - bytes: { usage: GAUGE ,default: 0 ,description: "Total bytes of this table (including toast, index, toast index)" } - relsize: { usage: GAUGE ,default: 0 ,description: "Bytes of this table itself (main, vm, fsm)" } - indexsize: { usage: GAUGE ,default: 0 ,description: "Bytes of all related indexes of this table" } - toastsize: { usage: GAUGE ,default: 0 ,description: "Bytes of toast tables of this table" } #==============================================================# # 0820 pg_table_bloat #==============================================================# # pg_table_bloat require auxiliary view to work. Disable it or create auxiliary view before use: pg_table_bloat: name: pg_table_bloat desc: PostgreSQL table bloat metrics, require auxiliary view pg_table_bloat to work query: SELECT datname, nspname || '.' 
|| relname AS relname, size, ratio FROM pg_table_bloat ORDER BY size DESC LIMIT 64; ttl: 300 timeout: 2 min_version: 090400 skip: true metrics: - datname: { usage: LABEL ,description: "Database name of this table" } - relname: { usage: LABEL ,description: "Schema qualified name of this table" } - size: { usage: GAUGE ,description: "Total bytes of this table" } - ratio: { usage: GAUGE ,description: "Estimated bloat ratio of this table from 0 to 1" } #==============================================================# # 0830 pg_index_bloat #==============================================================# # pg_index_bloat require auxiliary view to work. Disable it or create auxiliary view before use: pg_index_bloat: name: pg_index_bloat desc: PostgreSQL index bloat metrics (btree only), require pg_index_bloat query: SELECT datname, nspname || '.' || relname AS relname, size, ratio FROM pg_index_bloat ORDER BY size DESC LIMIT 64; ttl: 300 timeout: 2 min_version: 090400 skip: true metrics: - datname: { usage: LABEL ,description: "Database name of this index" } - relname: { usage: LABEL ,description: "Schema qualified index name" } - size: { usage: GAUGE ,description: "Total bytes of this index" } - ratio: { usage: GAUGE ,description: "Estimated bloat ratio of this index, 0~1" } #==============================================================# # 0910 pgbouncer_list #==============================================================# # http://www.pgbouncer.org/usage.html#show-lists pgbouncer_list: name: pgbouncer_list desc: Pgbouncer entry list query: SHOW LISTS; ttl: 10 min_version: 10800 fatal: true tags: [ pgbouncer ] metrics: - list: { usage: LABEL ,description: "Pgbouncer internal list name" } - items: { usage: GAUGE ,description: "Number of corresponding pgbouncer object" } #==============================================================# # 0920 pgbouncer_database #==============================================================# # 
http://www.pgbouncer.org/usage.html#show-databases pgbouncer_database_124: name: pgbouncer_database desc: Pgbouncer database stats (since 1.24) query: SHOW DATABASES; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connect to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connect to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool_size: { usage: GAUGE ,rename: reserve_pool ,description: "Maximum number of additional connections for this database" } - server_lifetime: { usage: GAUGE ,description: "The maximum lifetime of a server connection for this database" } - pool_mode: { usage: DISCARD } - load_balance_hosts: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - max_client_connections: { usage: GAUGE ,description: "Maximum number of allowed client connections for this pgbouncer instance" } - current_client_connections: { usage: GAUGE ,description: "Current number of client connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_123: name: pgbouncer_database desc: Pgbouncer database stats 1.23 query: SHOW DATABASES; ttl: 10 min_version: 12300 max_version: 12400 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname
,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connect to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connect to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - server_lifetime: { usage: GAUGE ,description: "The maximum lifetime of a server connection for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_116: name: pgbouncer_database desc: Pgbouncer database stats (1.16-1.22) query: SHOW DATABASES; ttl: 10 min_version: 11600 max_version: 12300 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connect to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connect to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - min_pool_size: { usage: GAUGE ,description: "Minimum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number
of additional connections for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } pgbouncer_database_108: name: pgbouncer_database desc: Pgbouncer database stats (1.08-1.15) query: SHOW DATABASES; ttl: 10 min_version: 10800 max_version: 11600 tags: [ pgbouncer ] metrics: - name: { usage: LABEL ,rename: datname ,description: "Name of configured database entry" } - host: { usage: LABEL ,description: "Host that pgbouncer will connect to" } - port: { usage: LABEL ,description: "Port that pgbouncer will connect to" } - database: { usage: LABEL ,rename: real_datname ,description: "The real database name pgbouncer connects to" } - force_user: { usage: DISCARD } - pool_size: { usage: GAUGE ,description: "Maximum number of server connections" } - reserve_pool: { usage: GAUGE ,description: "Maximum number of additional connections for this database" } - pool_mode: { usage: DISCARD } - max_connections: { usage: GAUGE ,description: "Maximum number of allowed connections for this database" } - current_connections: { usage: GAUGE ,description: "Current number of connections for this database" } - paused: { usage: GAUGE ,description: "True(1) if this database is currently paused, else 0" } - disabled: { usage: GAUGE ,description: "True(1) if this database is currently disabled, else 0" } #==============================================================# # 0930 pgbouncer_stat #==============================================================# # http://www.pgbouncer.org/usage.html#show-stats pgbouncer_stat_124: name: pgbouncer_stat desc: Pgbouncer stats per database (since 1.24) query:
SHOW STATS; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_server_assignment_count: { usage: COUNTER ,description: "Total times a server was assigned to a client" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - total_client_parse_count: { usage: COUNTER ,description: "Total number of prepared statements created by clients" } - total_server_parse_count: { usage: COUNTER ,description: "Total number of prepared statements created on a server." } - total_bind_count: { usage: COUNTER ,description: "Total number of prepared statements readied for execution by clients and forwarded to postgres" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_server_assignment_count: { usage: GAUGE ,description: "Average number of times a server was assigned to a client per second in the last stat period."
} - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } - avg_client_parse_count: { usage: GAUGE ,description: "Average number of prepared statements created by clients" } - avg_server_parse_count: { usage: GAUGE ,description: "Average number of prepared statements created on a server." } - avg_bind_count: { usage: GAUGE ,description: "Average number of prepared statements readied for execution by clients and forwarded to postgres" } pgbouncer_stat_123: name: pgbouncer_stat desc: Pgbouncer stats per database (1.23) query: SHOW STATS; ttl: 10 min_version: 12300 max_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_server_assignment_count: { usage: COUNTER ,description: "Total times a server was assigned to a client" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 
1e-6 ,description: "Time spent by clients waiting for a server, in seconds" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_server_assignment_count: { usage: GAUGE ,description: "Average number of times a server was assigned to a client per second in the last stat period." } - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } pgbouncer_stat_108: name: pgbouncer_stat desc: Pgbouncer stats per database (1.08 - 1.22) query: SHOW STATS; ttl: 10 min_version: 10800 max_version: 12300 tags: [ pgbouncer ] metrics: - database: { usage: LABEL ,rename: datname ,description: "Name of database" } - total_xact_count: { usage: COUNTER ,description: "Total number of SQL transactions pooled by pgbouncer" } - total_query_count: { usage: COUNTER ,description: "Total number of SQL queries pooled by pgbouncer" } - total_received: { usage: COUNTER ,description: "Total volume in bytes of network traffic received by pgbouncer" } - total_sent: { usage: COUNTER ,description: "Total volume in bytes of network traffic sent by pgbouncer" } - total_xact_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when in a transaction" } - total_query_time: { usage: COUNTER ,scale: 1e-6 ,description: "Total number of seconds spent when executing queries" } - total_wait_time: { usage: COUNTER ,scale: 1e-6 ,description: "Time spent by clients waiting for
a server, in seconds" } - avg_xact_count: { usage: GAUGE ,description: "Average transactions per second in last stat period" } - avg_query_count: { usage: GAUGE ,description: "Average queries per second in last stat period" } - avg_recv: { usage: GAUGE ,description: "Average received (from clients) bytes per second" } - avg_sent: { usage: GAUGE ,description: "Average sent (to clients) bytes per second" } - avg_xact_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average transaction duration, in seconds" } - avg_query_time: { usage: GAUGE ,scale: 1e-6 ,description: "Average query duration, in seconds" } - avg_wait_time: { usage: GAUGE ,scale: 1e-6 ,description: "Time spent by clients waiting for a server, in seconds (average per second)." } #==============================================================# # 0940 pgbouncer_pool #==============================================================# # http://www.pgbouncer.org/usage.html#show-pools pgbouncer_pool_124: name: pgbouncer_pool desc: Pgbouncer pool stats (1.24+) query: SHOW POOLS; ttl: 10 min_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_active_cancel_req: { usage: GAUGE, rename: active_cancel_clients, description: "Client connections that have forwarded query cancellations to the server and are waiting for the server response." } - cl_waiting_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." 
} - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_active_cancel: { usage: GAUGE, rename: active_cancel_servers, description: "Server connections that are currently forwarding a cancel request" } - sv_being_canceled: { usage: GAUGE, rename: cancel_servers, description: "cancel requests have completed that were sent to cancel a query on this server" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } - load_balance_hosts: { usage: LABEL, description: "The load_balance_hosts in use" } pgbouncer_pool_118: name: pgbouncer_pool desc: Pgbouncer pool stats (1.18-1.23) query: SHOW POOLS; ttl: 10 min_version: 11800 max_version: 12400 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_active_cancel_req: { usage: GAUGE, rename: active_cancel_clients, description: "Client connections that have forwarded query cancellations to the server and are waiting for the server response." } - cl_waiting_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." 
} - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_active_cancel: { usage: GAUGE, rename: active_cancel_servers, description: "Server connections that are currently forwarding a cancel request" } - sv_being_canceled: { usage: GAUGE, rename: cancel_servers, description: "cancel requests have completed that were sent to cancel a query on this server" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } pgbouncer_pool_116: name: pgbouncer_pool desc: Pgbouncer pool stats (1.16-1.17) query: SHOW POOLS; ttl: 10 min_version: 11600 max_version: 11800 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - cl_cancel_req: { usage: GAUGE, rename: cancel_clients, description: "Client connections that have not forwarded query cancellations to the server yet." } - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } pgbouncer_pool_108: name: pgbouncer_pool desc: Pgbouncer pool stats (1.08-1.15) query: SHOW POOLS; ttl: 10 min_version: 10800 max_version: 11600 tags: [ pgbouncer ] metrics: - database: { usage: LABEL, rename: datname, description: "Database name of this pool" } - user: { usage: LABEL, description: "User name of this pool" } - cl_active: { usage: GAUGE, rename: active_clients, description: "Client connections that are linked to server connection and can process queries" } - cl_waiting: { usage: GAUGE, rename: waiting_clients, description: "Client connections that have sent queries but have not yet got a server connection" } - sv_active: { usage: GAUGE, rename: active_servers, description: "Server connections that are linked to a client" } - sv_idle: { usage: GAUGE, rename: idle_servers, description: "Server connections that are unused and immediately usable for client queries" } - sv_used: { usage: GAUGE, rename: used_servers, description: "Server connections that have been idle for more than server_check_delay (means have to run check query)" } - sv_tested: { usage: GAUGE, rename: tested_servers, description: "Server connections that are currently running reset or check query" } - sv_login: { usage: GAUGE, rename: login_servers, description: "Server connections currently in the process of logging in" } - maxwait: { usage: GAUGE, description: "How long the first(oldest) client in the queue has waited, in seconds, key metric" } - maxwait_us: { usage: GAUGE, description: "Microsecond part of the maximum waiting time." 
} - pool_mode: { usage: LABEL, description: "Pooling mode in use" } #==============================================================# # 1000 pg_wait_event #==============================================================# pg_wait_event: name: pg_wait_event desc: PostgreSQL wait event sampling based on pg_wait_sampling extension query: SELECT coalesce(event_type, 'Running') AS etype, coalesce(event, 'Running') AS event, sum(count) AS count FROM pg_wait_sampling_profile GROUP BY 1,2; ttl: 10 min_version: 100000 tags: [ cluster, "extension:pg_wait_sampling" ] metrics: - etype: { usage: "LABEL" ,description: "wait event type" } - event: { usage: "LABEL" ,description: "wait event name" } - count: { usage: "COUNTER" ,description: "Total count of wait events sampled" } pg_wait_event_1s: name: pg_wait_event_1s desc: PostgreSQL wait event sampling based on pg_wait_sampling extension query: SELECT coalesce(event_type, 'Running') AS etype, coalesce(event, 'Running') AS event, count(*) FROM pg_wait_sampling_history WHERE ts BETWEEN now() - '1s'::INTERVAL AND now() GROUP BY 1,2; ttl: 10 min_version: 100000 tags: [ cluster, "extension:pg_wait_sampling" ] metrics: - etype: { usage: "LABEL" ,description: "wait event type" } - event: { usage: "LABEL" ,description: "wait event name" } - count: { usage: "GAUGE" ,description: "Number of wait events in last second" } #==============================================================# # 1800 pg_tsdb_hypertable #==============================================================# # this collector reqires timescaledb extension to be installed pg_tsdb_hypertable: name: pg_tsdb_hypertable desc: TimescaleDB hypertable overview query: |- SELECT current_database() AS datname, format('%I.%I', hypertable_schema, hypertable_name) AS relname, num_dimensions AS dimensions, num_chunks AS chunks, compression_enabled::BOOLEAN::int AS compressed, hypertable_size(format('"%I"."%I"', hypertable_schema, hypertable_name)::RegClass) AS bytes FROM 
timescaledb_information.hypertables; ttl: 60 timeout: 2 min_version: 100000 skip: true tags: [ "extension:timescaledb", "schema:timescaledb_information" ] metrics: - datname: { usage: LABEL ,description: "database name" } - relname: { usage: LABEL ,description: "Hypertable relation name" } - dimensions: { usage: GAUGE ,description: "Number of partitioning dimensions" } - chunks: { usage: GAUGE ,description: "Total chunks of this hypertable" } - compressed: { usage: GAUGE ,description: "1 if compression enabled" } - bytes: { usage: GAUGE ,description: "Total size of hypertable in bytes" } #==============================================================# # 1900 pg_citus_node #==============================================================# # https://docs.citusdata.com/en/latest/develop/api_metadata.html#worker-node-table pg_citus_node: name: pg_citus_node desc: Citus worker coordinator node inventory query: |- SELECT CONCAT(nodename, ':', nodeport) AS node, current_database() AS datname, nodeid AS id, groupid AS group, hasmetadata::BOOLEAN::INT AS has_meta, isactive::BOOLEAN::INT AS is_active, metadatasynced::BOOLEAN::INT AS meta_synced, shouldhaveshards::BOOLEAN::INT AS have_shards FROM pg_dist_node; ttl: 60 min_version: 100000 tags: [ "extension:citus" ] metrics: - node: { usage: LABEL ,description: "nodename:port of the PostgreSQL instance" } - datname: { usage: LABEL ,description: "database name" } - id: { usage: GAUGE ,description: "auto‑generated node identifier" } - group: { usage: GAUGE ,description: "replication group id (primary + secondaries)" } - has_meta: { usage: GAUGE ,description: "1 = internal use flag set" } - is_active: { usage: GAUGE ,description: "1 = node currently accepts shards" } - meta_synced: { usage: GAUGE ,description: "1 = metadata fully synced to node" } - have_shards: { usage: GAUGE ,description: "1 = rebalancer may place shards here" } #==============================================================# # 2000 heartbeat 
#==============================================================#
# this is an example of application monitoring and predicate queries
pg_heartbeat:
  name: pg_heartbeat
  desc: monitoring heartbeat in monitor.heartbeat table
  # the collector only runs when the monitor.heartbeat table exists
  predicate_queries:
    - name: if heartbeat table exists
      predicate_query: |
        SELECT EXISTS (SELECT 1 FROM information_schema.tables WHERE table_schema = 'monitor' AND table_name = 'heartbeat');
  query: |-
    SELECT id AS cluster_name, extract(EPOCH FROM ts) AS ts, lsn, txid FROM monitor.heartbeat;
  ttl: 10
  # 90100 = PostgreSQL 9.1.0; written without the leading zero because a
  # leading-zero integer (090100) is octal-ambiguous across YAML parsers
  min_version: 90100
  tags: [ "dbname:postgres", "schema:monitor" ]
  skip: true
  metrics:
    - cluster_name: { usage: LABEL ,description: "cluster_name param of this database cluster" }
    - ts: { usage: GAUGE ,description: "unix timestamp of the heartbeat" }
    - lsn: { usage: COUNTER ,description: "lsn of the heartbeat" }
    - txid: { usage: GAUGE ,description: "txid of the heartbeat" }