diff --git a/.github/workflows/dashboards.yml b/.github/workflows/dashboards.yml index 8efdaf57..ef9d6cae 100644 --- a/.github/workflows/dashboards.yml +++ b/.github/workflows/dashboards.yml @@ -8,6 +8,9 @@ on: paths: - chart/dashboards/** +env: + golang-version: 1.18.4 + jobs: dashboard-sync: runs-on: ubuntu-latest @@ -17,6 +20,14 @@ jobs: with: fetch-depth: 0 + - name: Set up golang + uses: actions/setup-go@v3 + with: + go-version: ${{ env.golang-version }} + - name: Run Dashboard synchronizer run: | - ./scripts/sync-dashboards.sh && git diff --exit-code + go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest && \ + go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest && \ + make sync-mixins && \ + git diff --exit-code diff --git a/Makefile b/Makefile index b1571056..985a194f 100644 --- a/Makefile +++ b/Makefile @@ -64,3 +64,7 @@ kubescape: manifests.yaml ## Runs a security analysis on generated manifests - help: ## Displays help. @awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m\033[0m\n\nTargets:\n"} /^[a-z0-9A-Z_-]+:.*?##/ { printf " \033[36m%-13s\033[0m %s\n", $$1, $$2 }' $(MAKEFILE_LIST) + +.PHONY: sync-mixins +sync-mixins: ## Syncs mixins from Promscale and Postgres-Exporter + ./scripts/sync-mixins.sh diff --git a/chart/alerts/postgres-exporter-alerts.yaml b/chart/alerts/postgres-exporter-alerts.yaml new file mode 100644 index 00000000..0187fca0 --- /dev/null +++ b/chart/alerts/postgres-exporter-alerts.yaml @@ -0,0 +1,105 @@ +groups: +- name: PostgreSQL + rules: + - alert: PostgreSQLMaxConnectionsReached + annotations: + description: '{{ $labels.instance }} is exceeding the currently configured maximum + Postgres connection limit (current value: {{ $value }}s). Services may be + degraded - please take immediate action (you probably need to increase max_connections + in the Docker image and re-deploy.' + summary: '{{ $labels.instance }} has maxed out Postgres connections.' 
+ expr: | + sum by (instance) (pg_stat_activity_count{}) + >= + sum by (instance) (pg_settings_max_connections{}) + - + sum by (instance) (pg_settings_superuser_reserved_connections{}) + for: 1m + labels: + severity: warning + - alert: PostgreSQLHighConnections + annotations: + description: '{{ $labels.instance }} is exceeding 80% of the currently configured + maximum Postgres connection limit (current value: {{ $value }}s). Please check + utilization graphs and confirm if this is normal service growth, abuse or + an otherwise temporary condition or if new resources need to be provisioned + (or the limits increased, which is mostly likely).' + summary: '{{ $labels.instance }} is over 80% of max Postgres connections.' + expr: | + sum by (instance) (pg_stat_activity_count{}) + > + ( + sum by (instance) (pg_settings_max_connections{}) + - + sum by (instance) (pg_settings_superuser_reserved_connections{}) + ) * 0.8 + for: 10m + labels: + severity: warning + - alert: PostgreSQLDown + annotations: + description: '{{ $labels.instance }} is rejecting query requests from the exporter, + and thus probably not allowing DNS requests to work either. User services + should not be effected provided at least 1 node is still alive.' 
+ summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}' + expr: pg_up{} != 1 + for: 1m + labels: + severity: warning + - alert: PostgreSQLSlowQueries + annotations: + description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for + database {{ $labels.datname }} with a value of {{ $value }} ' + summary: 'PostgreSQL high number of slow on {{ $labels.cluster }} for database + {{ $labels.datname }} ' + expr: | + avg by (datname) ( + rate ( + pg_stat_activity_max_tx_duration{datname!~"template.*",}[2m] + ) + ) > 2 * 60 + for: 2m + labels: + severity: warning + - alert: PostgreSQLQPS + annotations: + description: PostgreSQL high number of queries per second on {{ $labels.cluster + }} for database {{ $labels.datname }} with a value of {{ $value }} + summary: PostgreSQL high number of queries per second {{ $labels.cluster }} + for database {{ $labels.datname }} + expr: | + avg by (datname) ( + irate( + pg_stat_database_xact_commit{datname!~"template.*",}[5m] + ) + + + irate( + pg_stat_database_xact_rollback{datname!~"template.*",}[5m] + ) + ) > 10000 + for: 5m + labels: + severity: warning + - alert: PostgreSQLCacheHitRatio + annotations: + description: PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database + {{ $labels.datname }} with a value of {{ $value }} + summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database + {{ $labels.datname }} + expr: | + avg by (datname) ( + rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m]) + / + ( + rate( + pg_stat_database_blks_hit{datname!~"template.*",}[5m] + ) + + + rate( + pg_stat_database_blks_read{datname!~"template.*",}[5m] + ) + ) + ) < 0.98 + for: 5m + labels: + severity: warning diff --git a/chart/dashboards/postgres-overview.json b/chart/dashboards/postgres-overview.json new file mode 100644 index 00000000..91ddb551 --- /dev/null +++ b/chart/dashboards/postgres-overview.json @@ -0,0 +1,1468 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + 
"datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "description": "Performance metrics for Postgres", + "editable": true, + "gnetId": 455, + "graphTooltip": 0, + "id": 1, + "iteration": 1603191461722, + "links": [ ], + "panels": [ + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 20, + "x": 0, + "y": 0 + }, + "hiddenSeries": false, + "id": 1, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "max": true, + "min": true, + "rightSide": true, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_fetched{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "fetched", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { 
+ "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_returned{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "returned", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_inserted{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "inserted", + "measurement": "postgresql", + "policy": "default", + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_updated{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": 
"time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "updated", + "measurement": "postgresql", + "policy": "default", + "refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "fetched", + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_tup_deleted{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "deleted", + "measurement": "postgresql", + "policy": "default", + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "tup_fetched" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 120, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Rows", + "tooltip": { + "msResolution": true, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "cacheTimeout": null, + "colorBackground": 
false, + "colorValue": false, + "colors": [ + "rgba(245, 54, 54, 0.9)", + "rgba(237, 129, 40, 0.89)", + "rgba(50, 172, 45, 0.97)" + ], + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "decimals": 0, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "format": "none", + "gauge": { + "maxValue": 100, + "minValue": 0, + "show": false, + "thresholdLabels": false, + "thresholdMarkers": true + }, + "gridPos": { + "h": 3, + "w": 4, + "x": 20, + "y": 0 + }, + "height": "55px", + "id": 11, + "interval": null, + "isNew": true, + "links": [ ], + "mappingType": 1, + "mappingTypes": [ + { + "name": "value to text", + "value": 1 + }, + { + "name": "range to text", + "value": 2 + } + ], + "maxDataPoints": 100, + "nullPointMode": "connected", + "nullText": null, + "postfix": "", + "postfixFontSize": "50%", + "prefix": "", + "prefixFontSize": "50%", + "rangeMaps": [ + { + "from": "null", + "text": "N/A", + "to": "null" + } + ], + "sparkline": { + "fillColor": "rgba(31, 118, 189, 0.18)", + "full": true, + "lineColor": "rgb(31, 120, 193)", + "show": true + }, + "tableColumn": "", + "targets": [ + { + "dsType": "prometheus", + "expr": "sum(irate(pg_stat_database_xact_commit{datname=~\"$db\",instance=~\"$instance\"}[5m])) + sum(irate(pg_stat_database_xact_rollback{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "xact_commit" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ + "10s" + ], + "type": "non_negative_derivative" + } + ] + ], + "step": 1800, + "tags": [ + { + "key": "instance", + "operator": "=~", + 
"value": "/^$instance$/" + } + ] + } + ], + "thresholds": "", + "title": "QPS", + "transparent": true, + "type": "singlestat", + "valueFontSize": "80%", + "valueMaps": [ + { + "op": "=", + "text": "N/A", + "value": "null" + } + ], + "valueName": "avg" + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "decimals": 1, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 7 + }, + "hiddenSeries": false, + "id": 2, + "isNew": true, + "legend": { + "alignAsTable": true, + "avg": true, + "current": false, + "hideZero": true, + "max": true, + "min": true, + "rightSide": false, + "show": true, + "total": false, + "values": true + }, + "lines": true, + "linewidth": 1, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_alloc_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_alloc", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", 
+ "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_backend_fsync_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_backend_fsync", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_backend_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_backend", + "measurement": "postgresql", + "policy": "default", + "refId": "C", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_clean_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_clean", + "measurement": "postgresql", + "policy": "default", + 
"refId": "D", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "Buffers Allocated", + "dsType": "prometheus", + "expr": "irate(pg_stat_bgwriter_buffers_checkpoint_total{instance='$instance'}[5m])", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "buffers_checkpoint", + "measurement": "postgresql", + "policy": "default", + "refId": "E", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "buffers_alloc" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Buffers", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + 
"custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 7 + }, + "hiddenSeries": false, + "id": 3, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "alias": "conflicts", + "dsType": "prometheus", + "expr": "sum(rate(pg_stat_database_deadlocks{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "deadlocks", + "measurement": "postgresql", + "policy": "default", + "refId": "A", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "conflicts" + ], + "type": "field" + }, + { + "params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + }, + { + "alias": "deadlocks", + "dsType": "prometheus", + "expr": "sum(rate(pg_stat_database_conflicts{datname=~\"$db\",instance=~\"$instance\"}[5m]))", + "format": "time_series", + "groupBy": [ + { + "params": [ + "$interval" + ], + "type": "time" + }, + { + "params": [ + "null" + ], + "type": "fill" + } + ], + "intervalFactor": 2, + "legendFormat": "conflicts", + "measurement": "postgresql", + "policy": "default", + "refId": "B", + "resultFormat": "time_series", + "select": [ + [ + { + "params": [ + "deadlocks" + ], + "type": "field" + }, + { + 
"params": [ ], + "type": "mean" + }, + { + "params": [ ], + "type": "difference" + } + ] + ], + "step": 240, + "tags": [ + { + "key": "instance", + "operator": "=~", + "value": "/^$instance$/" + } + ] + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Conflicts/Deadlocks", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": 0, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 12, + "x": 0, + "y": 14 + }, + "hiddenSeries": false, + "id": 12, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": true, + "pluginVersion": "7.2.1", + "pointradius": 1, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "sum(pg_stat_database_blks_hit{datname=~\"$db\",instance=~\"$instance\"}) / (sum(pg_stat_database_blks_hit{datname=~\"$db\",instance=~\"$instance\"}) + 
sum(pg_stat_database_blks_read{datname=~\"$db\",instance=~\"$instance\"}))", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "cache hit rate", + "refId": "A", + "step": 240 + } + ], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Cache hit ratio", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "percentunit", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + }, + { + "aliasColors": { }, + "bars": false, + "dashLength": 10, + "dashes": false, + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "editable": true, + "error": false, + "fieldConfig": { + "defaults": { + "custom": { } + }, + "overrides": [ ] + }, + "fill": 1, + "fillGradient": 0, + "grid": { }, + "gridPos": { + "h": 7, + "w": 12, + "x": 12, + "y": 14 + }, + "hiddenSeries": false, + "id": 13, + "isNew": true, + "legend": { + "avg": false, + "current": false, + "max": false, + "min": false, + "show": true, + "total": false, + "values": false + }, + "lines": true, + "linewidth": 2, + "links": [ ], + "nullPointMode": "connected", + "options": { + "alertThreshold": true + }, + "percentage": false, + "pluginVersion": "7.2.1", + "pointradius": 5, + "points": false, + "renderer": "flot", + "seriesOverrides": [ ], + "spaceLength": 10, + "stack": false, + "steppedLine": false, + "targets": [ + { + "expr": "pg_stat_database_numbackends{datname=~\"$db\",instance=~\"$instance\"}", + "format": "time_series", + "intervalFactor": 2, + "legendFormat": "{{__name__}}", + "refId": "A", + "step": 240 + } + 
], + "thresholds": [ ], + "timeFrom": null, + "timeRegions": [ ], + "timeShift": null, + "title": "Number of active connections", + "tooltip": { + "msResolution": false, + "shared": true, + "sort": 0, + "value_type": "cumulative" + }, + "type": "graph", + "xaxis": { + "buckets": null, + "mode": "time", + "name": null, + "show": true, + "values": [ ] + }, + "yaxes": [ + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + }, + { + "format": "short", + "label": null, + "logBase": 1, + "max": null, + "min": null, + "show": true + } + ], + "yaxis": { + "align": false, + "alignLevel": null + } + } + ], + "refresh": false, + "schemaVersion": 26, + "style": "dark", + "tags": [ + "postgres" + ], + "templating": { + "list": [ + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "definition": "", + "hide": 0, + "includeAll": true, + "label": null, + "multi": false, + "name": "instance", + "options": [ ], + "query": "label_values(up{job=~\"postgres.*\"},instance)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": false, + "text": "All", + "value": "$__all" + }, + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "definition": "label_values(pg_stat_database_tup_fetched{instance=~\"$instance\",datname!~\"template.*|postgres\"},datname)", + "hide": 0, + "includeAll": true, + "label": "db", + "multi": false, + "name": "db", + "options": [ ], + "query": "label_values(pg_stat_database_tup_fetched{instance=~\"$instance\",datname!~\"template.*|postgres\"},datname)", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ 
], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false, + "text": "Postgres Overview", + "value": "Postgres Overview" + }, + "hide": 0, + "includeAll": false, + "label": "datasource", + "multi": false, + "name": "datasource", + "options": [ ], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": null, + "current": { + "selected": true, + "text": "postgres", + "value": "postgres" + }, + "datasource": { + "type": "prometheus", + "uid": "dc08d25c8f267b054f12002f334e6d3d32a853e4" + }, + "definition": "label_values(pg_up, job)", + "hide": 0, + "includeAll": false, + "label": "job", + "multi": false, + "name": "job", + "options": [ + { + "selected": true, + "text": "postgres", + "value": "postgres" + } + ], + "query": "label_values(pg_up, job)", + "refresh": 0, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [ ], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-1h", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "5s", + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "Postgres Overview", + "uid": "wGgaPlciz", + "version": 5 + } \ No newline at end of file diff --git a/chart/templates/postgres-exporter-promrule.yaml b/chart/templates/postgres-exporter-promrule.yaml new file mode 100644 index 00000000..89976968 --- /dev/null +++ b/chart/templates/postgres-exporter-promrule.yaml @@ -0,0 +1,21 @@ +{{ if (index .Values "timescaledb-single" "prometheus" "enabled") -}} +--- +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + name: {{ .Release.Name }}-postgres-exporter + annotations: + "helm.sh/hook": post-install,post-upgrade,pre-delete + "helm.sh/hook-weight": "0" + 
labels: + app: {{ template "tobs.fullname" . }} + chart: {{ template "tobs.chart" . }} + release: {{ .Release.Name }} + heritage: {{ .Release.Service }} + app.kubernetes.io/component: postgres-exporter + app.kubernetes.io/instance: {{ .Release.Namespace }}.{{ .Release.Name }}-postgres-exporter + app.kubernetes.io/managed-by: helm + app.kubernetes.io/part-of: tobs +spec: +{{ .Files.Get "alerts/postgres-exporter-alerts.yaml" | indent 2 }} +{{ end -}} diff --git a/chart/values.yaml b/chart/values.yaml index 4a727ae2..389fd064 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -47,6 +47,18 @@ timescaledb-single: cpu: 100m memory: 2Gi + # Enable Prometheus exporter for PostgreSQL server metrics. + # https://github.com/prometheus-community/postgres_exporter + prometheus: + enabled: true + image: + repository: quay.io/prometheuscommunity/postgres-exporter + tag: v0.11.0 + + # Specifies whether ServiceMonitor for Prometheus operator should be created + serviceMonitor: + enabled: true + # Values for configuring the deployment of the Promscale # The charts README is at: # https://github.com/timescale/promscale/tree/master/helm-chart @@ -267,6 +279,7 @@ kube-prometheus-stack: - dashboards/apm-service-dependencies-upstream.json - dashboards/apm-service-overview.json - dashboards/promscale.json + - dashboards/postgres-overview.json adminUser: admin # To configure password externally refer to https://github.com/grafana/helm-charts/blob/6578497320d3c4672bab3a3c7fd38dffba1c9aba/charts/grafana/values.yaml#L340-L345 adminPassword: "" @@ -373,12 +386,12 @@ opentelemetry-operator: exporters: logging: otlp: - endpoint: "{{ .Release.Name }}-promscale-connector.{{ .Release.Namespace }}.svc:9202" + endpoint: "{{ .Release.Name }}-promscale.{{ .Release.Namespace }}.svc:9202" compression: none tls: insecure: true prometheusremotewrite: - endpoint: "http://{{ .Release.Name }}-promscale-connector.{{ .Release.Namespace }}.svc:9201/write" + endpoint: "http://{{ .Release.Name 
}}-promscale.{{ .Release.Namespace }}.svc:9201/write" tls: insecure: true diff --git a/scripts/sync-dashboards.sh b/scripts/sync-dashboards.sh deleted file mode 100755 index 74d67977..00000000 --- a/scripts/sync-dashboards.sh +++ /dev/null @@ -1,27 +0,0 @@ -#!/bin/bash - -# TODO(paulfantom): consider using jsonnet for modifications from this script - -set -euo pipefail - -cd "$(git rev-parse --show-toplevel)" - -rm -rf tmp/promscale -mkdir -p tmp -git clone --depth 1 https://github.com/timescale/promscale.git tmp/promscale - -cp -r tmp/promscale/docs/mixin/dashboards/*.json chart/dashboards/ - -# FIXME(paulfantom): those changes should be incorporated into promscale mixin -# dashboard UID is a sha256 hash of ".title" -# datasource UID is a sha256 hash of ".name" -# "__inputs" field needs to be removed -# replace all `${DS_TIMESCALEDB}` with timescaledb datasource UID -# replace all `${DS_PROMSCALE_JAEGER}` with promscale tracing datasource UID -# replace all `${DS_PROMETHEUS}` with promscale prometheus datasource UID -find chart/dashboards/ \( -type d -name '*.json' -prune \) -o -type f -print0 | xargs -0 sed -i.orig 's/${DS_TIMESCALEDB}/c4729dfb8ceeaa0372ef27403a3932695eee995d/g' -find chart/dashboards/ -name '*.orig' -exec rm -f {} \; -find chart/dashboards/ \( -type d -name '*.json' -prune \) -o -type f -print0 | xargs -0 sed -i.orig 's/${DS_PROMSCALE_JAEGER}/f78291126102e0f2e841734d1e90250257543042/g' -find chart/dashboards/ -name '*.orig' -exec rm -f {} \; -find chart/dashboards/ \( -type d -name '*.json' -prune \) -o -type f -print0 | xargs -0 sed -i.orig 's/${DS_PROMETHEUS}/dc08d25c8f267b054f12002f334e6d3d32a853e4/g' -find chart/dashboards/ -name '*.orig' -exec rm -f {} \; diff --git a/scripts/sync-mixins.sh b/scripts/sync-mixins.sh new file mode 100755 index 00000000..cf7283f8 --- /dev/null +++ b/scripts/sync-mixins.sh @@ -0,0 +1,122 @@ +#! 
/usr/bin/env bash
+
+# Syncs the postgres-exporter and promscale mixins into the chart:
+#   * shallow-clones both upstream repositories into tmp/ at the repo root,
+#   * builds the postgres-exporter mixin and copies its dashboards and alerts,
+#   * copies the promscale dashboards and pins their datasource UIDs.
+
+set -euo pipefail
+
+# Variables
+psql_exporter="tmp/postgres_exporter"
+psql_mixin="${psql_exporter}/postgres_mixin"
+prom="tmp/promscale"
+prom_mixin="${prom}/docs/mixin/dashboards"
+
+# This mixin requires a few tools to be installed, we need to check for those
+# and print out that they need to be installed if they are not.
+
+if [ ! -x "$(command -v go)" ]; then
+  echo "go is not installed, please install it"
+  exit 1
+fi
+
+if [ ! -x "$(command -v mixtool)" ]; then
+  echo "mixtool is not installed, please install it"
+  echo "https://github.com/prometheus-community/postgres_exporter/tree/master/postgres_mixin"
+  exit 1
+fi
+
+# TODO(nhudson) think about adding in jq formatting to make the dashboard
+# code look nicer.
+#if [ ! -x "$(command -v jq)" ]; then
+#  echo "jq is not installed, please install it"
+#  exit 1
+#fi
+
+# Clones the postgres-exporter and promscale repositories into a fresh tmp/
+# directory at the repository root. Leaves the working directory at the
+# repository root, which the other functions rely on for their relative paths.
+git_clone() {
+  cd "$(git rev-parse --show-toplevel)"
+
+  # Always start from an empty tmp/ so a stale checkout is never reused.
+  rm -fr "tmp/"
+  mkdir -p tmp
+
+  # `set -e` aborts on the first failing command, so the clone status has to be
+  # tested directly in the `if` for the failure message to be reachable.
+  if git clone -q --depth 1 https://github.com/prometheus-community/postgres_exporter.git "${psql_exporter}"; then
+    echo "postgres-exporter mixin cloned at ${psql_exporter}"
+  else
+    echo "postgres-exporter mixin clone failed"
+    exit 1
+  fi
+
+  if git clone -q --depth 1 https://github.com/timescale/promscale.git "${prom}"; then
+    echo "promscale mixin cloned at ${prom}"
+  else
+    echo "promscale mixin clone failed"
+    exit 1
+  fi
+}
+
+# Builds the postgres-exporter mixin, then copies the generated dashboards
+# into chart/dashboards/ and the generated alerts into chart/alerts/.
+build_psql_exporter() {
+  local file target
+
+  # Build inside a subshell: the caller's working directory is left untouched
+  # and the exit status is checked directly (a separate `$?` test would never
+  # run under `set -e`).
+  if ! (cd "${psql_mixin}" && make build); then
+    echo "postgres-exporter mixin build failed"
+    exit 1
+  fi
+
+  # This seems like the most straightforward way to replace the datasource in the
+  # generated dashboard for now. We can revisit this if we find a better way to
+  # replace the generated datasource.
+  for file in "${psql_mixin}"/dashboards_out/*.json; do
+    # An unmatched glob stays a literal pattern; skip it instead of copying it.
+    [ -e "${file}" ] || continue
+    target="chart/dashboards/$(basename "${file}")"
+    cp "${file}" "${target}"
+    sed -i.orig 's/\"datasource\": .*$/\"datasource\": {\n \"type\": \"prometheus\",\n \"uid\": \"dc08d25c8f267b054f12002f334e6d3d32a853e4\"\n }, /g' "${target}"
+    # Drop the backup file that `sed -i.orig` leaves behind.
+    rm -f "${target}.orig"
+  done
+
+  # Set and copy over PrometheusRule alert configuration
+  if [ ! -f "${psql_mixin}/alerts.yaml" ]; then
+    echo "make build failed, alerts.yaml is not found"
+    exit 1
+  fi
+  cp "${psql_mixin}/alerts.yaml" chart/alerts/postgres-exporter-alerts.yaml
+}
+
+# Copies the promscale dashboards into chart/dashboards/ and replaces the
+# datasource placeholders in every dashboard file with fixed datasource UIDs.
+copy_promscale_mixin() {
+  cp -r "${prom_mixin}"/*.json chart/dashboards/
+
+  # FIXME(paulfantom): those changes should be incorporated into promscale mixin
+  # dashboard UID is a sha256 hash of ".title"
+  # datasource UID is a sha256 hash of ".name"
+  # "__inputs" field needs to be removed
+  # replace all `${DS_TIMESCALEDB}` with timescaledb datasource UID
+  # replace all `${DS_PROMSCALE_JAEGER}` with promscale tracing datasource UID
+  # replace all `${DS_PROMETHEUS}` with promscale prometheus datasource UID
+  #
+  # All three substitutions run in a single sed pass; the replacement UIDs are
+  # plain hex strings, so no substitution can re-match another placeholder.
+  find chart/dashboards/ \( -type d -name '*.json' -prune \) -o -type f -print0 \
+    | xargs -0 sed -i.orig \
+      -e 's/${DS_TIMESCALEDB}/c4729dfb8ceeaa0372ef27403a3932695eee995d/g' \
+      -e 's/${DS_PROMSCALE_JAEGER}/f78291126102e0f2e841734d1e90250257543042/g' \
+      -e 's/${DS_PROMETHEUS}/dc08d25c8f267b054f12002f334e6d3d32a853e4/g'
+  find chart/dashboards/ -name '*.orig' -exec rm -f {} \;
+}
+
+git_clone
+build_psql_exporter
+copy_promscale_mixin
+
+echo ""
+echo "Copy of alerts and dashboards is complete."
+echo "If you added any new dashboards please make sure you add them in the values.yaml file as well."
+