From f2a051576d50091d356fc314e6fe1e3c9a52a3b2 Mon Sep 17 00:00:00 2001
From: Ornel_Zply
Date: Thu, 4 Dec 2025 10:37:05 +0100
Subject: [PATCH] 04/12/25

---
Review notes (this section is ignored by `git am`):
- Adds five Grafana dashboards (dev: app runtime, logs by service; ops: infra
  overview, Prometheus health, Loki ops logs), two file-based dashboard
  providers and the Prometheus datasource.
- provider-dev.yaml / provider-ops.yaml: `folderUid` was combined with
  `foldersFromFilesStructure: true`; Grafana requires `folder`/`folderUid` to
  be unset in that mode. The providers now name their target folder explicitly
  (`folder` + `folderUid`), using the titles declared in folders.yaml.
- 03_prometheus_health.json: "Alerts firing" falls back to 0 via
  `or vector(0)`; "Ingest rate" is aggregated with `sum(...)` so the stat shows
  a single value.
- Loki "logs/min" panels use unit `opm` (ops/min) instead of `ops` (ops/sec),
  matching the `* 60` in their queries.
- The `index` blob ids below were not recomputed after these edits.
- NOTE(review): the dashboards reference a datasource named "loki", but this
  patch only provisions "prometheus" - confirm it is provisioned elsewhere.
- NOTE(review): folders.yaml uses a top-level `folders:` key, which is not part
  of the documented dashboard-provider schema (`providers:`) - confirm it is
  actually consumed by something.

 .../dev/00_app_runtime_overview.json          | 40 +++++++++
 .../dashboards/dev/01_logs_by_service.json    | 58 +++++++++++++
 .../dashboards/ops/00_infra_overview.json     | 49 +++++++++++
 .../dashboards/ops/03_prometheus_health.json  | 86 +++++++++++++++++++
 .../dashboards/ops/04_logs_loki_ops.json      | 65 ++++++++++++++
 .../provisioning/dashboards/folders.yaml      |  7 ++
 .../provisioning/dashboards/provider-dev.yaml | 12 +++
 .../provisioning/dashboards/provider-ops.yaml | 12 +++
 .../provisioning/datasources/prometheus.yaml  |  8 ++
 9 files changed, 337 insertions(+)
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/dashboards/dev/00_app_runtime_overview.json
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/dashboards/dev/01_logs_by_service.json
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/00_infra_overview.json
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/03_prometheus_health.json
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/04_logs_loki_ops.json
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/folders.yaml
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/provider-dev.yaml
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/provider-ops.yaml
 create mode 100644 observabilite/observability/vm_observabilite/observability/grafana/provisioning/datasources/prometheus.yaml

diff --git a/observabilite/observability/vm_observabilite/observability/grafana/dashboards/dev/00_app_runtime_overview.json b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/dev/00_app_runtime_overview.json
new file mode 100644
index 0000000..11c78fd
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/dev/00_app_runtime_overview.json
@@ -0,0 +1,40 @@
+{
+  "title": "App Runtime Overview (Dev)",
+  "uid": "dev-app-runtime-overview",
+  "tags": ["dev","application"],
+  "time": { "from": "now-1h", "to": "now" },
+  "schemaVersion": 42,
+  "panels": [
+    {
+      "type": "timeseries",
+      "title": "Requests per Second (RPS)",
+      "gridPos": {"x":0,"y":0,"w":12,"h":8},
+      "datasource": "prometheus",
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(http_requests_total[2m]))" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Error Rate (%)",
+      "gridPos": {"x":12,"y":0,"w":12,"h":8},
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(http_requests_total{status=~\"5..\"}[2m])) / sum(rate(http_requests_total[2m])) * 100" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Latency P95 (s)",
+      "gridPos": {"x":0,"y":8,"w":24,"h":8},
+      "datasource": "prometheus",
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[2m])))"
+        }
+      ]
+    }
+  ]
+}
diff --git a/observabilite/observability/vm_observabilite/observability/grafana/dashboards/dev/01_logs_by_service.json b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/dev/01_logs_by_service.json
new file mode 100644
index 0000000..ce2f50c
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/dev/01_logs_by_service.json
@@ -0,0 +1,58 @@
+{
+  "title": "Logs by Service (Dev)",
+  "uid": "dev-logs-by-service",
+  "tags": ["dev","logs","loki"],
+  "time": { "from": "now-1h", "to": "now" },
+  "schemaVersion": 42,
+  "templating": {
+    "list": [
+      {
+        "name": "service",
+        "label": "Service (label app)",
+        "type": "query",
+        "datasource": "loki",
+        "query": "label_values({app!=\"\"}, app)",
+        "includeAll": true,
+        "multi": true,
+        "refresh": 2,
+        "current": {}
+      }
+    ]
+  },
+  "panels": [
+    {
+      "type": "logs",
+      "title": "Logs – $service",
+      "gridPos": { "x": 0, "y": 0, "w": 24, "h": 12 },
+      "datasource": "loki",
+      "options": {
+        "showLabels": true,
+        "showTime": true,
+        "wrapLogMessage": true,
+        "prettifyLogMessage": true
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "{app=~\"$service\"}"
+        }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Error rate (logs/min) – $service",
+      "gridPos": { "x": 0, "y": 12, "w": 24, "h": 8 },
+      "datasource": "loki",
+      "fieldConfig": { "defaults": { "unit": "opm" }, "overrides": [] },
+      "options": {
+        "legend": { "showLegend": true, "placement": "bottom" }
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum by (app)(rate({app=~\"$service\"} |~ \"(?i)(error|exception|fail|timeout)\"[5m])) * 60"
+        }
+      ]
+    }
+  ]
+}
diff --git a/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/00_infra_overview.json b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/00_infra_overview.json
new file mode 100644
index 0000000..018caaf
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/00_infra_overview.json
@@ -0,0 +1,49 @@
+{
+  "title": "Infra Overview (Ops)",
+  "uid": "ops-infra-overview",
+  "tags": ["ops","infrastructure"],
+  "time": { "from": "now-1h", "to": "now" },
+  "schemaVersion": 42,
+  "panels": [
+    {
+      "type": "timeseries",
+      "title": "CPU Utilization (%)",
+      "gridPos": {"x":0,"y":0,"w":12,"h":8},
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "100 - (avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[2m]))*100)" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Memory Utilization (%)",
+      "gridPos": {"x":12,"y":0,"w":12,"h":8},
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Disk Space Used (bytes)",
+      "gridPos": {"x":0,"y":8,"w":12,"h":8},
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "node_filesystem_size_bytes{fstype!~\"tmpfs|overlay\"} - node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay\"}" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Host Power (Watts) - Scaphandre",
+      "gridPos": {"x":12,"y":8,"w":12,"h":8},
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "watt" }, "overrides": [] },
+      "targets": [
+        { "refId": "A", "expr": "scaph_host_power_microwatts / 1e6" }
+      ]
+    }
+  ]
+}
diff --git a/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/03_prometheus_health.json b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/03_prometheus_health.json
new file mode 100644
index 0000000..4299647
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/03_prometheus_health.json
@@ -0,0 +1,86 @@
+{
+  "title": "Prometheus Health (Ops)",
+  "uid": "ops-prom-health",
+  "tags": ["ops","prometheus","health"],
+  "time": { "from": "now-1h", "to": "now" },
+  "schemaVersion": 42,
+  "panels": [
+    {
+      "type": "stat",
+      "title": "Targets DOWN (total)",
+      "gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 },
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+      "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "" } },
+      "targets": [
+        { "refId": "A", "expr": "sum(1 - up)" }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Alerts firing",
+      "gridPos": { "x": 6, "y": 0, "w": 6, "h": 4 },
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+      "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "" } },
+      "targets": [
+        { "refId": "A", "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)" }
+      ]
+    },
+    {
+      "type": "stat",
+      "title": "Ingest rate (samples/s)",
+      "gridPos": { "x": 12, "y": 0, "w": 12, "h": 4 },
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
+      "options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "" } },
+      "targets": [
+        { "refId": "A", "expr": "sum(rate(prometheus_tsdb_head_samples_appended_total[5m]))" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Targets UP by job",
+      "gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
+      "options": { "legend": { "showLegend": true, "placement": "bottom" } },
+      "targets": [
+        { "refId": "A", "expr": "sum by(job)(up)" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Scrape duration (s) by job",
+      "gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 },
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "options": { "legend": { "showLegend": true, "placement": "bottom" } },
+      "targets": [
+        { "refId": "A", "expr": "avg by(job)(scrape_duration_seconds)" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "Rule group eval duration (s)",
+      "gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 },
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
+      "options": { "legend": { "showLegend": true, "placement": "bottom" } },
+      "targets": [
+        { "refId": "A", "expr": "max by(rule_group) (prometheus_rule_group_last_duration_seconds)" }
+      ]
+    },
+    {
+      "type": "timeseries",
+      "title": "TSDB head chunks",
+      "gridPos": { "x": 12, "y": 12, "w": 12, "h": 8 },
+      "datasource": "prometheus",
+      "fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
+      "options": { "legend": { "showLegend": true, "placement": "bottom" } },
+      "targets": [
+        { "refId": "A", "expr": "prometheus_tsdb_head_chunks" }
+      ]
+    }
+  ]
+}
diff --git a/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/04_logs_loki_ops.json b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/04_logs_loki_ops.json
new file mode 100644
index 0000000..41dc7d4
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/dashboards/ops/04_logs_loki_ops.json
@@ -0,0 +1,65 @@
+{
+  "title": "Logs – Ops (Errors & System)",
+  "uid": "ops-logs-errors",
+  "tags": ["ops","logs","loki"],
+  "time": { "from": "now-2h", "to": "now" },
+  "schemaVersion": 42,
+  "templating": {
+    "list": [
+      {
+        "name": "job",
+        "label": "Job",
+        "type": "query",
+        "datasource": "loki",
+        "query": "label_values(job)",
+        "includeAll": true,
+        "multi": true,
+        "refresh": 2
+      },
+      {
+        "name": "instance",
+        "label": "Instance",
+        "type": "query",
+        "datasource": "loki",
+        "query": "label_values({job=~\"$job\"}, instance)",
+        "includeAll": true,
+        "multi": true,
+        "refresh": 2
+      }
+    ]
+  },
+  "panels": [
+    {
+      "type": "timeseries",
+      "title": "Error rate by instance (logs/min)",
+      "gridPos": { "x": 0, "y": 0, "w": 24, "h": 8 },
+      "datasource": "loki",
+      "fieldConfig": { "defaults": { "unit": "opm" }, "overrides": [] },
+      "options": { "legend": { "showLegend": true, "placement": "bottom" } },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "sum by (instance)(rate({job=~\"$job\", instance=~\"$instance\"} |~ \"(?i)(critical|error|err)\"[5m])) * 60"
+        }
+      ]
+    },
+    {
+      "type": "logs",
+      "title": "Recent critical & errors – $job / $instance",
+      "gridPos": { "x": 0, "y": 8, "w": 24, "h": 14 },
+      "datasource": "loki",
+      "options": {
+        "showLabels": true,
+        "showTime": true,
+        "wrapLogMessage": true,
+        "prettifyLogMessage": true
+      },
+      "targets": [
+        {
+          "refId": "A",
+          "expr": "{job=~\"$job\", instance=~\"$instance\"} |~ \"(?i)(critical|error|err|panic|oom)\""
+        }
+      ]
+    }
+  ]
+}
diff --git a/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/folders.yaml b/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/folders.yaml
new file mode 100644
index 0000000..4901783
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/folders.yaml
@@ -0,0 +1,7 @@
+apiVersion: 1
+
+folders:
+  - uid: ops-folder
+    title: "Ops – Infrastructure & Plateforme"
+  - uid: dev-folder
+    title: "Dev – Application & Qualité"
diff --git a/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/provider-dev.yaml b/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/provider-dev.yaml
new file mode 100644
index 0000000..a997263
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/provider-dev.yaml
@@ -0,0 +1,12 @@
+apiVersion: 1
+providers:
+  - name: "dev-dashboards"
+    orgId: 1
+    folder: "Dev – Application & Qualité"
+    folderUid: "dev-folder"
+    type: file
+    disableDeletion: false
+    editable: true
+    updateIntervalSeconds: 30
+    options:
+      path: /var/lib/grafana/dashboards/dev
diff --git a/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/provider-ops.yaml b/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/provider-ops.yaml
new file mode 100644
index 0000000..091b380
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/provisioning/dashboards/provider-ops.yaml
@@ -0,0 +1,12 @@
+apiVersion: 1
+providers:
+  - name: "ops-dashboards"
+    orgId: 1
+    folder: "Ops – Infrastructure & Plateforme"
+    folderUid: "ops-folder"
+    type: file
+    disableDeletion: false
+    editable: true
+    updateIntervalSeconds: 30
+    options:
+      path: /var/lib/grafana/dashboards/ops
diff --git a/observabilite/observability/vm_observabilite/observability/grafana/provisioning/datasources/prometheus.yaml b/observabilite/observability/vm_observabilite/observability/grafana/provisioning/datasources/prometheus.yaml
new file mode 100644
index 0000000..3c25da3
--- /dev/null
+++ b/observabilite/observability/vm_observabilite/observability/grafana/provisioning/datasources/prometheus.yaml
@@ -0,0 +1,8 @@
+apiVersion: 1
+datasources:
+  - name: prometheus
+    type: prometheus
+    access: proxy
+    url: http://prometheus:9090
+    isDefault: true
+    editable: true