04/12/25
This commit is contained in:
parent
91bc3c856f
commit
f2a051576d
@ -0,0 +1,40 @@
|
|||||||
|
{
|
||||||
|
"title": "App Runtime Overview (Dev)",
|
||||||
|
"uid": "dev-app-runtime-overview",
|
||||||
|
"tags": ["dev","application"],
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"schemaVersion": 42,
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Requests per Second (RPS)",
|
||||||
|
"gridPos": {"x":0,"y":0,"w":12,"h":8},
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "sum(rate(http_requests_total[2m]))" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Error Rate (%)",
|
||||||
|
"gridPos": {"x":12,"y":0,"w":12,"h":8},
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "sum(rate(http_requests_total{status=~\"5..\"}[2m])) / sum(rate(http_requests_total[2m])) * 100" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Latency P95 (s)",
|
||||||
|
"gridPos": {"x":0,"y":8,"w":24,"h":8},
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[2m])))"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -0,0 +1,58 @@
|
|||||||
|
{
|
||||||
|
"title": "Logs by Service (Dev)",
|
||||||
|
"uid": "dev-logs-by-service",
|
||||||
|
"tags": ["dev","logs","loki"],
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"schemaVersion": 42,
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "service",
|
||||||
|
"label": "Service (label app)",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": "loki",
|
||||||
|
"query": "label_values({app!=\"\"}, app)",
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"refresh": 2,
|
||||||
|
"current": {}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"type": "logs",
|
||||||
|
"title": "Logs – $service",
|
||||||
|
"gridPos": { "x": 0, "y": 0, "w": 24, "h": 12 },
|
||||||
|
"datasource": "loki",
|
||||||
|
"options": {
|
||||||
|
"showLabels": true,
|
||||||
|
"showTime": true,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"prettifyLogMessage": true
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "{app=~\"$service\"}"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Error rate (logs/min) – $service",
|
||||||
|
"gridPos": { "x": 0, "y": 12, "w": 24, "h": 8 },
|
||||||
|
"datasource": "loki",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "opm" }, "overrides": [] },
|
||||||
|
"options": {
|
||||||
|
"legend": { "showLegend": true, "placement": "bottom" }
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "sum by (app)(rate({app=~\"$service\"} |~ \"(?i)(error|exception|fail|timeout)\"[5m])) * 60"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -0,0 +1,49 @@
|
|||||||
|
{
|
||||||
|
"title": "Infra Overview (Ops)",
|
||||||
|
"uid": "ops-infra-overview",
|
||||||
|
"tags": ["ops","infrastructure"],
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"schemaVersion": 42,
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "CPU Utilization (%)",
|
||||||
|
"gridPos": {"x":0,"y":0,"w":12,"h":8},
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "100 - (avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[2m]))*100)" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Memory Utilization (%)",
|
||||||
|
"gridPos": {"x":12,"y":0,"w":12,"h":8},
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Disk Space Used (bytes)",
|
||||||
|
"gridPos": {"x":0,"y":8,"w":12,"h":8},
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "node_filesystem_size_bytes{fstype!~\"tmpfs|overlay\"} - node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay\"}" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Host Power (Watts) - Scaphandre",
|
||||||
|
"gridPos": {"x":12,"y":8,"w":12,"h":8},
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "watt" }, "overrides": [] },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "scaph_host_power_microwatts / 1e6" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -0,0 +1,86 @@
|
|||||||
|
{
|
||||||
|
"title": "Prometheus Health (Ops)",
|
||||||
|
"uid": "ops-prom-health",
|
||||||
|
"tags": ["ops","prometheus","health"],
|
||||||
|
"time": { "from": "now-1h", "to": "now" },
|
||||||
|
"schemaVersion": 42,
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Targets DOWN (total)",
|
||||||
|
"gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 },
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "" } },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "sum(1 - up)" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Alerts firing",
|
||||||
|
"gridPos": { "x": 6, "y": 0, "w": 6, "h": 4 },
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "" } },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "count(ALERTS{alertstate=\"firing\"}) or vector(0)" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "stat",
|
||||||
|
"title": "Ingest rate (samples/s)",
|
||||||
|
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 4 },
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
|
||||||
|
"options": { "reduceOptions": { "calcs": ["lastNotNull"] } },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "rate(prometheus_tsdb_head_samples_appended_total[5m])" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Targets UP by job",
|
||||||
|
"gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
|
||||||
|
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "sum by(job)(up)" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Scrape duration (s) by job",
|
||||||
|
"gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 },
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||||
|
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "avg by(job)(scrape_duration_seconds)" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Rule group eval duration (s)",
|
||||||
|
"gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 },
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
|
||||||
|
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "max by(rule_group) (prometheus_rule_group_last_duration_seconds)" }
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "TSDB head chunks",
|
||||||
|
"gridPos": { "x": 12, "y": 12, "w": 12, "h": 8 },
|
||||||
|
"datasource": "prometheus",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
|
||||||
|
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
|
||||||
|
"targets": [
|
||||||
|
{ "refId": "A", "expr": "prometheus_tsdb_head_chunks" }
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -0,0 +1,65 @@
|
|||||||
|
{
|
||||||
|
"title": "Logs – Ops (Errors & System)",
|
||||||
|
"uid": "ops-logs-errors",
|
||||||
|
"tags": ["ops","logs","loki"],
|
||||||
|
"time": { "from": "now-2h", "to": "now" },
|
||||||
|
"schemaVersion": 42,
|
||||||
|
"templating": {
|
||||||
|
"list": [
|
||||||
|
{
|
||||||
|
"name": "job",
|
||||||
|
"label": "Job",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": "loki",
|
||||||
|
"query": "label_values(job)",
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"refresh": 2
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "instance",
|
||||||
|
"label": "Instance",
|
||||||
|
"type": "query",
|
||||||
|
"datasource": "loki",
|
||||||
|
"query": "label_values({job=~\"$job\"}, instance)",
|
||||||
|
"includeAll": true,
|
||||||
|
"multi": true,
|
||||||
|
"refresh": 2
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
"panels": [
|
||||||
|
{
|
||||||
|
"type": "timeseries",
|
||||||
|
"title": "Error rate by instance (logs/min)",
|
||||||
|
"gridPos": { "x": 0, "y": 0, "w": 24, "h": 8 },
|
||||||
|
"datasource": "loki",
|
||||||
|
"fieldConfig": { "defaults": { "unit": "opm" }, "overrides": [] },
|
||||||
|
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "sum by (instance)(rate({job=~\"$job\", instance=~\"$instance\"} |~ \"(?i)(critical|error|err)\"[5m])) * 60"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"type": "logs",
|
||||||
|
"title": "Recent critical & errors – $job / $instance",
|
||||||
|
"gridPos": { "x": 0, "y": 8, "w": 24, "h": 14 },
|
||||||
|
"datasource": "loki",
|
||||||
|
"options": {
|
||||||
|
"showLabels": true,
|
||||||
|
"showTime": true,
|
||||||
|
"wrapLogMessage": true,
|
||||||
|
"prettifyLogMessage": true
|
||||||
|
},
|
||||||
|
"targets": [
|
||||||
|
{
|
||||||
|
"refId": "A",
|
||||||
|
"expr": "{job=~\"$job\", instance=~\"$instance\"} |~ \"(?i)(critical|error|err|panic|oom)\""
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
@ -0,0 +1,7 @@
|
|||||||
|
apiVersion: 1
|
||||||
|
|
||||||
|
folders:
|
||||||
|
- uid: ops-folder
|
||||||
|
title: "Ops – Infrastructure & Plateforme"
|
||||||
|
- uid: dev-folder
|
||||||
|
title: "Dev – Application & Qualité"
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
apiVersion: 1
|
||||||
|
providers:
|
||||||
|
- name: "dev-dashboards"
|
||||||
|
orgId: 1
|
||||||
|
folderUid: "dev-folder"
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
editable: true
|
||||||
|
updateIntervalSeconds: 30
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards/dev
|
||||||
|
foldersFromFilesStructure: true
|
||||||
@ -0,0 +1,12 @@
|
|||||||
|
apiVersion: 1
|
||||||
|
providers:
|
||||||
|
- name: "ops-dashboards"
|
||||||
|
orgId: 1
|
||||||
|
folderUid: "ops-folder"
|
||||||
|
type: file
|
||||||
|
disableDeletion: false
|
||||||
|
editable: true
|
||||||
|
updateIntervalSeconds: 30
|
||||||
|
options:
|
||||||
|
path: /var/lib/grafana/dashboards/ops
|
||||||
|
foldersFromFilesStructure: true
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
apiVersion: 1
|
||||||
|
datasources:
|
||||||
|
- name: prometheus
|
||||||
|
type: prometheus
|
||||||
|
access: proxy
|
||||||
|
url: http://prometheus:9090
|
||||||
|
isDefault: true
|
||||||
|
editable: true
|
||||||
Loading…
x
Reference in New Issue
Block a user