This commit is contained in:
Ornel_Zply 2025-12-18 09:53:28 +01:00
parent 695d9d7643
commit d3ea7355f0
18 changed files with 563 additions and 0 deletions

View File

@ -0,0 +1,82 @@
# Observability stack: Prometheus (metrics), Grafana (dashboards/alerting),
# Loki + Promtail (logs) and node_exporter (host metrics).
#
# Secrets are NOT stored in this file. Provide them through the shell
# environment or an untracked `.env` file placed next to this compose file:
#   GF_SECURITY_ADMIN_PASSWORD, GF_SMTP_USER, GF_SMTP_PASSWORD
# SECURITY: an SMTP app password was previously committed in this file; it
# must be considered compromised and revoked at the mail provider.
services:
  prometheus:
    # NOTE(review): `latest` is not reproducible; consider pinning a version.
    image: prom/prometheus:latest
    container_name: prometheus-observability
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
    ports:
      - "9090:9090"
    networks:
      - observability

  grafana:
    image: grafana/grafana:latest
    container_name: grafana-observability
    ports:
      - "3000:3000"
    volumes:
      - grafana-data:/var/lib/grafana
      - ./observability/grafana/provisioning/dashboards:/etc/grafana/provisioning/dashboards:ro
      - ./observability/grafana/provisioning/datasources:/etc/grafana/provisioning/datasources:ro
      - ./observability/grafana/dashboards:/var/lib/grafana/dashboards:ro
    environment:
      - GF_SECURITY_ADMIN_USER=${GF_SECURITY_ADMIN_USER:-admin}
      # `:?` makes `docker compose` abort with this message when the variable
      # is unset or empty, instead of silently starting with a weak default.
      - GF_SECURITY_ADMIN_PASSWORD=${GF_SECURITY_ADMIN_PASSWORD:?set GF_SECURITY_ADMIN_PASSWORD in .env}
      - GF_SMTP_ENABLED=true
      - GF_SMTP_HOST=smtp.gmail.com:587
      - GF_SMTP_USER=${GF_SMTP_USER:?set GF_SMTP_USER in .env}
      - GF_SMTP_PASSWORD=${GF_SMTP_PASSWORD:?set GF_SMTP_PASSWORD in .env}
      # The sender address is the SMTP account itself, as before.
      - GF_SMTP_FROM_ADDRESS=${GF_SMTP_USER:?set GF_SMTP_USER in .env}
      - GF_SMTP_FROM_NAME=Grafana Alerts
      # TLS verification stays on by default: skipping it would expose the
      # SMTP credentials to a man-in-the-middle. Set GF_SMTP_SKIP_VERIFY=true
      # in .env only as a temporary workaround for certificate problems
      # inside the container.
      - GF_SMTP_SKIP_VERIFY=${GF_SMTP_SKIP_VERIFY:-false}
    networks:
      - observability
    depends_on:
      - loki

  loki:
    image: grafana/loki:2.8.2
    container_name: loki-observability
    ports:
      - "3100:3100"
    command: -config.file=/etc/loki/local-config.yaml
    volumes:
      - ./loki-config.yaml:/etc/loki/local-config.yaml
      - ./loki-wal:/wal
      - ./loki-chunks:/loki/chunks
      - ./loki-index:/loki/index
    networks:
      - observability

  promtail:
    image: grafana/promtail:2.8.2
    container_name: promtail-observability
    volumes:
      - ./promtail-config.yaml:/etc/promtail/config.yaml
      - /var/lib/docker/containers:/var/lib/docker/containers:ro
      - /var/log:/var/log:ro
    command:
      - -config.file=/etc/promtail/config.yaml
    # depends_on:
    #   - loki
    networks:
      - observability

  node_exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    restart: unless-stopped
    # Host PID namespace and host networking so the exporter reports on the
    # host rather than on the container.
    pid: "host"
    network_mode: "host"
    volumes:
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # The host root is bind-mounted at /rootfs; the exporter must be told so
      # that filesystem statistics are taken from the host mount points.
      - '--path.rootfs=/rootfs'
      # `mount-points-exclude` replaces the deprecated `ignored-mount-points`
      # flag. The regex is passed without inner double quotes: arguments are
      # given in exec form (no shell), so quotes would become part of the
      # pattern. `$$` is Compose's escape for a literal `$`.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'

volumes:
  grafana-data:

networks:
  observability:
    driver: bridge

View File

@ -0,0 +1 @@
{"UID":"714e0dc1-bca9-44e1-aca6-110f8b49de5c","created_at":"2025-09-29T13:46:47.834317171Z","version":{"version":"2.8.2","revision":"9f809eda7","branch":"HEAD","buildUser":"root@e401cfcb874f","buildDate":"2023-05-03T11:07:54Z","goVersion":"go1.20.4"}}

View File

@ -0,0 +1,59 @@
---
# Loki configuration for a single-instance deployment: authentication is
# disabled, chunks/index live on the local filesystem and the ring is kept
# in memory.
auth_enabled: false

server:
  http_listen_port: 3100
  grpc_listen_port: 9096

common:
  path_prefix: "/loki"
  storage:
    filesystem:
      chunks_directory: "/loki/chunks"
      rules_directory: "/loki/rules"
  replication_factor: 1
  ring:
    instance_addr: "127.0.0.1"
    kvstore:
      store: "inmemory"

ingester:
  # Write-ahead log, stored under /wal.
  wal:
    enabled: true
    dir: "/wal"
    flush_on_shutdown: true
  chunk_idle_period: "5m"
  chunk_retain_period: "30s"
  max_chunk_age: "1h"
  lifecycler:
    ring:
      replication_factor: 1

schema_config:
  configs:
    - from: "2020-10-24"
      store: "boltdb-shipper"
      object_store: "filesystem"
      schema: "v11"
      index:
        prefix: "index_"
        period: "24h"

storage_config:
  # NOTE(review): the active index and its cache share the same directory;
  # confirm this is intended.
  boltdb_shipper:
    active_index_directory: "/loki/index"
    cache_location: "/loki/index"
    shared_store: "filesystem"
  filesystem:
    directory: "/loki/chunks"

limits_config:
  enforce_metric_name: false
  reject_old_samples: true
  reject_old_samples_max_age: "168h"
  ingestion_rate_mb: 10
  ingestion_burst_size_mb: 20

compactor:
  working_directory: "/loki/compactor"
  shared_store: "filesystem"
  compaction_interval: "10m"

View File

@ -0,0 +1 @@
{"UID":"714e0dc1-bca9-44e1-aca6-110f8b49de5c","created_at":"2025-09-29T13:46:47.834317171Z","version":{"version":"2.8.2","revision":"9f809eda7","branch":"HEAD","buildUser":"root@e401cfcb874f","buildDate":"2023-05-03T11:07:54Z","goVersion":"go1.20.4"}}

View File

View File

@ -0,0 +1,40 @@
{
"title": "App Runtime Overview (Dev)",
"uid": "dev-app-runtime-overview",
"tags": ["dev","application"],
"time": { "from": "now-1h", "to": "now" },
"schemaVersion": 42,
"panels": [
{
"type": "timeseries",
"title": "Requests per Second (RPS)",
"gridPos": {"x":0,"y":0,"w":12,"h":8},
"datasource": "prometheus",
"targets": [
{ "refId": "A", "expr": "sum(rate(http_requests_total[2m]))" }
]
},
{
"type": "timeseries",
"title": "Error Rate (%)",
"gridPos": {"x":12,"y":0,"w":12,"h":8},
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
"targets": [
{ "refId": "A", "expr": "sum(rate(http_requests_total{status=~\"5..\"}[2m])) / sum(rate(http_requests_total[2m])) * 100" }
]
},
{
"type": "timeseries",
"title": "Latency P95 (s)",
"gridPos": {"x":0,"y":8,"w":24,"h":8},
"datasource": "prometheus",
"targets": [
{
"refId": "A",
"expr": "histogram_quantile(0.95, sum by (le) (rate(http_request_duration_seconds_bucket[2m])))"
}
]
}
]
}

View File

@ -0,0 +1,58 @@
{
"title": "Logs by Service (Dev)",
"uid": "dev-logs-by-service",
"tags": ["dev","logs","loki"],
"time": { "from": "now-1h", "to": "now" },
"schemaVersion": 42,
"templating": {
"list": [
{
"name": "service",
"label": "Service (label app)",
"type": "query",
"datasource": "loki",
"query": "label_values({app!=\"\"}, app)",
"includeAll": true,
"multi": true,
"refresh": 2,
"current": {}
}
]
},
"panels": [
{
"type": "logs",
"title": "Logs $service",
"gridPos": { "x": 0, "y": 0, "w": 24, "h": 12 },
"datasource": "loki",
"options": {
"showLabels": true,
"showTime": true,
"wrapLogMessage": true,
"prettifyLogMessage": true
},
"targets": [
{
"refId": "A",
"expr": "{app=~\"$service\"}"
}
]
},
{
"type": "timeseries",
"title": "Error rate (logs/min) $service",
"gridPos": { "x": 0, "y": 12, "w": 24, "h": 8 },
"datasource": "loki",
"fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
"options": {
"legend": { "showLegend": true, "placement": "bottom" }
},
"targets": [
{
"refId": "A",
"expr": "sum by (app)(rate({app=~\"$service\"} |~ \"(?i)(error|exception|fail|timeout)\"[5m])) * 60"
}
]
}
]
}

View File

@ -0,0 +1,49 @@
{
"title": "Infra Overview (Ops)",
"uid": "ops-infra-overview",
"tags": ["ops","infrastructure"],
"time": { "from": "now-1h", "to": "now" },
"schemaVersion": 42,
"panels": [
{
"type": "timeseries",
"title": "CPU Utilization (%)",
"gridPos": {"x":0,"y":0,"w":12,"h":8},
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
"targets": [
{ "refId": "A", "expr": "100 - (avg by(instance)(rate(node_cpu_seconds_total{mode=\"idle\"}[2m]))*100)" }
]
},
{
"type": "timeseries",
"title": "Memory Utilization (%)",
"gridPos": {"x":12,"y":0,"w":12,"h":8},
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "percent" }, "overrides": [] },
"targets": [
{ "refId": "A", "expr": "((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes) * 100" }
]
},
{
"type": "timeseries",
"title": "Disk Space Used (bytes)",
"gridPos": {"x":0,"y":8,"w":12,"h":8},
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "bytes" }, "overrides": [] },
"targets": [
{ "refId": "A", "expr": "node_filesystem_size_bytes{fstype!~\"tmpfs|overlay\"} - node_filesystem_avail_bytes{fstype!~\"tmpfs|overlay\"}" }
]
},
{
"type": "timeseries",
"title": "Host Power (Watts) - Scaphandre",
"gridPos": {"x":12,"y":8,"w":12,"h":8},
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "watt" }, "overrides": [] },
"targets": [
{ "refId": "A", "expr": "scaph_host_power_microwatts / 1e6" }
]
}
]
}

View File

@ -0,0 +1,86 @@
{
"title": "Prometheus Health (Ops)",
"uid": "ops-prom-health",
"tags": ["ops","prometheus","health"],
"time": { "from": "now-1h", "to": "now" },
"schemaVersion": 42,
"panels": [
{
"type": "stat",
"title": "Targets DOWN (total)",
"gridPos": { "x": 0, "y": 0, "w": 6, "h": 4 },
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "" } },
"targets": [
{ "refId": "A", "expr": "sum(1 - up)" }
]
},
{
"type": "stat",
"title": "Alerts firing",
"gridPos": { "x": 6, "y": 0, "w": 6, "h": 4 },
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"], "fields": "" } },
"targets": [
{ "refId": "A", "expr": "count(ALERTS{alertstate=\"firing\"})" }
]
},
{
"type": "stat",
"title": "Ingest rate (samples/s)",
"gridPos": { "x": 12, "y": 0, "w": 12, "h": 4 },
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
"options": { "reduceOptions": { "calcs": ["lastNotNull"] } },
"targets": [
{ "refId": "A", "expr": "rate(prometheus_tsdb_head_samples_appended_total[5m])" }
]
},
{
"type": "timeseries",
"title": "Targets UP by job",
"gridPos": { "x": 0, "y": 4, "w": 12, "h": 8 },
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "none" }, "overrides": [] },
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
"targets": [
{ "refId": "A", "expr": "sum by(job)(up)" }
]
},
{
"type": "timeseries",
"title": "Scrape duration (s) by job",
"gridPos": { "x": 12, "y": 4, "w": 12, "h": 8 },
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
"targets": [
{ "refId": "A", "expr": "avg by(job)(scrape_duration_seconds)" }
]
},
{
"type": "timeseries",
"title": "Rule group eval duration (s)",
"gridPos": { "x": 0, "y": 12, "w": 12, "h": 8 },
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "s" }, "overrides": [] },
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
"targets": [
{ "refId": "A", "expr": "max by(rule_group) (prometheus_rule_group_last_duration_seconds)" }
]
},
{
"type": "timeseries",
"title": "TSDB head chunks",
"gridPos": { "x": 12, "y": 12, "w": 12, "h": 8 },
"datasource": "prometheus",
"fieldConfig": { "defaults": { "unit": "short" }, "overrides": [] },
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
"targets": [
{ "refId": "A", "expr": "prometheus_tsdb_head_chunks" }
]
}
]
}

View File

@ -0,0 +1,65 @@
{
"title": "Logs Ops (Errors & System)",
"uid": "ops-logs-errors",
"tags": ["ops","logs","loki"],
"time": { "from": "now-2h", "to": "now" },
"schemaVersion": 42,
"templating": {
"list": [
{
"name": "job",
"label": "Job",
"type": "query",
"datasource": "loki",
"query": "label_values(job)",
"includeAll": true,
"multi": true,
"refresh": 2
},
{
"name": "instance",
"label": "Instance",
"type": "query",
"datasource": "loki",
"query": "label_values({job=~\"$job\"}, instance)",
"includeAll": true,
"multi": true,
"refresh": 2
}
]
},
"panels": [
{
"type": "timeseries",
"title": "Error rate by instance (logs/min)",
"gridPos": { "x": 0, "y": 0, "w": 24, "h": 8 },
"datasource": "loki",
"fieldConfig": { "defaults": { "unit": "ops" }, "overrides": [] },
"options": { "legend": { "showLegend": true, "placement": "bottom" } },
"targets": [
{
"refId": "A",
"expr": "sum by (instance)(rate({job=~\"$job\", instance=~\"$instance\"} |~ \"(?i)(critical|error|err)\"[5m])) * 60"
}
]
},
{
"type": "logs",
"title": "Recent critical & errors $job / $instance",
"gridPos": { "x": 0, "y": 8, "w": 24, "h": 14 },
"datasource": "loki",
"options": {
"showLabels": true,
"showTime": true,
"wrapLogMessage": true,
"prettifyLogMessage": true
},
"targets": [
{
"refId": "A",
"expr": "{job=~\"$job\", instance=~\"$instance\"} |~ \"(?i)(critical|error|err|panic|oom)\""
}
]
}
]
}

View File

@ -0,0 +1,7 @@
---
# Declares the two Grafana folders (uid + display title) used to separate the
# Ops and Dev dashboards.
# NOTE(review): confirm that Grafana actually consumes a top-level `folders:`
# key from a provisioning file; it may be ignored.
apiVersion: 1
folders:
  - uid: 'ops-folder'
    title: 'Ops Infrastructure & Plateforme'
  - uid: 'dev-folder'
    title: 'Dev Application & Qualité'

View File

@ -0,0 +1,12 @@
# Grafana dashboard provider for the Dev dashboards.
# Loads every dashboard JSON found under `options.path` into a single,
# explicitly named folder.
apiVersion: 1
providers:
  - name: "dev-dashboards"
    orgId: 1
    # Folder title and uid are given together so the dashboards land in a
    # dedicated folder; with a uid but no title the target folder is not
    # created as intended.
    folder: "Dev Application & Qualité"
    folderUid: "dev-folder"
    type: file
    disableDeletion: false
    editable: true
    # Re-scan the directory for changed dashboard files every 30 seconds.
    updateIntervalSeconds: 30
    options:
      path: /var/lib/grafana/dashboards/dev
      # Must be false here: Grafana only honours foldersFromFilesStructure
      # when `folder` and `folderUid` are both empty.
      foldersFromFilesStructure: false

View File

@ -0,0 +1,12 @@
# Grafana dashboard provider for the Ops dashboards.
# Loads every dashboard JSON found under `options.path` into a single,
# explicitly named folder.
apiVersion: 1
providers:
  - name: "ops-dashboards"
    orgId: 1
    # Folder title and uid are given together so the dashboards land in a
    # dedicated folder; with a uid but no title the target folder is not
    # created as intended.
    folder: "Ops Infrastructure & Plateforme"
    folderUid: "ops-folder"
    type: file
    disableDeletion: false
    editable: true
    # Re-scan the directory for changed dashboard files every 30 seconds.
    updateIntervalSeconds: 30
    options:
      path: /var/lib/grafana/dashboards/ops
      # Must be false here: Grafana only honours foldersFromFilesStructure
      # when `folder` and `folderUid` are both empty.
      foldersFromFilesStructure: false

View File

@ -0,0 +1,8 @@
# Grafana datasource provisioning.
# Dashboards reference datasources by name, so the names below ("prometheus",
# "loki") must match the "datasource" fields of the dashboard JSON files.
apiVersion: 1
datasources:
  - name: prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090
    isDefault: true
    editable: true
  # Required by the log dashboards and their template variables, which all
  # query a datasource named "loki".
  - name: loki
    uid: loki
    type: loki
    access: proxy
    url: http://loki:3100
    isDefault: false
    editable: true

View File

@ -0,0 +1,30 @@
---
# Prometheus scrape configuration: one job per exporter family, all with
# static targets.
global:
  scrape_interval: "15s"

scrape_configs:
  # Prometheus scraping itself.
  - job_name: "prometheus"
    static_configs:
      - targets:
          - "localhost:9090"

  # Observability machine (IP of the host running this stack).
  - job_name: "observabilite"
    static_configs:
      - targets:
          - "192.168.4.4:9100"

  - job_name: "scaphandre"
    static_configs:
      - targets:
          - "192.168.4.4:8080"
    fallback_scrape_protocol: "PrometheusText1.0.0"

  # vmServices
  - job_name: "apache_vmservices"
    static_configs:
      - targets:
          - "192.168.56.17:9117"

  - job_name: "vms"
    static_configs:
      - targets:
          - "192.168.56.18:9100"  # vmHardware
          - "192.168.56.17:9100"  # vmServices
          - "192.168.56.15:9100"  # vmApplicatifs

  # vmApplicatifs
  - job_name: "tomcat"
    static_configs:
      - targets:
          - "192.168.56.15:9082"

View File

@ -0,0 +1,26 @@
---
# Promtail configuration: tails Docker container logs and host log files and
# pushes them to Loki.
server:
  http_listen_port: 9080
  grpc_listen_port: 0

# File in which Promtail records how far it has read in each log file.
positions:
  filename: "/tmp/positions.yaml"

clients:
  - url: "http://loki:3100/loki/api/v1/push"

scrape_configs:
  # Docker container log files, labelled job="docker".
  - job_name: "docker_logs"
    static_configs:
      - targets: ["localhost"]
        labels:
          job: "docker"
          __path__: "/var/lib/docker/containers/*/*.log"

  # Host log files directly under /var/log, labelled job="syslog".
  - job_name: "system_logs"
    static_configs:
      - targets: ["localhost"]
        labels:
          job: "syslog"
          __path__: "/var/log/*.log"

27
note/commande_docker.md Normal file
View File

@ -0,0 +1,27 @@
# Méthode 1
# Arrêter tous les containers
docker stop $(docker ps -aq)
# Supprimer tous les containers
docker rm -f $(docker ps -aq)
# Supprimer toutes les images
docker rmi -f $(docker images -q)
# Supprimer tous les volumes
docker volume rm $(docker volume ls -q)
# Supprimer tous les réseaux personnalisés
docker network rm $(docker network ls -q | grep -v "bridge\|host\|none")
# Vérifier que tout est propre
docker system df
# Méthode 2
docker system prune -a --volumes --force
# Pour construire
docker compose up --build