
Alerting with Alertmanager

Introduction

Alertmanager handles the alerts generated by Prometheus: deduplication, grouping, silencing, and routing to notification channels (email, Slack, PagerDuty).
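
A quick way to confirm the pipeline is wired up is to query Alertmanager's health endpoint before touching any configuration. A minimal sketch, assuming Alertmanager listens on 10.0.0.10:9093 as in the examples below:

# Returns HTTP 200 when the Alertmanager process is healthy
curl -s -o /dev/null -w "%{http_code}\n" http://10.0.0.10:9093/-/healthy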

Prerequisites

Learning objectives

Alerting architecture

graph TB
    prometheus[Prometheus<br/>Evaluates rules<br/>Generates alerts]
    alertmanager[Alertmanager<br/>Receives alerts<br/>Dedup, group<br/>Route]
    silences[Silences<br/>Temporary mute]
    inhibit[Inhibitions<br/>Conditional suppression]

    subgraph Receivers
        email[Email<br/>SMTP]
        slack[Slack<br/>Webhook]
        pagerduty[PagerDuty<br/>API]
        webhook[Webhook<br/>Custom]
    end

    prometheus -->|Push alerts HTTP| alertmanager
    alertmanager -->|Check| silences
    alertmanager -->|Check| inhibit
    alertmanager -->|Route critical| email
    alertmanager -->|Route warning| slack
    alertmanager -->|Route page| pagerduty

Prometheus alert rules

# /etc/kolla/config/prometheus/rules/openstack-alerts.yml
groups:
  - name: openstack-infrastructure
    rules:
      # === Services Down ===
      - alert: PrometheusTargetDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Target {{ $labels.instance }} is down"
          description: "{{ $labels.job }} on {{ $labels.instance }} has been down for more than 5 minutes."

      # === API Latency ===
      - alert: OpenStackAPIHighLatency
        expr: histogram_quantile(0.95, rate(openstack_api_request_duration_seconds_bucket[5m])) > 2
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High API latency on {{ $labels.service }}"
          description: "95th percentile latency is {{ $value }}s"

      # === Galera Cluster ===
      - alert: GaleraClusterNotHealthy
        expr: mysql_global_status_wsrep_cluster_size < 3
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Galera cluster size is {{ $value }}"
          description: "Galera cluster has less than 3 nodes"

      - alert: GaleraNodeNotSynced
        expr: mysql_global_status_wsrep_local_state != 4
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Galera node not synced"
          description: "Node {{ $labels.instance }} wsrep_local_state is {{ $value }}"

      # === RabbitMQ ===
      - alert: RabbitMQDown
        expr: rabbitmq_up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "RabbitMQ down on {{ $labels.instance }}"

      - alert: RabbitMQQueueGrowing
        expr: rabbitmq_queue_messages > 10000
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Queue {{ $labels.queue }} has {{ $value }} messages"

  - name: openstack-compute
    rules:
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      - alert: HighMemoryUsage
        expr: (1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"

      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Only {{ $value }}% disk space remaining"

  - name: ceph-alerts
    rules:
      - alert: CephHealthError
        expr: ceph_health_status == 2
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Ceph cluster is in ERROR state"

      - alert: CephHealthWarning
        expr: ceph_health_status == 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Ceph cluster is in WARNING state"

      - alert: CephOSDDown
        expr: ceph_osd_up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Ceph OSD {{ $labels.ceph_daemon }} is down"

      - alert: CephStorageFull
        expr: ceph_cluster_total_used_bytes / ceph_cluster_total_bytes * 100 > 85
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Ceph storage is {{ $value }}% full"

  - name: haproxy-alerts
    rules:
      - alert: HAProxyBackendDown
        expr: haproxy_server_status == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "HAProxy backend {{ $labels.server }} is DOWN"
          description: "Backend {{ $labels.server }} in {{ $labels.backend }} is not responding"

      - alert: HAProxyHighErrorRate
        expr: rate(haproxy_frontend_http_responses_total{code="5xx"}[5m]) / rate(haproxy_frontend_http_responses_total[5m]) > 0.05
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High 5xx error rate on {{ $labels.frontend }}"
          description: "Error rate is {{ $value | humanizePercentage }}"

Alertmanager configuration

# /etc/kolla/config/prometheus/alertmanager.yml
global:
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'alertmanager@example.com'
  smtp_auth_password: 'secret'

  slack_api_url: 'https://hooks.slack.com/services/xxx/yyy/zzz'

  pagerduty_url: 'https://events.pagerduty.com/v2/enqueue'

route:
  receiver: 'default'
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  routes:
    # Critical alerts → PagerDuty + Email
    - match:
        severity: critical
      receiver: 'critical-alerts'
      continue: true

    # Warning alerts → Slack
    - match:
        severity: warning
      receiver: 'slack-warnings'

    # Ceph alerts → storage team
    - match:
        job: ceph
      receiver: 'storage-team'

receivers:
  - name: 'default'
    email_configs:
      - to: 'ops-team@example.com'

  - name: 'critical-alerts'
    pagerduty_configs:
      - service_key: '<pagerduty-service-key>'
        severity: critical
    email_configs:
      - to: 'critical@example.com'

  - name: 'slack-warnings'
    slack_configs:
      - channel: '#openstack-alerts'
        send_resolved: true
        title: '{{ .Status | toUpper }}: {{ .CommonLabels.alertname }}'
        text: >-
          {{ range .Alerts }}
          *Alert:* {{ .Annotations.summary }}
          *Description:* {{ .Annotations.description }}
          *Severity:* {{ .Labels.severity }}
          {{ end }}

  - name: 'storage-team'
    email_configs:
      - to: 'storage@example.com'
    slack_configs:
      - channel: '#storage-alerts'

inhibit_rules:
  # If the cluster is down, do not alert on individual services
  - source_match:
      alertname: 'GaleraClusterNotHealthy'
    target_match:
      job: 'mysqld'
    equal: ['cluster']

  # If a node is down, do not alert on its services
  - source_match:
      alertname: 'NodeDown'
    target_match_re:
      instance: '.*'
    equal: ['instance']
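
A syntax error in this file prevents Alertmanager from loading it, so validate it offline with amtool, the CLI shipped with Alertmanager. A sketch, assuming amtool is available on the deployment host (otherwise run it inside the Alertmanager container):

# Validate the Alertmanager configuration (routes, receivers, templates)
amtool check-config /etc/kolla/config/prometheus/alertmanager.yml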

Routing diagram

flowchart TD
    Start([Alert received]) --> Silence{Active silence?}
    Silence -->|yes| Supprimee1[Dropped]
    Supprimee1 --> Stop1([Stop])

    Silence -->|no| Inhibee{Inhibited?}
    Inhibee -->|yes| Supprimee2[Dropped]
    Supprimee2 --> Stop2([Stop])

    Inhibee -->|no| Groupage[Group by alertname, severity]

    Groupage --> CheckSeverity{severity?}
    CheckSeverity -->|critical| PagerDuty[PagerDuty]
    CheckSeverity -->|critical| EmailCritical[Email critical@]
    CheckSeverity -->|warning| Slack[Slack #openstack-alerts]
    CheckSeverity -->|else| EmailOps[Email ops-team@]

    PagerDuty --> Stop3([Stop])
    EmailCritical --> Stop3
    Slack --> Stop3
    EmailOps --> Stop3
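
The routing tree can be exercised offline as well: amtool can report which receiver a given label set would reach, which is a quick way to confirm the diagram matches the actual configuration. A sketch using the same configuration file:

# Print the receiver(s) matched by an alert carrying these labels
amtool config routes test \
    --config.file=/etc/kolla/config/prometheus/alertmanager.yml \
    severity=critical alertname=GaleraClusterNotHealthy
# Expected output: critical-alerts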

Silences

# Create a silence via the API
curl -X POST http://10.0.0.10:9093/api/v2/silences \
    -H "Content-Type: application/json" \
    -d '{
        "matchers": [
            {"name": "instance", "value": "controller-1:9100", "isRegex": false}
        ],
        "startsAt": "2024-01-15T10:00:00Z",
        "endsAt": "2024-01-15T12:00:00Z",
        "createdBy": "admin",
        "comment": "Maintenance planifiée"
    }'

# List silences
curl http://10.0.0.10:9093/api/v2/silences

# Delete a silence
curl -X DELETE http://10.0.0.10:9093/api/v2/silence/<silence-id>
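
The same operations are available through amtool, which avoids hand-writing JSON. A sketch, assuming Alertmanager is reachable at 10.0.0.10:9093:

# Create, list and expire silences with amtool
amtool silence add instance="controller-1:9100" \
    --alertmanager.url=http://10.0.0.10:9093 \
    --author=admin --comment="Scheduled maintenance" --duration=2h

amtool silence query --alertmanager.url=http://10.0.0.10:9093
amtool silence expire <silence-id> --alertmanager.url=http://10.0.0.10:9093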

Alert management

# View active alerts
curl http://10.0.0.10:9093/api/v2/alerts | jq '.[] | {alertname: .labels.alertname, severity: .labels.severity, status: .status.state}'

# View alert groups
curl http://10.0.0.10:9093/api/v2/alerts/groups | jq .

# Alertmanager status
curl http://10.0.0.10:9093/api/v2/status | jq .
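
amtool provides the same views with label filtering, which is convenient during an incident. A sketch:

# Query active alerts, optionally filtered by label
amtool alert query --alertmanager.url=http://10.0.0.10:9093
amtool alert query severity=critical --alertmanager.url=http://10.0.0.10:9093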

Practical examples

Enhanced Slack notification template

slack_configs:
  - channel: '#openstack-alerts'
    send_resolved: true
    icon_emoji: '{{ if eq .Status "firing" }}:fire:{{ else }}:white_check_mark:{{ end }}'
    title: '{{ .Status | toUpper }}{{ if eq .Status "firing" }} ({{ .Alerts.Firing | len }}){{ end }}'
    text: >-
      {{ range .Alerts }}
      {{ if eq .Status "firing" }}:red_circle:{{ else }}:large_green_circle:{{ end }} *{{ .Labels.alertname }}*
      > {{ .Annotations.summary }}
      > _Severity:_ `{{ .Labels.severity }}`
      > _Instance:_ `{{ .Labels.instance }}`
      {{ if .Annotations.runbook_url }}> :book: <{{ .Annotations.runbook_url }}|Runbook>{{ end }}
      {{ end }}
    actions:
      - type: button
        text: ':mag: View in Grafana'
        url: 'http://grafana:3000/d/alerts'
      - type: button
        text: ':no_bell: Silence'
        url: 'http://alertmanager:9093/#/silences/new'
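
If notifications do not show up, first confirm that the incoming webhook itself accepts messages, independently of Alertmanager. A minimal sketch using the placeholder webhook URL from the global configuration above:

# Send a raw test message straight to the Slack incoming webhook
curl -X POST -H 'Content-Type: application/json' \
    -d '{"text": "Alertmanager webhook connectivity test"}' \
    'https://hooks.slack.com/services/xxx/yyy/zzz'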

Alert test script

#!/bin/bash
# test-alerting.sh

echo "=== Testing Alert Pipeline ==="

# Send a test alert
curl -X POST http://10.0.0.10:9093/api/v2/alerts \
    -H "Content-Type: application/json" \
    -d '[
        {
            "labels": {
                "alertname": "TestAlert",
                "severity": "warning",
                "instance": "test"
            },
            "annotations": {
                "summary": "This is a test alert",
                "description": "Testing the alerting pipeline"
            },
            "startsAt": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'",
            "generatorURL": "http://prometheus:9090/graph"
        }
    ]'

echo "Test alert sent. Check your notification channels."

# Wait, then resolve
sleep 60

echo "Resolving test alert..."
curl -X POST http://10.0.0.10:9093/api/v2/alerts \
    -H "Content-Type: application/json" \
    -d '[
        {
            "labels": {
                "alertname": "TestAlert",
                "severity": "warning",
                "instance": "test"
            },
            "endsAt": "'$(date -u +%Y-%m-%dT%H:%M:%SZ)'"
        }
    ]'
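
After running the script, the test alert should be visible in Alertmanager until it is resolved. The v2 API accepts a filter parameter with label matchers; a sketch:

# Confirm the test alert is (or was) active in Alertmanager
curl -sG http://10.0.0.10:9093/api/v2/alerts \
    --data-urlencode 'filter=alertname="TestAlert"' \
    | jq '.[] | {alertname: .labels.alertname, state: .status.state}'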

Resources

Checkpoint

  • Alert rules defined in Prometheus
  • Alertmanager configured with receivers
  • Email/Slack/PagerDuty notifications working
  • Routing by severity configured
  • Inhibition rules configured
  • Test alert validated
  • Silences documented