Tests de Failover¶
Introduction¶
Les tests de failover valident que l'infrastructure HA fonctionne comme prévu. Cette section couvre les procédures de test systématiques pour chaque composant.
Prérequis¶
- Cluster HA complet déployé (3 controllers)
- HAProxy, Keepalived, Galera, RabbitMQ configurés
- Environnement de test dédié (ne jamais exécuter ces tests en production !)
Points à apprendre¶
Matrice de tests¶
graph TB
subgraph net["Tests Réseau"]
t1["VIP Failover<br/>(Keepalived)"]
t2["HAProxy Backend<br/>Failover"]
end
subgraph data["Tests Données"]
t3["Galera Node<br/>Failover"]
t4["RabbitMQ Node<br/>Failover"]
t5["Ceph OSD/MON<br/>Failover"]
end
subgraph svc["Tests Services"]
t6["API Service<br/>Failover"]
t7["Agent Failover<br/>(L3, DHCP)"]
end
subgraph full["Tests Complets"]
t8["Controller Node<br/>Failover"]
t9["Compute Node<br/>Failover"]
t10["Network<br/>Partition"]
end
net -.->|"< 5 secondes"| net
data -.->|"< 30 secondes"| data
svc -.->|"< 60 secondes"| svc
full -.->|"Validation bout-en-bout"| full
Script de test complet¶
#!/bin/bash
# failover-tests.sh
# Abort as soon as an unguarded command fails.
set -o errexit
# Each run logs to its own file, named after the start time of the run.
TIMESTAMP="$(date '+%Y%m%d_%H%M%S')"
LOG_FILE="/var/log/failover-tests-${TIMESTAMP}.log"
#######################################
# Print a timestamped message and append it to the run log.
# Globals:   LOG_FILE (read) - path of the log file to append to
# Arguments: $1 - message text (may be empty)
# Outputs:   "[YYYY-MM-DD HH:MM:SS] <message>" on stdout and in LOG_FILE
# Returns:   exit status of tee
#######################################
log() {
  # LOG_FILE is quoted so that a path containing spaces or glob characters
  # reaches tee as one single file name.
  printf '[%s] %s\n' "$(date '+%Y-%m-%d %H:%M:%S')" "${1-}" | tee -a "$LOG_FILE"
}
#######################################
# Probe the OpenStack API by requesting a token.
# Outputs:   progress and verdict through log
# Returns:   0 if `openstack token issue` succeeds, 1 otherwise
#######################################
check_openstack() {
  log "Checking OpenStack API..."

  # Guard clause: report the failure first, success is the fall-through path.
  if ! openstack token issue >/dev/null 2>&1; then
    log "✗ OpenStack API not responding"
    return 1
  fi

  log "✓ OpenStack API responsive"
  return 0
}
# Test 1: VIP failover

#######################################
# Print the name of every controller whose `ip addr` output lists an address.
# Arguments: $1 - IP address to look for
# Outputs:   one controller name per line on stdout (nothing if none match)
#######################################
_vip_holders() {
  local vip=$1 h
  for h in controller-{1,2,3}; do
    # -F keeps the dots literal and -w requires a whole-word match, so
    # 10.0.0.10 does not match 10.0.0.100. `|| true` keeps an unreachable or
    # non-matching host from failing the whole probe.
    ssh "$h" "ip addr | grep -qwF '$vip' && echo $h" 2>/dev/null || true
  done
}

#######################################
# Stop keepalived on the controller currently holding the VIP and verify that
# exactly one other controller takes the address over.
# Outputs:   progress and verdict through log
# Returns:   0 if the VIP moved to a different controller, 1 otherwise
#######################################
test_vip_failover() {
  local -r vip="10.0.0.10"
  local master new_master rc

  log "=== TEST 1: VIP Failover ==="

  # Identify the current MASTER.
  master=$(_vip_holders "$vip")
  log "Current VIP master: $master"

  # Without exactly one holder there is nothing meaningful to stop; an empty
  # name would also make ssh treat the remote command as the host name.
  if [[ -z "$master" || "$master" == *$'\n'* ]]; then
    log "✗ VIP failover FAILED: expected exactly one controller holding $vip"
    return 1
  fi

  # Stop keepalived on the MASTER.
  log "Stopping keepalived on $master..."
  ssh "$master" "docker stop keepalived"

  # Wait for the failover.
  sleep 5

  # Check the new MASTER: it must be a single controller other than the old one.
  new_master=$(_vip_holders "$vip")
  if [[ -n "$new_master" && "$new_master" != *$'\n'* \
        && "$new_master" != "$master" ]]; then
    log "✓ VIP failover successful: $master -> $new_master"
    rc=0
  else
    log "✗ VIP failover FAILED"
    rc=1
  fi

  # Restore keepalived whatever the verdict.
  ssh "$master" "docker start keepalived"
  return "$rc"
}
# Test 2: Galera failover

#######################################
# Print the wsrep_cluster_size value reported through the local mariadb
# container (second column of the SHOW STATUS row).
# Outputs:   the cluster size on stdout, or nothing if the query fails
#######################################
_galera_cluster_size() {
  docker exec mariadb mysql -N -e "SHOW STATUS LIKE 'wsrep_cluster_size';" \
    | awk '{print $2}'
}

#######################################
# Stop mariadb on controller-3 and verify that the cluster shrinks by exactly
# one node and still accepts a write, then restart the node.
# Outputs:   progress and verdict through log
# Returns:   0 if the cluster kept running with N-1 nodes and accepted the
#            write, 1 otherwise
#######################################
test_galera_failover() {
  local initial_size new_size final_size expected_size write_ok

  log "=== TEST 2: Galera Failover ==="

  # Check the initial state.
  initial_size=$(_galera_cluster_size)
  log "Initial cluster size: $initial_size"
  # A missing or too small size means there is no node to lose; bail out
  # before touching the cluster.
  if ! [[ "$initial_size" =~ ^[0-9]+$ ]] || (( initial_size < 2 )); then
    log "✗ Galera failover FAILED: cannot read a usable initial cluster size"
    return 1
  fi
  expected_size=$((initial_size - 1))

  # Stop one node.
  log "Stopping mariadb on controller-3..."
  ssh controller-3 "docker stop mariadb"
  sleep 10

  # Check that the cluster keeps running.
  new_size=$(_galera_cluster_size)

  # Try a write; its result is part of the verdict.
  if docker exec mariadb mysql -e \
      "CREATE DATABASE IF NOT EXISTS failover_test; DROP DATABASE failover_test;"; then
    write_ok=yes
  else
    write_ok=no
  fi

  if [[ "$new_size" == "$expected_size" && "$write_ok" == yes ]]; then
    log "✓ Galera failover successful: cluster size $initial_size -> $new_size"
    # Restore the node and give it time to rejoin.
    ssh controller-3 "docker start mariadb"
    sleep 30
    final_size=$(_galera_cluster_size)
    log "Cluster recovered: size = $final_size"
    return 0
  fi

  log "✗ Galera failover FAILED"
  log "  cluster size: ${new_size:-unknown} (expected $expected_size), write accepted: $write_ok"
  ssh controller-3 "docker start mariadb"
  return 1
}
# Test 3: RabbitMQ failover
#######################################
# Stop rabbitmq on controller-2 and verify that the local broker still answers
# cluster_status and accepts a published message, then restart the node.
# Outputs:   progress and verdict through log
# Returns:   0 if the cluster answered and the publish succeeded, 1 otherwise
#######################################
test_rabbitmq_failover() {
  local nodes rc=1

  log "=== TEST 3: RabbitMQ Failover ==="

  # Initial state. grep -c counts the output lines mentioning "rabbit@", which
  # is only an indication of the node count.
  nodes=$(docker exec rabbitmq rabbitmqctl cluster_status 2>/dev/null | grep -c "rabbit@")
  log "Initial cluster nodes: $nodes"

  # Stop one node.
  log "Stopping rabbitmq on controller-2..."
  ssh controller-2 "docker stop rabbitmq"
  sleep 10

  # Check the cluster.
  if docker exec rabbitmq rabbitmqctl cluster_status &>/dev/null; then
    log "✓ RabbitMQ cluster still operational"
    # Test publishing; success is only reported when the command succeeds.
    if docker exec rabbitmq rabbitmqadmin publish exchange=amq.default \
        routing_key=test payload="failover_test" &>/dev/null; then
      log "✓ Message published successfully"
      rc=0
    else
      log "✗ RabbitMQ failover FAILED: test message could not be published"
    fi
  else
    log "✗ RabbitMQ failover FAILED"
  fi

  # Restore the node whatever the verdict.
  ssh controller-2 "docker start rabbitmq"
  if (( rc == 0 )); then
    # Give the restarted node time to rejoin, as the success path always did.
    sleep 30
  fi
  return "$rc"
}
# Test 4: API service failover
#######################################
# Stop nova_api on controller-1 and verify that `openstack server list` still
# succeeds, then restart the container.
# Outputs:   progress and verdict through log
# Returns:   0 if the API still answered, 1 otherwise (including when the
#            initial API check fails)
#######################################
test_api_failover() {
  local verdict status

  log "=== TEST 4: Nova API Failover ==="

  # The API must answer before the test starts.
  if ! check_openstack; then
    return 1
  fi

  # Take Nova API down on one controller.
  log "Stopping nova_api on controller-1..."
  ssh controller-1 "docker stop nova_api"
  sleep 5

  # The request is expected to be served by the remaining backends.
  if openstack server list &>/dev/null; then
    status=0
    verdict="✓ Nova API failover successful"
  else
    status=1
    verdict="✗ Nova API failover FAILED"
  fi

  # Report first, then bring the container back, as on both original paths.
  log "$verdict"
  ssh controller-1 "docker start nova_api"
  return "$status"
}
# Test 5: full controller node failover
#######################################
# Stop every running container on controller-1, check that the OpenStack API,
# the database and RabbitMQ are still reachable from here, then restart exactly
# the containers that were stopped.
# Outputs:   progress and verdict through log
# Returns:   0 if all three checks passed, 1 otherwise
#######################################
test_controller_failover() {
  local id_list
  local tests_passed=0

  log "=== TEST 5: Full Controller Failover ==="

  # Initial state.
  check_openstack || return 1

  # Record which containers are running now, so that the restore step restarts
  # only those and leaves containers that were already stopped untouched.
  id_list=$(ssh controller-1 "docker ps -q")
  id_list=${id_list//$'\n'/ }
  if [[ -z "$id_list" ]]; then
    log "✗ Full controller failover: no running containers found on controller-1"
    return 1
  fi

  # Stop all containers on controller-1.
  log "Stopping all docker containers on controller-1..."
  ssh controller-1 "docker stop $id_list"

  # Wait for things to settle.
  sleep 30

  # Check the services. Plain assignments are used for the counter because
  # ((var++)) returns status 1 when var is 0, which aborts under `set -e`.
  if check_openstack; then
    tests_passed=$((tests_passed + 1))
  fi

  if docker exec mariadb mysql -e "SELECT 1" &>/dev/null; then
    log "✓ Database accessible"
    tests_passed=$((tests_passed + 1))
  fi

  if docker exec rabbitmq rabbitmqctl status &>/dev/null; then
    log "✓ RabbitMQ accessible"
    tests_passed=$((tests_passed + 1))
  fi

  # Restore the containers stopped above.
  log "Restoring controller-1..."
  ssh controller-1 "docker start $id_list"
  sleep 60

  if (( tests_passed >= 3 )); then
    log "✓ Full controller failover: $tests_passed/3 tests passed"
    return 0
  fi
  log "✗ Full controller failover: $tests_passed/3 tests passed"
  return 1
}
# Test execution
#######################################
# Run every failover test in sequence and log a pass/fail summary.
# Globals:   PASSED, FAILED (written) - result counters
#            LOG_FILE (read)          - reported in the summary
# Outputs:   progress and summary through log
# Returns:   0 if every test passed, 1 if at least one failed
#######################################
main() {
  local test_fn

  log "Starting Failover Tests"
  log "========================"
  PASSED=0
  FAILED=0

  for test_fn in test_vip_failover test_galera_failover test_rabbitmq_failover \
                 test_api_failover test_controller_failover; do
    # Counters use plain assignments: ((var++)) returns status 1 when var is 0,
    # which would abort the whole run under `set -e` after the first test.
    if "$test_fn"; then
      PASSED=$((PASSED + 1))
    else
      FAILED=$((FAILED + 1))
    fi
    log ""
    sleep 10 # Pause between tests
  done

  log "========================"
  log "Results: $PASSED passed, $FAILED failed"
  log "Log file: $LOG_FILE"

  # Let callers (cron, CI) detect a failed run from the exit status.
  if (( FAILED > 0 )); then
    return 1
  fi
  return 0
}
# Entry point: run the test suite, forwarding the script's arguments to main.
main "$@"
Test VIP avec mesure du temps¶
#!/bin/bash
# test-vip-timing.sh
#
# Stops keepalived on the controller holding the VIP and measures how long the
# VIP takes to answer pings again, then restarts keepalived.
# Relies on GNU date (%N) and on iputils ping (-O) on the machine running it.
VIP="10.0.0.10"
INTERFACE="eth0"
MAX_WAIT=10 # seconds to wait for the VIP to answer again
THRESHOLD=5 # seconds allowed for a passing failover

# Find the MASTER. -F keeps the dots literal and -w requires a whole-word
# match, so 10.0.0.10 does not match 10.0.0.100.
MASTER=$(for h in controller-{1,2,3}; do
  ssh "$h" "ip addr show $INTERFACE" 2>/dev/null | grep -qwF -- "$VIP" && echo "$h"
done)

# Without exactly one holder there is nothing to stop; an empty name would
# also make ssh treat the remote command as the host name.
if [[ -z "$MASTER" || "$MASTER" == *$'\n'* ]]; then
  echo "✗ FAIL: expected exactly one controller holding $VIP on $INTERFACE, found: ${MASTER:-none}" >&2
  exit 1
fi

echo "Current MASTER: $MASTER"
echo "Starting VIP failover timing test..."

# Private temp file for the ping output, removed on every exit path together
# with the background ping.
PING_LOG=$(mktemp) || exit 1
PING_PID=""
cleanup() {
  if [[ -n "$PING_PID" ]]; then
    kill "$PING_PID" 2>/dev/null
  fi
  rm -f -- "$PING_LOG"
}
trap cleanup EXIT

# Run a continuous ping in the background. -O makes iputils ping print a
# "no answer yet" line for each unanswered request, which is counted below.
ping -O "$VIP" > "$PING_LOG" 2>&1 &
PING_PID=$!
sleep 2

# Stop keepalived.
# NOTE(review): the clock starts once `docker stop` has returned, so time spent
# inside the stop itself is not part of the measurement.
ssh "$MASTER" "docker stop keepalived"
STOP_TIME=$(date +%s.%N)

# Wait for the failover, bounded by wall-clock time so that slow ping timeouts
# cannot stretch the wait beyond MAX_WAIT.
RECOVERY_TIME=""
DEADLINE=$(( $(date +%s) + MAX_WAIT ))
while (( $(date +%s) < DEADLINE )); do
  if ping -c 1 -W 1 "$VIP" &>/dev/null; then
    RECOVERY_TIME=$(date +%s.%N)
    break
  fi
  sleep 0.1
done

kill "$PING_PID" 2>/dev/null
wait "$PING_PID" 2>/dev/null
PING_PID=""

# Compute the failover time; it stays empty when the VIP never answered.
FAILOVER_TIME=""
if [[ -n "$RECOVERY_TIME" ]]; then
  FAILOVER_TIME=$(echo "$RECOVERY_TIME - $STOP_TIME" | bc)
  echo "Failover time: ${FAILOVER_TIME}s"
else
  echo "Failover time: no answer from $VIP within ${MAX_WAIT}s"
fi

# Count the lost pings. grep -c already prints 0 when nothing matches, so no
# fallback echo is needed (it would produce a second line).
LOST=$(grep -c "no answer yet" "$PING_LOG")
echo "Packets lost: ${LOST:-0}"

# Restore.
ssh "$MASTER" "docker start keepalived"

# Verdict.
if [[ -n "$FAILOVER_TIME" ]] && (( $(echo "$FAILOVER_TIME < $THRESHOLD" | bc -l) )); then
  echo "✓ PASS: Failover under 5 seconds"
else
  echo "✗ FAIL: Failover exceeded 5 seconds"
fi
Test de charge pendant failover¶
#!/bin/bash
# stress-failover-test.sh
# Generate a continuous load
#######################################
# Loop forever, listing servers, networks and volumes through the openstack
# CLI with all output discarded, pausing half a second after each round.
# Returns:   never returns on its own; meant to be run in the background
#######################################
run_load() {
  local resource
  while :; do
    for resource in server network volume; do
      openstack "$resource" list &>/dev/null
    done
    sleep 0.5
  done
}
# Start the load generator
run_load &
LOAD_PID=$!
echo "Load generator started (PID: $LOAD_PID)"

# Undo everything this script changed: restart the stopped services and stop
# the load generator. Safe to call more than once. It also runs from the EXIT
# trap, so an interrupted run does not leave controller-1 degraded or the
# endless load loop running in the background.
SERVICES_STOPPED=0
cleanup() {
  if (( SERVICES_STOPPED )); then
    ssh controller-1 "docker start keepalived haproxy nova_api"
    SERVICES_STOPPED=0
  fi
  if [[ -n "$LOAD_PID" ]]; then
    kill "$LOAD_PID" 2>/dev/null
    LOAD_PID=""
  fi
}
trap cleanup EXIT
# Turn an interruption into a normal exit so that the EXIT trap runs.
trap 'exit 130' INT TERM

echo "Running failover test under load..."

# Trigger the failover
SERVICES_STOPPED=1
ssh controller-1 "docker stop keepalived haproxy nova_api"

# Measure the errors: one token request per second for 20 seconds
ERRORS=0
for i in $(seq 1 20); do
  if ! openstack token issue &>/dev/null; then
    ERRORS=$((ERRORS + 1))
  fi
  sleep 1
done

# Restore the services and stop the load
cleanup

echo "Errors during failover: $ERRORS/20"

if [ "$ERRORS" -lt 5 ]; then
  echo "✓ PASS: Less than 25% error rate"
else
  echo "✗ FAIL: Error rate too high"
fi
Diagramme de flux de test¶
flowchart TB
start([Start]) --> verify["Vérifier état initial<br/>du cluster"]
verify --> doc["Documenter la configuration<br/>actuelle (baseline)"]
doc --> comp["Tests Composants"]
subgraph comp["Tests Composants"]
vip["Test VIP (Keepalived)"]
vip --> vip_check{VIP failover<br/>< 5s?}
vip_check -->|yes| vip_pass["✓ PASS"]
vip_check -->|no| vip_fail["✗ FAIL<br/>Investiguer"]
vip_pass --> galera["Test Galera"]
vip_fail --> galera
galera --> galera_check{Cluster continue<br/>avec N-1?}
galera_check -->|yes| galera_pass["✓ PASS"]
galera_check -->|no| galera_fail["✗ FAIL"]
galera_pass --> rmq["Test RabbitMQ"]
galera_fail --> rmq
rmq --> rmq_check{Queues<br/>accessibles?}
rmq_check -->|yes| rmq_pass["✓ PASS"]
rmq_check -->|no| rmq_fail["✗ FAIL"]
end
rmq_pass --> svc["Tests Services"]
rmq_fail --> svc
subgraph svc["Tests Services"]
api["Test API Services"]
agents["Test Agents Neutron"]
api --> agents
end
agents --> integ["Tests Intégration"]
subgraph integ["Tests Intégration"]
vm["Créer VM pendant failover"]
vm --> vm_check{VM créée<br/>avec succès?}
vm_check -->|yes| vm_pass["✓ PASS"]
vm_check -->|no| vm_fail["✗ FAIL"]
end
vm_pass --> restore["Restaurer configuration<br/>initiale"]
vm_fail --> restore
restore --> report["Générer rapport<br/>de tests"]
report --> stop([Stop])
Rapport de test¶
# Rapport de Tests Failover
Date: {{ date }}
Environnement: {{ env }}
## Résumé
| Test | Résultat | Durée | Notes |
|------|----------|-------|-------|
| VIP Failover | ✓ PASS | 2.3s | |
| Galera Failover | ✓ PASS | 15s | Rejoin OK |
| RabbitMQ Failover | ✓ PASS | 8s | |
| API Failover | ✓ PASS | 3s | |
| Controller Failover | ✓ PASS | 45s | |
## Métriques
- Temps moyen de failover VIP: 2.5s
- Paquets perdus pendant failover: 3
- Requêtes API échouées: 2/100
## Recommandations
- [ ] Ajuster advert_int Keepalived si failover trop lent
- [ ] Vérifier les health checks HAProxy
- [ ] Documenter les procédures de recovery
## Validation
- [ ] Tous les tests passent
- [ ] RTO < objectif (5 minutes)
- [ ] RPO = 0 (pas de perte de données)
Exemples pratiques¶
Test rapide de tous les composants¶
#!/bin/bash
# quick-ha-check.sh
# Prints a one-line status for each HA component, as seen from this host.

echo "=== Quick HA Health Check ==="

# VIP: a single ping must be answered.
printf '%s' "VIP: "
if ping -c 1 10.0.0.10 &>/dev/null; then
  echo "✓"
else
  echo "✗"
fi

# Galera: wsrep_cluster_size must be exactly 3.
printf '%s' "Galera (3 nodes): "
SIZE=$(docker exec mariadb mysql -N -e "SHOW STATUS LIKE 'wsrep_cluster_size'" 2>/dev/null | awk '{print $2}')
if [ "$SIZE" = "3" ]; then
  echo "✓"
else
  echo "✗ (size=$SIZE)"
fi

# RabbitMQ: cluster_status must contain exactly 3 lines mentioning "rabbit@".
printf '%s' "RabbitMQ (3 nodes): "
NODES=$(docker exec rabbitmq rabbitmqctl cluster_status 2>/dev/null | grep -c "rabbit@")
if [ "$NODES" = "3" ]; then
  echo "✓"
else
  echo "✗ (nodes=$NODES)"
fi

# HAProxy: number of lines of the stats page that contain "UP".
printf '%s' "HAProxy backends: "
UP=$(curl -s http://localhost:1984/stats 2>/dev/null | grep -c "UP")
echo "$UP services UP"

# OpenStack API: a token request must succeed.
printf '%s' "OpenStack API: "
if openstack token issue &>/dev/null; then
  echo "✓"
else
  echo "✗"
fi
Ressources¶
Checkpoint¶
- Script de test failover fonctionnel
- VIP failover < 5 secondes
- Galera continue avec N-1 nœuds
- RabbitMQ continue avec N-1 nœuds
- APIs restent accessibles pendant failover
- Rapport de tests généré et documenté
- Procédures de recovery validées