#!/bin/bash
# reproduce_bug.sh
#
# Reproduces: servers_no_promotion silently ignored when a server in the
# parameter list is destroyed without first removing it from the parameter.
#
# RCA: rca_servers_no_promotion_bug.md
# Bug: server/modules/monitor/mariadbmon/mariadbmon.cc (post_configure)
#      server/core/monitor.cc (get_monitored_serverlist)
#
# Usage:
#   docker compose up -d
#   bash reproduce_bug.sh

set -euo pipefail

# ── Configuration ─────────────────────────────────────────────────────────────
# Every setting keeps its original value by default and can be overridden from
# the environment, e.g.:
#   MAXSCALE_URL=http://10.0.0.5:8989 DB_PWD=secret bash reproduce_bug.sh
MAXSCALE_URL="${MAXSCALE_URL:-http://localhost:8989}"  # REST API base URL; requests go to <url>/v1/<path>
MXS_AUTH="${MXS_AUTH:-admin:mariadb}"                  # REST API credentials in curl's user:password form
DB_PWD="${DB_PWD:-DevPassword123!}"                    # root password handed to the mariadb client in the DB containers

# ── Colors ────────────────────────────────────────────────────────────────────
# Stored as literal backslash sequences; they turn into terminal escape codes
# only when printed with escape interpretation (e.g. `echo -e`).
RED='\033[0;31m'; GREEN='\033[0;32m'; YELLOW='\033[1;33m'
BLUE='\033[0;34m'; CYAN='\033[0;36m'; BOLD='\033[1m'; NC='\033[0m'

# ── Logging helpers ───────────────────────────────────────────────────────────
# Phase headline: blank line, then a bold blue "[HH:MM:SS] ==> message".
log() {
    printf '%b\n' "\n${BOLD}${BLUE}[$(date '+%H:%M:%S')] ==> $*${NC}"
}

# Indented status lines; each one differs only in its tag and tag color.
log_ok() {
    printf '%b\n' "  ${GREEN}[OK]${NC} $*"
}

log_info() {
    printf '%b\n' "  ${CYAN}[INFO]${NC} $*"
}

log_warn() {
    printf '%b\n' "  ${YELLOW}[WARN]${NC} $*"
}

log_err() {
    printf '%b\n' "  ${RED}[ERROR]${NC} $*"
}

# Yellow horizontal rule used to frame sections of the output.
sep() {
    printf '%b\n' "${YELLOW}────────────────────────────────────────────────────────${NC}"
}

# Print the bold red "BUG CONFIRMED" box.
# Arguments: $1 - one-line explanation shown inside the box (printed without
#                 a closing right-hand border).
bug_confirmed() {
    local paint="${BOLD}${RED}"

    printf '%b\n' \
        "\n${paint}╔═══════════════════════════════════════════════════╗${NC}" \
        "${paint}║           BUG CONFIRMED                           ║${NC}"
    printf '%b\n' "${paint}║  $1${NC}"
    printf '%b\n' "${paint}╚═══════════════════════════════════════════════════╝${NC}"
}

# ── DB helper ─────────────────────────────────────────────────────────────────
# Run SQL through the mariadb client inside one of the DB containers.
# Arguments: $1 - service name; the container is "rca-<name>-1"
#            $2… - passed to the client after -e (the SQL to execute)
# Outputs:   the client's stdout; its stderr is discarded.
# Returns:   exit status of `docker exec`.
db() {
    local node="$1"
    shift
    local container="rca-${node}-1"

    docker exec "$container" mariadb -uroot -p"$DB_PWD" --silent -e "$@" 2>/dev/null
}

# ── MaxScale REST API helpers ─────────────────────────────────────────────────
# GET a resource from the MaxScale REST API.
# Arguments: $1 - path below /v1/ (e.g. "servers")
# Outputs:   response body on stdout.
mxs_get() {
    local resource="$1"

    curl -s -u "$MXS_AUTH" "${MAXSCALE_URL}/v1/${resource}"
}

# POST a JSON document to the MaxScale REST API.
# Arguments: $1 - path below /v1/
#            $2 - JSON request body
# Outputs:   response body on stdout.
mxs_post() {
    local endpoint="$1"
    local payload="$2"
    local -a request=(
        -s
        -u "$MXS_AUTH"
        -X POST
        -H "Content-Type: application/json"
        -d "$payload"
    )

    curl "${request[@]}" "${MAXSCALE_URL}/v1/${endpoint}"
}

# PATCH a MaxScale REST API resource with a JSON document.
# Arguments: $1 - path below /v1/
#            $2 - JSON request body
# Outputs:   response body on stdout.
mxs_patch() {
    local endpoint="$1"
    local payload="$2"
    local -a request=(
        -s
        -u "$MXS_AUTH"
        -X PATCH
        -H "Content-Type: application/json"
        -d "$payload"
    )

    curl "${request[@]}" "${MAXSCALE_URL}/v1/${endpoint}"
}

# DELETE a resource from the MaxScale REST API.
# Arguments: $1 - path below /v1/ (e.g. "servers/mariadb4")
# Outputs:   response body on stdout.
mxs_delete() {
    local resource="$1"

    curl -s -u "$MXS_AUTH" -X DELETE "${MAXSCALE_URL}/v1/${resource}"
}

# ── Display helpers ───────────────────────────────────────────────────────────
# Print one "<server id>: <state>" line per server known to MaxScale.
# Arguments: $1 - label appended to the heading (e.g. "Post-failover")
# Outputs:   heading plus the server list on stdout; a placeholder line when
#            the response cannot be fetched or parsed.
show_server_states() {
    local label=$1
    echo -e "\n  ${BOLD}MaxScale Server States — ${label}${NC}"
    # Only id and state are displayed, so nothing else is extracted from the
    # response. Parser errors are discarded; the fallback message covers them.
    mxs_get "servers" | python3 -c "
import json, sys
for server in json.load(sys.stdin)['data']:
    name = server['id']
    state = server['attributes'].get('state', 'unknown')
    print(f'    {name:12s}: {state}')
" 2>/dev/null || echo "  (could not parse server states)"
}

# Print the three MariaDB-Monitor parameters this reproduction cares about:
# servers_no_promotion, auto_failover and auto_rejoin ("<not set>" when absent).
# Outputs: heading plus one "key = value" line each; a placeholder line when
#          the response cannot be fetched or parsed.
show_monitor_params() {
    printf '%b\n' "\n  ${BOLD}Monitor parameters (MariaDB-Monitor)${NC}"
    mxs_get "monitors/MariaDB-Monitor" | python3 -c '
import json, sys
document = json.load(sys.stdin)
settings = document["data"]["attributes"].get("parameters", {})
for key in ("servers_no_promotion", "auto_failover", "auto_rejoin"):
    value = settings.get(key, "<not set>")
    print(f"    {key} = {value}")
' 2>/dev/null || echo "  (could not parse monitor params)"
}

# Show the tail of the MaxScale container log, indented by four spaces.
# Arguments: $1 - number of lines to show (default 30)
# Outputs:   heading plus the log lines (stdout and stderr of `docker logs`).
show_maxscale_logs() {
    local tail_count="${1:-30}"

    printf '%b\n' "\n  ${BOLD}MaxScale logs (last ${tail_count} lines)${NC}"
    docker logs rca-maxscale-1 --tail="$tail_count" 2>&1 \
        | sed -e 's/^/    /'
}

# Block until the MariaDB server in container "rca-<host>-1" answers a query.
# Arguments: $1 - service name (e.g. mariadb1)
#            $2 - maximum number of attempts (default 40)
#            $3 - seconds to sleep between attempts (default 2)
# Outputs:   a progress line on stdout ("Waiting for <host>... ready|TIMEOUT").
# Returns:   0 once the server responds; terminates the script with exit
#            status 1 when every attempt fails.
wait_for_db() {
    local host=$1
    local attempts=${2:-40}
    local delay=${3:-2}
    local attempt   # kept local so the counter does not leak into the caller

    echo -n "  Waiting for ${host}"
    for ((attempt = 1; attempt <= attempts; attempt++)); do
        # &> already silences both stdout and stderr of the probe.
        if docker exec "rca-${host}-1" mariadb -uroot -p"$DB_PWD" -e "SELECT 1" \
                --connect-timeout=3 &>/dev/null; then
            echo -e " ${GREEN}ready${NC}"
            return 0
        fi
        echo -n "."
        sleep "$delay"
    done
    echo -e " ${RED}TIMEOUT${NC}"
    exit 1
}

# Poll MaxScale until the expected server is reported as a running Master.
# Arguments: $1 - server id expected to be Master (default mariadb1)
# Outputs:   a progress line on stdout ending in "confirmed" or "timeout".
# Returns:   0 in both cases — a timeout is only reported, not treated as
#            fatal, so the caller carries on and shows the server states.
wait_for_master() {
    local expected=${1:-mariadb1}
    local master attempt

    echo -n "  Waiting for MaxScale to detect ${expected} as Master"
    for ((attempt = 1; attempt <= 30; attempt++)); do
        # The script runs under `set -euo pipefail`: without the `||` guard a
        # single failed request or unparsable response would terminate the
        # whole script here instead of being retried on the next iteration.
        master=$(mxs_get "servers" | python3 -c "
import json, sys
data = json.load(sys.stdin)
for s in data['data']:
    state = s['attributes'].get('state', '')
    if 'Master' in state and 'Running' in state:
        print(s['id'])
        break
" 2>/dev/null) || master=""
        if [ "$master" = "$expected" ]; then
            echo -e " ${GREEN}confirmed${NC}"
            return 0
        fi
        echo -n "."
        sleep 2
    done
    echo -e " ${YELLOW}timeout${NC}"
    return 0
}

# ═══════════════════════════════════════════════════════════════════════════════
sep
echo -e "${BOLD}  Bug Reproduction: servers_no_promotion Silently Ignored${NC}"
echo -e "  RCA: rca_servers_no_promotion_bug.md"
sep

# ═══════════════════════════════════════════════════════════════════════════════
log "PHASE 0: Preflight — checking containers"

for svc in mariadb1 mariadb2 mariadb3 mariadb4 maxscale; do
    container="rca-${svc}-1"
    # A bare `docker inspect` also succeeds for stopped or exited containers,
    # so read the running flag itself. Empty output means the lookup failed.
    running=$(docker inspect --format '{{.State.Running}}' "$container" 2>/dev/null || true)
    if [ "$running" = "true" ]; then
        log_ok "$container is running"
    elif [ -n "$running" ]; then
        log_err "$container exists but is not running — run: docker compose up -d"
        exit 1
    else
        log_err "$container not found — run: docker compose up -d"
        exit 1
    fi
done

# The containers being up does not mean the servers accept connections yet.
for svc in mariadb1 mariadb2 mariadb3 mariadb4; do
    wait_for_db "$svc"
done

# ═══════════════════════════════════════════════════════════════════════════════
log "PHASE 1: Setting up replication (mariadb1=primary, mariadb2/3/4=replicas)"

# MaxScale's failover algorithm requires gtid_binlog_pos to be set on the primary.
# A fresh MariaDB instance has no GTID history, so we seed one transaction first.
log_info "Seeding an initial GTID transaction on mariadb1..."
# db() discards the client's stderr, so without an explicit message a failure
# here would end the script (set -e) with no output at all.
db mariadb1 "CREATE DATABASE IF NOT EXISTS \`_rca_gtid_init\`; DROP DATABASE IF EXISTS \`_rca_gtid_init\`;" \
    || { log_err "Could not seed the GTID transaction on mariadb1"; exit 1; }
GTID_POS=$(db mariadb1 "SELECT @@global.gtid_binlog_pos;") \
    || { log_err "Could not read gtid_binlog_pos from mariadb1"; exit 1; }
log_ok "mariadb1 gtid_binlog_pos: ${GTID_POS}"

# All three replicas are pointed at mariadb1 with the same statement batch.
replication_sql="STOP SLAVE; RESET SLAVE ALL; \
  CHANGE MASTER TO \
    MASTER_HOST='mariadb1', \
    MASTER_USER='root', \
    MASTER_PASSWORD='${DB_PWD}', \
    MASTER_USE_GTID=slave_pos; \
  START SLAVE;"

for replica in mariadb2 mariadb3 mariadb4; do
    # mariadb4 is the server that gets destroyed later; its log line says so.
    if [ "$replica" = "mariadb4" ]; then
        note="this server will be our victim"
    else
        note="MASTER_USE_GTID=slave_pos"
    fi
    log_info "Configuring ${replica} → replicating from mariadb1 (${note})"
    db "$replica" "$replication_sql" \
        || { log_err "Could not configure replication on ${replica}"; exit 1; }
    log_ok "${replica} slave started"
done

# Verify GTID replication is recognized on each replica
log_info "Verifying GTID replication state on replicas..."
for replica in mariadb2 mariadb3 mariadb4; do
    # This check is advisory. With `set -euo pipefail`, a filter that matches
    # nothing (or a failed query) would abort the script before the warning
    # branch below could run; awk exits 0 without a match and `|| true`
    # absorbs a failing query, leaving the variable empty instead.
    using_gtid=$(db "$replica" "SHOW SLAVE STATUS\G" 2>/dev/null | awk '/Using_Gtid/ {print $2}' || true)
    slave_pos=$(db "$replica" "SELECT @@global.gtid_slave_pos;" 2>/dev/null || true)
    if [ "$using_gtid" = "Slave_Pos" ]; then
        log_ok "${replica}: Using_Gtid=${using_gtid}, gtid_slave_pos=${slave_pos}"
    else
        log_warn "${replica}: Using_Gtid=${using_gtid:-unknown} — expected Slave_Pos"
    fi
done

log_info "Giving MaxScale time to detect the replication topology..."
sleep 5
wait_for_master mariadb1

show_server_states "After replication setup"

# ═══════════════════════════════════════════════════════════════════════════════
log "PHASE 2: Adding mariadb4 to MaxScale via REST API"

log_info "Creating server object for mariadb4 in MaxScale..."
RESPONSE=$(mxs_post "servers" '{
  "data": {
    "id": "mariadb4",
    "type": "servers",
    "attributes": {
      "parameters": {
        "address": "mariadb4",
        "port": 3306,
        "protocol": "MariaDBBackend"
      }
    }
  }
}') || { log_err "MaxScale REST API request failed (${MAXSCALE_URL})"; exit 1; }
# Report success only when the response carries no error document. This
# assumes failures come back with a top-level "errors" member, which is the
# shape the parser on the "Response:" line below already expects.
# A rejected POST is not fatal: presumably mariadb4 can already exist from an
# earlier run — verify against the MaxScale log if the warning appears.
if [[ "$RESPONSE" == *'"errors"'* ]]; then
    log_warn "POST /v1/servers → mariadb4 was NOT created (see response below)"
else
    log_ok "POST /v1/servers → mariadb4 created"
fi
log_info "Response: $(echo "$RESPONSE" | python3 -c "import json,sys; d=json.load(sys.stdin); print(d.get('data',{}).get('id','(see raw)') if 'data' in d else d.get('errors',[{}])[0].get('detail','check logs'))" 2>/dev/null || echo "$RESPONSE")"

# The monitor and the service are both given the same four-server list.
all_four_servers='{
  "data": [
    {"id": "mariadb1", "type": "servers"},
    {"id": "mariadb2", "type": "servers"},
    {"id": "mariadb3", "type": "servers"},
    {"id": "mariadb4", "type": "servers"}
  ]
}'

log_info "Linking mariadb4 to MariaDB-Monitor (all 4 servers in monitor)..."
mxs_patch "monitors/MariaDB-Monitor/relationships/servers" "$all_four_servers" >/dev/null
log_ok "PATCH /v1/monitors/MariaDB-Monitor/relationships/servers → {mariadb1,2,3,4}"

log_info "Linking mariadb4 to Read-Write-Service (all 4 servers)..."
mxs_patch "services/Read-Write-Service/relationships/servers" "$all_four_servers" >/dev/null
log_ok "PATCH /v1/services/Read-Write-Service/relationships/servers → {mariadb1,2,3,4}"

# Give the monitor a moment to pick up the new server before showing states.
sleep 4
show_server_states "After adding mariadb4 to MaxScale"

# ═══════════════════════════════════════════════════════════════════════════════
log "PHASE 3: Setting servers_no_promotion trap"
log_info "Setting servers_no_promotion=mariadb2,mariadb3,mariadb4"
log_info "This means only mariadb1 is eligible for promotion."
log_info "  → mariadb2: EXCLUDED"
log_info "  → mariadb3: EXCLUDED"
log_info "  → mariadb4: EXCLUDED (the victim — will be destroyed next)"

patch_reply=$(mxs_patch "monitors/MariaDB-Monitor" '{
  "data": {
    "id": "MariaDB-Monitor",
    "type": "monitors",
    "attributes": {
      "parameters": {
        "servers_no_promotion": "mariadb2,mariadb3,mariadb4"
      }
    }
  }
}') || { log_err "MaxScale REST API request failed (${MAXSCALE_URL})"; exit 1; }
# The whole reproduction rests on this parameter being in effect: if the PATCH
# is rejected, the failover in PHASE 5 promotes a replica for ordinary reasons
# and the script would report a false "BUG CONFIRMED".
# Assumes MaxScale reports failures as a JSON body with a top-level "errors"
# member — TODO confirm against the REST API documentation.
if [[ "$patch_reply" == *'"errors"'* ]]; then
    log_err "PATCH /v1/monitors/MariaDB-Monitor was rejected: ${patch_reply}"
    log_err "servers_no_promotion is not in effect — aborting, the result would be meaningless."
    exit 1
fi
log_ok "PATCH /v1/monitors/MariaDB-Monitor → servers_no_promotion=mariadb2,mariadb3,mariadb4"

sleep 4
show_monitor_params
show_server_states "After setting servers_no_promotion (exclusions now active)"
echo
log_info "At this point mariadb2 and mariadb3 are CORRECTLY excluded from promotion."
log_info "If mariadb1 failed right now, auto-failover would find NO eligible candidates"
log_info "and refuse to promote anyone."

# ═══════════════════════════════════════════════════════════════════════════════
log "PHASE 4: Triggering the bug — destroying mariadb4 WITHOUT updating servers_no_promotion"
sep
log_warn "NOTE: servers_no_promotion will NOT be updated before removing mariadb4."
log_warn "This leaves 'mariadb4' as a dangling reference in the parameter string."
log_warn "MaxScale's get_monitored_serverlist() will fail validation for the entire list,"
log_warn "silently dropping the exclusions for mariadb2 AND mariadb3."
sep

# Relationship document without mariadb4; sent to the monitor (step 4a) and
# to the service (step 4b).
remaining_servers='{
  "data": [
    {"id": "mariadb1", "type": "servers"},
    {"id": "mariadb2", "type": "servers"},
    {"id": "mariadb3", "type": "servers"}
  ]
}'

log_info "Step 4a — Unlinking mariadb4 from MariaDB-Monitor only (mariadb1,2,3 remain)..."
mxs_patch "monitors/MariaDB-Monitor/relationships/servers" "$remaining_servers" >/dev/null
log_ok "PATCH /v1/monitors/MariaDB-Monitor/relationships/servers → {mariadb1,2,3}"
log_warn "servers_no_promotion still = 'mariadb2,mariadb3,mariadb4' (not updated!)"

sleep 2

log_info "Step 4b — Unlinking mariadb4 from Read-Write-Service..."
mxs_patch "services/Read-Write-Service/relationships/servers" "$remaining_servers" >/dev/null
log_ok "PATCH /v1/services/Read-Write-Service/relationships/servers → {mariadb1,2,3}"

log_info "Step 4c — Destroying mariadb4 from MaxScale..."
log_warn ">>> This is the moment the bug is triggered <<<"
mxs_delete "servers/mariadb4" >/dev/null
log_ok "DELETE /v1/servers/mariadb4"
log_warn "mariadb4 destroyed. servers_no_promotion still references it."
log_warn "MaxScale monitor will now reconfigure and call post_configure()."
log_warn "get_monitored_serverlist() will fail for 'mariadb4' → entire exclusion list dropped."

log_info "Waiting for monitor to reconfigure..."
sleep 5

log_info "Checking MaxScale logs for the dangling-reference error..."
printf '\n'
printf '%b\n' "  ${BOLD}MaxScale logs — look for 'is not monitored by monitor' error:${NC}"
# `|| true`: no matching log line is not an error for the reproduction.
docker logs rca-maxscale-1 --tail=40 2>&1 \
    | grep --color=always -E \
        '(error|warning|not monitored|promotion|excluded|servers_no_promotion|post_configure)' \
    | sed -e 's/^/    /' \
    || true

printf '\n'
show_monitor_params
log_warn "servers_no_promotion parameter string still shows 'mariadb2,mariadb3,mariadb4'"
log_warn "but the internal server flags have been SILENTLY RESET to available_for_promotion=true"

show_server_states "After mariadb4 destroyed — exclusions should be GONE (bug)"
printf '\n'
log_warn "mariadb2 and mariadb3 are no longer excluded — they are now eligible for promotion."

# ═══════════════════════════════════════════════════════════════════════════════
log "PHASE 5: Proving the impact — triggering failover"
log_info "Stopping mariadb1 (the primary) to force auto-failover..."
log_info "If servers_no_promotion were working: NO promotion would occur (no eligible candidates)."
log_info "Because of the bug: mariadb2 or mariadb3 will be promoted."

docker stop rca-mariadb1-1
log_ok "rca-mariadb1-1 stopped"

log_info "Polling MaxScale for new primary (up to 90s)..."
printf '  '
NEW_MASTER=""
# 30 polls, 3 seconds apart. A failed or unparsable poll counts as
# "no new primary yet" (`|| true`) rather than ending the script.
for ((i = 1; i <= 30; i++)); do
    NEW_MASTER=$(mxs_get "servers" | python3 -c "
import json, sys
payload = json.load(sys.stdin)
for server in payload['data']:
    status = server['attributes'].get('state', '')
    if 'Master' in status and 'Running' in status and server['id'] != 'mariadb1':
        print(server['id'])
        break
" 2>/dev/null || true)
    if [[ -n "$NEW_MASTER" ]]; then
        printf '%b\n' " ${GREEN}new primary detected: ${NEW_MASTER}${NC}"
        break
    fi
    printf '.'
    sleep 3
done

printf '\n'
show_server_states "Post-failover"

printf '\n'
if [[ -z "$NEW_MASTER" || "$NEW_MASTER" == "mariadb1" ]]; then
    log_warn "No new primary detected within timeout — check MaxScale logs manually"
else
    # Any server other than mariadb1 becoming Master is the bug.
    bug_confirmed "${NEW_MASTER} was promoted — it should have been EXCLUDED"
    printf '\n'
    log_err "Expected: no promotion (mariadb2 and mariadb3 are in servers_no_promotion)"
    log_err "Actual:   ${NEW_MASTER} was promoted because exclusions were silently dropped"
    printf '\n'
    log_info "Root cause: destroying mariadb4 without removing it from servers_no_promotion"
    log_info "caused get_monitored_serverlist() in monitor.cc to return ok=false,"
    log_info "which caused post_configure() in mariadbmon.cc to skip ALL exclusion tagging."
    printf '\n'
    log_info "Workaround: always run the following BEFORE destroying a server:"
    log_info "  PATCH /v1/monitors/MariaDB-Monitor with servers_no_promotion=mariadb2,mariadb3"
fi

printf '\n'
log_info "Full MaxScale logs for the failover event:"
show_maxscale_logs 50

sep
printf '%b\n' "${BOLD}  Reproduction complete.${NC}"
sep
