#!/bin/bash
set -e

# End-to-end test of the Debian systemd integration for receive-wal
# (debian/patches/systemd-receive-wal-integration.patch + the
# barman-receive-wal@.service template and polkit rule). It verifies that:
#
#   1. "barman cron", run as the unprivileged barman user, starts the
#      per-server unit barman-receive-wal@testdb.service via systemd
#      (exercising the shipped polkit rule), and the receiver connects.
#   2. "barman receive-wal --stop", run as the barman user, stops the *unit*
#      (detected from the process cgroup) rather than signalling the PID:
#      the receiver must stay down, proving Restart=on-failure did not revive it.
#   3. Setting streaming_archiver=off makes the next "barman cron" stop the
#      unit.
#
# Requires a systemd testbed (isolation-machine) and root.

UNIT="barman-receive-wal@testdb.service"
BARMAN_CONF=/etc/barman.conf
BARMAN_CONF_BAK=
BARMAN_PASSWORD=barman_autopkgtest
STREAMING_PASSWORD=streaming_autopkgtest
PGPORT=

as_barman() { runuser -u barman -- "$@"; }
as_postgres() { runuser -u postgres -- "$@"; }
run_psql() { as_postgres psql -p "$PGPORT" "$@"; }

CRON_D=/etc/cron.d/barman
CRON_D_BAK=
POLKIT_RULE=/usr/share/polkit-1/rules.d/50-barman-receive-wal.rules
POLKIT_RULE_BAK=

cleanup() {
    echo "=== Cleaning up ==="
    # Stop the receiver whether it is a systemd unit or a forked fallback.
    as_barman barman receive-wal --stop testdb 2>/dev/null || true
    systemctl stop "$UNIT" 2>/dev/null || true
    as_barman barman receive-wal --drop-slot testdb 2>/dev/null || true
    if [ -n "$BARMAN_CONF_BAK" ] && [ -e "$BARMAN_CONF_BAK" ]; then
        mv -f "$BARMAN_CONF_BAK" "$BARMAN_CONF"
    else
        rm -f "$BARMAN_CONF"
    fi
    systemctl unmask barman.timer 2>/dev/null || true
    if [ -n "$CRON_D_BAK" ] && [ -e "$CRON_D_BAK" ]; then
        mv -f "$CRON_D_BAK" "$CRON_D"
    fi
    if [ -n "$POLKIT_RULE_BAK" ] && [ -e "$POLKIT_RULE_BAK" ]; then
        mv -f "$POLKIT_RULE_BAK" "$POLKIT_RULE"
        systemctl reload polkit.service 2>/dev/null \
            || systemctl restart polkit.service 2>/dev/null || true
    fi
}
trap cleanup EXIT

# This test drives cron by hand and assumes it is the only actor; a background
# run via barman.timer or /etc/cron.d/barman racing it (re-starting the unit or
# forking a competing receiver) would make it flaky. Neutralise both, restore on exit.
echo "=== Disabling background cron drivers (test drives cron by hand) ==="
systemctl stop barman.timer 2>/dev/null || true
systemctl mask barman.timer 2>/dev/null || true
if [ -e "$CRON_D" ]; then
    CRON_D_BAK="${CRON_D}.autopkgtest-bak"
    mv -f "$CRON_D" "$CRON_D_BAK"
fi

# Wait until "systemctl is-active <unit>" matches the expected state.
wait_unit_state() { # state timeout
    local want="$1" timeout="${2:-30}" elapsed=0 got
    while [ "$elapsed" -lt "$timeout" ]; do
        got=$(systemctl is-active "$UNIT" 2>/dev/null || true)
        [ "$got" = "$want" ] && return 0
        # When waiting for the unit to stop, a 'failed' state means the stop
        # did not exit cleanly (e.g. the wrong signal was sent); it will not
        # reach 'inactive' on its own, so fail fast with a clear message
        # instead of waiting out the timeout.
        if [ "$got" = "failed" ] && [ "$want" = "inactive" ]; then
            echo "ERROR: $UNIT entered 'failed' while waiting for it to stop" >&2
            systemctl status "$UNIT" --no-pager || true
            return 1
        fi
        sleep 1
        elapsed=$((elapsed + 1))
    done
    echo "ERROR: $UNIT did not become '$want' within ${timeout}s (last: '$got')" >&2
    systemctl status "$UNIT" --no-pager || true
    return 1
}

# Wait until the replication slot is active (a receiver is connected).
wait_for_slot_active() { # timeout
    local timeout="${1:-30}" elapsed=0 active
    while [ "$elapsed" -lt "$timeout" ]; do
        active=$(run_psql -AqtX -c \
            "SELECT active FROM pg_replication_slots WHERE slot_name = 'barman'" \
            2>/dev/null || true)
        [ "$active" = "t" ] && return 0
        sleep 1
        elapsed=$((elapsed + 1))
    done
    echo "ERROR: replication slot did not become active within ${timeout}s" >&2
    return 1
}

# --- locate (and if needed start) a PostgreSQL cluster -------------------
PGPORT=$(pg_lsclusters -h | awk '$4 == "online" { print $3; exit }')
if [ -z "$PGPORT" ]; then
    read -r ver cluster <<EOF
$(pg_lsclusters -h | awk 'NR == 1 { print $1, $2 }')
EOF
    [ -n "$ver" ] || { echo "ERROR: no PostgreSQL cluster found" >&2; exit 1; }
    pg_ctlcluster "$ver" "$cluster" start
    PGPORT=$(pg_lsclusters -h | awk '$4 == "online" { print $3; exit }')
fi
[ -n "$PGPORT" ] || { echo "ERROR: no online PostgreSQL cluster" >&2; exit 1; }
echo "Using PostgreSQL on port $PGPORT"

# --- PostgreSQL roles and host-based authentication ----------------------
echo "=== Creating PostgreSQL roles ==="
run_psql -c "CREATE USER barman SUPERUSER PASSWORD '$BARMAN_PASSWORD';"
run_psql -c "CREATE USER streaming_barman REPLICATION LOGIN PASSWORD '$STREAMING_PASSWORD';"

echo "=== Allowing password authentication from localhost ==="
hba_file=$(run_psql -AqtX -c "SHOW hba_file")
{
    printf 'host all barman 127.0.0.1/32 scram-sha-256\n'
    printf 'host replication streaming_barman 127.0.0.1/32 scram-sha-256\n'
} >> "$hba_file"
run_psql -c "SELECT pg_reload_conf();"

# --- barman configuration (the path the systemd unit reads) --------------
echo "=== Writing $BARMAN_CONF ==="
if [ -e "$BARMAN_CONF" ]; then
    BARMAN_CONF_BAK="${BARMAN_CONF}.autopkgtest-bak"
    mv -f "$BARMAN_CONF" "$BARMAN_CONF_BAK"
fi
cat > "$BARMAN_CONF" <<EOF
[barman]
barman_home = /var/lib/barman
barman_user = barman
log_file = /var/log/barman/barman.log
configuration_files_directory = /etc/barman.d

[testdb]
description = "Autopkgtest systemd integration"
conninfo = host=127.0.0.1 port=${PGPORT} user=barman password=${BARMAN_PASSWORD} dbname=postgres sslmode=disable
streaming_conninfo = host=127.0.0.1 port=${PGPORT} user=streaming_barman password=${STREAMING_PASSWORD} sslmode=disable
backup_method = postgres
streaming_archiver = on
slot_name = barman
EOF

# --- 1. cron starts the unit (via polkit, as the barman user) -----------
echo "=== Creating replication slot ==="
as_barman barman receive-wal --create-slot testdb

echo "=== Running 'barman cron' as the barman user ==="
as_barman barman -q cron

echo "=== Asserting the systemd unit is active ==="
wait_unit_state active 30
echo "=== Asserting the receiver connected (slot active) ==="
wait_for_slot_active 30

# A second cron run must be a harmless no-op (systemctl start is idempotent).
echo "=== Re-running cron (idempotent) ==="
as_barman barman -q cron
wait_unit_state active 5

# --- 1b. config-switch restarts the receiver via systemd (restart_processes) --
# "barman config-switch" (and config-update) call Server.restart_processes(),
# the second code path that routes receiver startup through systemd: it stops
# the receiver and starts it again. Exercise it and re-assert the receiver is up.
echo "=== Restarting via 'barman config-switch --reset' (exercises restart_processes) ==="
as_barman barman config-switch testdb --reset
wait_unit_state active 30
wait_for_slot_active 30
echo "  config-switch restarted the receiver under systemd: OK"

# --- 2. receive-wal --stop stops the UNIT, and it stays stopped ----------
echo "=== Stopping via 'barman receive-wal --stop' (as barman user) ==="
as_barman barman receive-wal --stop testdb
wait_unit_state inactive 15

echo "=== Asserting the receiver is NOT revived ==="
# A clean --stop leaves the unit inactive and it must stay that way. If the stop
# had instead left the receiver in a failed state (e.g. the wrong kill signal),
# Restart=on-failure would bring it back within RestartSec (10s); confirm it is
# still inactive after 15s.
sleep 15
if systemctl is-active --quiet "$UNIT"; then
    echo "ERROR: $UNIT came back after --stop (it should have been stopped, not signalled)" >&2
    systemctl status "$UNIT" --no-pager || true
    exit 1
fi
echo "  unit stayed stopped: OK"

# --- 3. disabling streaming_archiver stops the unit via cron -------------
echo "=== Restarting receiver, then disabling streaming_archiver ==="
as_barman barman -q cron
wait_unit_state active 30
# Wait until the receiver has actually connected before flipping the config.
# The unit is Type=simple, so it reports "active" the instant the process is
# forked, before "barman receive-wal" has read its configuration; disabling
# streaming in that window makes the receiver read streaming_archiver=off and
# exit non-zero, which Restart=on-failure would then crash-loop on.
wait_for_slot_active 30

sed -i 's/^streaming_archiver = on/streaming_archiver = off/' "$BARMAN_CONF"
as_barman barman -q cron
wait_unit_state inactive 15
echo "  cron stopped the unit after disabling streaming: OK"

# --- 4. fork-fallback when the unit cannot be managed (no polkit) ---------
# Move the shipped polkit rule aside and reload polkit so the barman user is no
# longer authorised to manage the unit, mimicking a host without polkit. Then
# run cron through the real barman.service oneshot and assert the receiver is
# started by FORKING (not as a unit) and OUTLIVES the oneshot exit, which is
# exactly what KillMode=process in barman.service guarantees (Debian #1138696).
echo "=== Simulating absent polkit authorisation (moving the rule aside) ==="
sed -i 's/^streaming_archiver = off/streaming_archiver = on/' "$BARMAN_CONF"
if [ -e "$POLKIT_RULE" ]; then
    POLKIT_RULE_BAK="${POLKIT_RULE}.autopkgtest-bak"
    mv -f "$POLKIT_RULE" "$POLKIT_RULE_BAK"
    systemctl reload polkit.service 2>/dev/null \
        || systemctl restart polkit.service 2>/dev/null || true
fi

echo "=== Running cron via the barman.service oneshot (expect fork fallback) ==="
# barman.service runs "barman cron" as the barman user; the unit start is now
# denied, so cron must fall back to forking. systemctl start blocks until the
# oneshot has finished and its cgroup has been torn down, so a receiver that is
# streaming afterwards has necessarily survived that teardown.
systemctl start barman.service

echo "=== Asserting a receiver is streaming, but NOT as the systemd unit ==="
wait_for_slot_active 30
if systemctl is-active --quiet "$UNIT"; then
    echo "ERROR: $UNIT is active; expected the forked fallback, not a unit" >&2
    systemctl status "$UNIT" --no-pager || true
    exit 1
fi
echo "  receiver streaming via forked fallback, unit not active: OK"

echo "=== Asserting the forked receiver outlived the barman.service oneshot ==="
# Re-check after a short delay: without KillMode=process the control-group
# teardown of the oneshot would have killed the forked receiver by now.
sleep 5
wait_for_slot_active 5
echo "  forked receiver survived the oneshot exit (KillMode=process): OK"

echo "=== All systemd integration tests passed ==="
