foss_breakglass_mirror_v2/scripts/breakglass-healthcheck.sh

220 lines
7.9 KiB
Bash

#!/usr/bin/env bash
# ═══════════════════════════════════════════════════════════
# breakglass-healthcheck.sh — integrity & freshness checks
# ═══════════════════════════════════════════════════════════
# Checks:
# 1. Gitea reachable
# 2. Sync timer active
# 3. Recent sync logs exist
# 4. No repos are stale (FETCH_HEAD age)
# 5. Backup refs exist in every repo (append-only proof)
# 6. Audit log checksums haven't been tampered with
# 7. Local bare repos have not lost refs (deletion detection)
# ═══════════════════════════════════════════════════════════
# Strict mode: stop on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Configuration comes from an env file; BREAKGLASS_ENV overrides its location.
ENV_FILE="${BREAKGLASS_ENV:-/etc/breakglass/mirror.env}"
if [[ ! -f "$ENV_FILE" ]]; then
  echo "FATAL: $ENV_FILE not found" >&2
  exit 1
fi
# shellcheck source=/dev/null
source "$ENV_FILE"

# Defaults for every setting the env file left unset or empty.
MIRROR_ROOT="${MIRROR_ROOT:-/var/lib/breakglass/repos}"
LOG_DIR="${LOG_DIR:-/var/log/breakglass}"
AUDIT_DIR="${AUDIT_DIR:-/var/lib/breakglass/audit}"
STALE_DAYS="${STALE_DAYS:-7}"
NOTIFY_METHOD="${NOTIFY_METHOD:-none}"

# Findings gathered by the checks below and summarised in the final report.
PROBLEMS=()
WARNINGS=()
# Print all arguments (joined by spaces) on one line, prefixed with the
# current UTC time as HH:MM:SS.
log() {
  local stamp
  stamp=$(date -u +%H:%M:%S) || true
  printf '%s %s\n' "$stamp" "$*"
}
#######################################
# Send an alert through the channel selected by NOTIFY_METHOD.
# Delivery is best-effort: transport errors are discarded so a broken
# notification channel never changes the health check's own result.
# Globals (read): NOTIFY_METHOD, NTFY_SERVER, NTFY_TOPIC, NOTIFY_EMAIL,
#                 TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID
# Arguments: $1 - message text (may span several lines)
# Returns:   0 in all cases
#######################################
notify() {
  local message="$1"
  case "${NOTIFY_METHOD:-none}" in
    ntfy)
      curl -sfS -d "$message" "${NTFY_SERVER:-https://ntfy.sh}/${NTFY_TOPIC:-}" &>/dev/null || true
      ;;
    email)
      printf '%s\n' "$message" \
        | mail -s "Breakglass Health Alert" "${NOTIFY_EMAIL:-}" 2>/dev/null || true
      ;;
    telegram)
      # The body is form-encoded, so the fields must be URL-encoded: with a
      # plain -d, characters such as '&', '%' or '+' in the alert text would
      # split or corrupt the "text" field.
      curl -sfS -X POST "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN:-}/sendMessage" \
        --data-urlencode "chat_id=${TELEGRAM_CHAT_ID:-}" \
        --data-urlencode "text=${message}" &>/dev/null || true
      ;;
    *) ;;
  esac
}
# ── Check 1: Gitea reachable ────────────────────────────
# Probe the version endpoint of the Gitea API; any curl failure (connection
# error, timeout after 10s, HTTP error status) counts as unreachable.
log "Check 1: Gitea connectivity …"
if curl -sfS --connect-timeout 10 "${GITEA_URL}/api/v1/version" &>/dev/null; then
  log " OK"
else
  PROBLEMS+=("Gitea at ${GITEA_URL} is unreachable")
  log " FAIL"
fi
# ── Check 2: Sync timer active ──────────────────────────
# Ask systemd whether breakglass-sync.timer is active; anything else
# (inactive, failed, or systemctl itself erroring) is recorded as a problem.
log "Check 2: Sync timer …"
if ! systemctl is-active --quiet breakglass-sync.timer 2>/dev/null; then
  PROBLEMS+=("breakglass-sync.timer is not active")
  log " FAIL"
else
  log " OK"
fi
# ── Check 3: Recent sync log ────────────────────────────
# Locate the newest sync-*.log under LOG_DIR (by mtime) and record a problem
# when none exists or when it is more than STALE_DAYS days old.
log "Check 3: Recent sync logs …"
# `|| true`: with `set -e -o pipefail`, a failing find (e.g. LOG_DIR does not
# exist) would otherwise abort the script here instead of being reported.
# awk takes the first line itself rather than piping through `head`, so no
# upstream stage can die of SIGPIPE, and it strips only the leading
# timestamp so paths containing spaces are kept intact.
LATEST_LOG=$(
  find "$LOG_DIR" -name 'sync-*.log' -printf '%T@ %p\n' 2>/dev/null \
    | sort -rn \
    | awk 'NR == 1 { sub(/^[^ ]+ /, ""); print }'
) || true
if [[ -z "$LATEST_LOG" ]]; then
  PROBLEMS+=("No sync logs found in $LOG_DIR")
  log " FAIL"
else
  LOG_AGE_DAYS=$(( ( $(date +%s) - $(stat -c %Y "$LATEST_LOG") ) / 86400 ))
  if (( LOG_AGE_DAYS > STALE_DAYS )); then
    # NOTE(review): logged as WARN but counted as a critical problem —
    # confirm which severity is intended.
    PROBLEMS+=("Last sync log is ${LOG_AGE_DAYS} days old")
    log " WARN: ${LOG_AGE_DAYS}d old"
  else
    log " OK: ${LOG_AGE_DAYS}d old"
  fi
fi
# ── Check 4: Repo freshness ─────────────────────────────
# For every directory under MIRROR_ROOT (up to 3 levels deep) that contains a
# file named HEAD, compare the mtime of its FETCH_HEAD with the staleness
# threshold. Stale or missing FETCH_HEAD files are recorded as warnings.
log "Check 4: Repo freshness (threshold: ${STALE_DAYS}d) …"
now=$(date +%s)
STALE_THRESHOLD=$(( now - STALE_DAYS * 86400 ))
REPO_COUNT=0
STALE_COUNT=0
if [[ -d "$MIRROR_ROOT" ]]; then
  while IFS= read -r bare_dir; do
    [[ -d "$bare_dir" ]] || continue
    # Plain assignment instead of `(( REPO_COUNT++ ))`: the post-increment
    # evaluates to 0 on the first repo, which gives exit status 1 and makes
    # `set -e` terminate the whole script.
    REPO_COUNT=$(( REPO_COUNT + 1 ))
    # Inner expansion is quoted so MIRROR_ROOT is stripped literally rather
    # than being interpreted as a glob pattern.
    repo_name="${bare_dir#"${MIRROR_ROOT}"/}"
    repo_name="${repo_name%.git}"
    fetch_head="$bare_dir/FETCH_HEAD"
    if [[ -f "$fetch_head" ]]; then
      last_fetch=$(stat -c %Y "$fetch_head")
      if (( last_fetch < STALE_THRESHOLD )); then
        days_stale=$(( ( now - last_fetch ) / 86400 ))
        WARNINGS+=("${repo_name}: last fetched ${days_stale}d ago")
        STALE_COUNT=$(( STALE_COUNT + 1 ))
        log " STALE: $repo_name (${days_stale}d)"
      fi
    else
      WARNINGS+=("${repo_name}: no FETCH_HEAD")
      log " WARN: $repo_name has no FETCH_HEAD"
    fi
  done < <(find "$MIRROR_ROOT" -maxdepth 3 -name 'HEAD' -execdir pwd \;)
fi
log " $REPO_COUNT repos checked, $STALE_COUNT stale"
# ── Check 5: Backup refs exist (append-only proof) ──────
# Count the refs under refs/backup/ in every discovered repo directory and
# warn about each repo that has none.
log "Check 5: Backup ref integrity …"
REPOS_WITHOUT_BACKUPS=0
if [[ -d "$MIRROR_ROOT" ]]; then
  while IFS= read -r bare_dir; do
    [[ -d "$bare_dir" ]] || continue
    repo_name="${bare_dir#"${MIRROR_ROOT}"/}"
    repo_name="${repo_name%.git}"
    # `|| true`: if git fails, pipefail would fail the assignment and `set -e`
    # would abort the run; wc still yields 0, so the repo is reported below.
    backup_count=$(git -C "$bare_dir" for-each-ref --format='x' refs/backup/ 2>/dev/null | wc -l) || true
    if (( backup_count == 0 )); then
      WARNINGS+=("${repo_name}: no backup refs — append-only not working?")
      # Not `(( REPOS_WITHOUT_BACKUPS++ ))`: that returns status 1 when the
      # counter is 0 and would trip `set -e` on the first finding.
      REPOS_WITHOUT_BACKUPS=$(( REPOS_WITHOUT_BACKUPS + 1 ))
      log " WARN: $repo_name has no backup refs"
    fi
  done < <(find "$MIRROR_ROOT" -maxdepth 3 -name 'HEAD' -execdir pwd \;)
fi
if (( REPOS_WITHOUT_BACKUPS > 0 )); then
  log " $REPOS_WITHOUT_BACKUPS repos missing backup refs"
else
  log " OK: all repos have backup refs"
fi
# ── Check 6: Audit log checksums ────────────────────────
# Each line of $AUDIT_DIR/checksums.log is read as "<sha256> <path>". Every
# listed file that still exists is re-hashed; a mismatch is recorded as a
# critical problem. The check is skipped when the checksum file is absent.
log "Check 6: Audit log integrity …"
CHECKSUM_FILE="$AUDIT_DIR/checksums.log"
if [[ -f "$CHECKSUM_FILE" ]]; then
  TAMPERED=0
  # The `|| [[ -n ... ]]` clause keeps a final line with no trailing newline.
  while read -r expected_hash filepath || [[ -n "$expected_hash" ]]; do
    # NOTE(review): listed files that no longer exist are skipped silently,
    # so deleting an audit log is not flagged — confirm this is intended.
    if [[ -f "$filepath" ]]; then
      actual_hash=$(sha256sum "$filepath" | awk '{print $1}')
      if [[ "$actual_hash" != "$expected_hash" ]]; then
        PROBLEMS+=("TAMPERED audit log: $filepath")
        # Not `(( TAMPERED++ ))`: with the counter at 0 that returns status 1
        # and `set -e` would kill the script on the very first tampered file,
        # before any alert could be sent.
        TAMPERED=$(( TAMPERED + 1 ))
      fi
    fi
  done < "$CHECKSUM_FILE"
  if (( TAMPERED > 0 )); then
    log " FAIL: $TAMPERED tampered audit files"
  else
    log " OK"
  fi
else
  log " SKIP: no checksums yet (first run?)"
fi
# ── Check 7: Ref count file — detect local ref deletion ─
# Count refs (heads, tags, notes, backup) per repo, compare against the counts
# saved by the previous run in $AUDIT_DIR/ref-counts.dat, and record a
# critical problem for every repo whose count went down. The current counts
# then replace the saved ones.
log "Check 7: Ref count stability …"
REF_COUNT_FILE="$AUDIT_DIR/ref-counts.dat"
if [[ -d "$MIRROR_ROOT" ]]; then
  CURRENT_COUNTS=$(mktemp)
  DECREASED_REPOS=0
  while IFS= read -r bare_dir; do
    [[ -d "$bare_dir" ]] || continue
    repo_name="${bare_dir#"${MIRROR_ROOT}"/}"
    repo_name="${repo_name%.git}"
    # `|| true`: a git failure must not abort the run via pipefail + set -e.
    count=$(git -C "$bare_dir" for-each-ref --format='x' refs/heads refs/tags refs/notes refs/backup 2>/dev/null | wc -l) || true
    echo "$repo_name $count" >> "$CURRENT_COUNTS"
  done < <(find "$MIRROR_ROOT" -maxdepth 3 -name 'HEAD' -execdir pwd \;)
  if [[ -f "$REF_COUNT_FILE" ]]; then
    while read -r repo prev_count; do
      # Ignore malformed lines so the arithmetic comparison cannot error out.
      [[ "$prev_count" =~ ^[0-9]+$ ]] || continue
      # Exact string match on the repo name. The previous `grep "^${repo} "`
      # treated the name as a regex ('.' matched any character) and, when a
      # repo had disappeared, its non-zero status aborted the script under
      # pipefail + set -e. The `""` concatenations force a string comparison.
      curr_count=$(awk -v r="$repo" '($1 "") == (r "") { print $2; exit }' "$CURRENT_COUNTS") || true
      if [[ -n "$curr_count" ]] && (( curr_count < prev_count )); then
        PROBLEMS+=("${repo}: ref count DECREASED ($prev_count → $curr_count) — possible local tampering")
        DECREASED_REPOS=$(( DECREASED_REPOS + 1 ))
        log " ALERT: $repo refs decreased $prev_count → $curr_count"
      fi
    done < "$REF_COUNT_FILE"
  fi
  # Save current counts for next run. A failure here is reported as a warning
  # instead of aborting, so findings collected so far still reach the report.
  cp "$CURRENT_COUNTS" "$REF_COUNT_FILE" \
    || WARNINGS+=("could not save ref counts to $REF_COUNT_FILE")
  rm -f "$CURRENT_COUNTS"
  if (( DECREASED_REPOS > 0 )); then
    log " FAIL: $DECREASED_REPOS repo(s) lost refs"
  else
    log " OK"
  fi
fi
# ── Report ───────────────────────────────────────────────
# Summarise the collected findings, send one notification when there is
# anything to report, and exit 1 if any critical problem was recorded
# (warnings alone still exit 0).
log ""
if (( ${#PROBLEMS[@]} == 0 && ${#WARNINGS[@]} == 0 )); then
  log "═══ Health check PASSED — all mirrors healthy ═══"
  exit 0
fi
# The header is always present, so a warnings-only alert is still
# identifiable. Real newlines are used instead of "\n" + `echo -e`, which
# would also reinterpret any backslash contained in a finding's text.
REPORT="BREAKGLASS HEALTH ALERT: ${#PROBLEMS[@]} critical, ${#WARNINGS[@]} warnings"$'\n'
if (( ${#PROBLEMS[@]} > 0 )); then
  log "═══ Health check FAILED — ${#PROBLEMS[@]} critical issue(s): ═══"
  REPORT+=$'\n'"CRITICAL:"$'\n'
  for p in "${PROBLEMS[@]}"; do
    log "$p"
    REPORT+="${p}"$'\n'
  done
fi
if (( ${#WARNINGS[@]} > 0 )); then
  log " ${#WARNINGS[@]} warning(s):"
  REPORT+=$'\n'"WARNINGS:"$'\n'
  for w in "${WARNINGS[@]}"; do
    log "$w"
    REPORT+="${w}"$'\n'
  done
fi
# Drop the trailing newline before sending.
notify "${REPORT%$'\n'}"
exit $(( ${#PROBLEMS[@]} > 0 ? 1 : 0 ))