#!/usr/bin/env bash
set -euo pipefail

# Hermes FleetOps remote bootstrap v2
# Purpose: install/join Tailscale, create a minimal SSH diagnostics user,
# install user/root diagnostics collectors, and optionally install a narrow sudoers rule.
# Supported: Ubuntu/Debian apt-based Linux.
# Secrets: pass auth key via --authkey, --authkey-file, or TAILSCALE_AUTHKEY env.
# Do NOT hardcode real auth keys in this file.

# ---- Defaults; most are overridden by the command-line flags parsed below ----
VERSION='2.0.0'
MODE='install'                      # install | dry-run | status

# Tailscale join settings.
AUTHKEY="${TAILSCALE_AUTHKEY:-}"    # seeded from the environment when present
AUTHKEY_FILE=''
HOSTNAME_OVERRIDE=''
ACCEPT_ROUTES='false'
TS_SSH='false'

# Diagnostics account and the SSH public key authorized for it.
DIAG_USER='hermesops'
CONTROL_PUBKEY=''
CONTROL_PUBKEY_FILE=''

# What gets installed, and where.
INSTALL_DIAG='true'
INSTALL_USER_DIAG='true'
INSTALL_SUDOERS='false'
REMOTE_DIR='/opt/hermes-remote'
SUDOERS_PATH='/etc/sudoers.d/hermesops-hermes-remote-diagnose'

# NOTE(review): no flag sets this and nothing in this script reads it.
ALLOW_APT_AUTOREMOVE='false'

# Timestamp prefix shared by the logging helpers: `date -Is` output when that
# option is supported, otherwise the plain `date` output.
_log_stamp(){
  date -Is 2>/dev/null || date
}

# log MESSAGE... — informational line on stdout.
log(){
  printf '[%s] %s\n' "$(_log_stamp)" "$*"
}

# warn MESSAGE... — warning line on stderr.
warn(){
  printf '[%s] WARN: %s\n' "$(_log_stamp)" "$*" >&2
}

# err MESSAGE... — error line on stderr (does not exit by itself).
err(){
  printf '[%s] ERROR: %s\n' "$(_log_stamp)" "$*" >&2
}

# redact — stdin-to-stdout filter that masks every tskey-auth-… token.
redact(){
  sed -E 's/tskey-auth-[A-Za-z0-9_-]+/tskey-auth-[REDACTED]/g'
}

# Print the command-line help (synopsis, options, exit codes, security notes)
# on stdout. The here-document delimiter is quoted, so the text is emitted
# verbatim with no parameter or command expansion.
usage() {
  cat <<'USAGE_TEXT'
Hermes FleetOps remote bootstrap v2

Usage:
  ./hermes-remote-bootstrap-v2.sh --dry-run
  sudo ./hermes-remote-bootstrap-v2.sh --authkey 'tskey-auth-...' --control-pubkey 'ssh-ed25519 ...'
  sudo TAILSCALE_AUTHKEY='tskey-auth-...' ./hermes-remote-bootstrap-v2.sh --control-pubkey-file ./id_ed25519.pub

Options:
  --dry-run                     Validate OS, repos, dependencies, current Tailscale state; no system changes.
  --status                      Print current Tailscale/SSH/diagnostic state; no system changes.
  --authkey KEY                 Tailscale auth key. Prefer ephemeral + pre-approved key.
  --authkey-file PATH           Read Tailscale auth key from file on remote machine.
  --hostname NAME               Tailscale node hostname override.
  --accept-routes true|false    Default false.
  --ssh true|false              Enable Tailscale SSH. Default false; normal OpenSSH is used.
  --diag-user USER              Dedicated diagnostics SSH user. Default hermesops. Empty disables creation.
  --control-pubkey KEY          SSH public key authorized for diag-user.
  --control-pubkey-file PATH    Read SSH public key from file on remote machine.
  --install-sudoers             Allow diag-user to run ONLY /opt/hermes-remote/diagnose.sh via sudo NOPASSWD.
  --no-diag                     Do not install diagnostics scripts.
  --no-user-diag                Do not install user-mode diagnostics under diag-user home.
  --remote-dir PATH             Root diagnostics install dir. Default /opt/hermes-remote.
  -h|--help                     Show help.

Exit codes:
  0 success
  2 bad arguments
  3 unsupported OS/codename
  4 repo endpoint/network validation failed
  5 install mode requires root
  6 missing auth key when node is not logged in
  7 sudoers validation failed

Security notes:
  - Do not paste auth keys into logs/tickets. Revoke keys after bootstrap.
  - Prefer Tailscale auth keys that are ephemeral, pre-approved, short-lived, and tag-scoped.
  - This script never stores the auth key on disk unless you explicitly provide an authkey file.
USAGE_TEXT
}

# bool_arg VALUE FLAG
# Echo VALUE (no trailing newline) when it is exactly "true" or "false".
# Anything else reports "FLAG must be true or false" through err and exits
# with status 2. Callers capture the output via command substitution, so the
# exit terminates that subshell and surfaces as a failing assignment.
bool_arg(){
  local candidate="${1:-}"
  if [ "$candidate" = "true" ] || [ "$candidate" = "false" ]; then
    printf '%s' "$1"
    return
  fi
  err "$2 must be true or false"
  exit 2
}

# require_value FLAG ARGC
# Abort with the documented "bad arguments" status (2) unless FLAG is followed
# by at least one more word. Without this guard a trailing value-taking flag
# made `shift 2` fail and `set -e` killed the script silently with status 1.
require_value(){
  [ "$2" -ge 2 ] || { err "$1 requires a value"; exit 2; }
}

# parse_args ARG...
# Translate command-line flags into the global settings (MODE, AUTHKEY,
# DIAG_USER, ...). An explicitly empty value (e.g. --diag-user '') is accepted.
# Exits 0 after printing help for -h/--help, and 2 for an unknown flag, a
# missing value, or an invalid boolean.
parse_args(){
  while [ $# -gt 0 ]; do
    case "$1" in
      --dry-run) MODE="dry-run"; shift ;;
      --status) MODE="status"; shift ;;
      --authkey) require_value "$1" "$#"; AUTHKEY="$2"; shift 2 ;;
      --authkey-file) require_value "$1" "$#"; AUTHKEY_FILE="$2"; shift 2 ;;
      --hostname) require_value "$1" "$#"; HOSTNAME_OVERRIDE="$2"; shift 2 ;;
      --accept-routes)
        require_value "$1" "$#"
        # bool_arg exits 2 inside the substitution; propagate that explicitly.
        ACCEPT_ROUTES=$(bool_arg "$2" "--accept-routes") || exit 2
        shift 2
        ;;
      --ssh)
        require_value "$1" "$#"
        TS_SSH=$(bool_arg "$2" "--ssh") || exit 2
        shift 2
        ;;
      --diag-user) require_value "$1" "$#"; DIAG_USER="$2"; shift 2 ;;
      --control-pubkey) require_value "$1" "$#"; CONTROL_PUBKEY="$2"; shift 2 ;;
      --control-pubkey-file) require_value "$1" "$#"; CONTROL_PUBKEY_FILE="$2"; shift 2 ;;
      --install-sudoers) INSTALL_SUDOERS="true"; shift ;;
      --no-diag) INSTALL_DIAG="false"; shift ;;
      --no-user-diag) INSTALL_USER_DIAG="false"; shift ;;
      --remote-dir) require_value "$1" "$#"; REMOTE_DIR="$2"; shift 2 ;;
      -h|--help) usage; exit 0 ;;
      *) err "unknown argument: $1"; usage; exit 2 ;;
    esac
  done
}

parse_args "$@"

# pubkey_lines TEXT
# Print the usable lines of an OpenSSH public-key blob: carriage returns
# removed, surrounding whitespace trimmed, blank lines and '#' comments dropped.
pubkey_lines(){
  printf '%s\n' "$1" \
    | tr -d '\r' \
    | sed -E -e 's/^[[:space:]]+//' -e 's/[[:space:]]+$//' -e '/^(#.*)?$/d'
}

# Load file-based secrets. An unreadable path is a bad argument (exit 2)
# rather than an unexplained `set -e` abort from the failed redirection.
if [ -n "$AUTHKEY_FILE" ]; then
  [ -r "$AUTHKEY_FILE" ] || { err "cannot read --authkey-file: $AUTHKEY_FILE"; exit 2; }
  AUTHKEY="$(tr -d ' \t\r\n' < "$AUTHKEY_FILE")"
fi
if [ -n "$CONTROL_PUBKEY_FILE" ]; then
  [ -r "$CONTROL_PUBKEY_FILE" ] || { err "cannot read --control-pubkey-file: $CONTROL_PUBKEY_FILE"; exit 2; }
  CONTROL_PUBKEY="$(cat "$CONTROL_PUBKEY_FILE")"
fi

if [ -n "$AUTHKEY" ] && ! printf '%s' "$AUTHKEY" | grep -q '^tskey-auth-'; then
  warn "auth key does not start with tskey-auth-; tailscale up may fail"
fi

if [ -n "$CONTROL_PUBKEY" ]; then
  # Normalise to a single clean line: the value is later matched with
  # `grep -qxF` and appended verbatim to authorized_keys, so stray CRs,
  # comments or extra lines would corrupt the file or defeat the duplicate check.
  CONTROL_PUBKEY="$(pubkey_lines "$CONTROL_PUBKEY")"
  case "$CONTROL_PUBKEY" in
    *$'\n'*)
      err "control public key must be exactly one OpenSSH public key line"
      exit 2
      ;;
  esac
  # Accepted key types now also include the FIDO-backed sk-* variants.
  if ! printf '%s' "$CONTROL_PUBKEY" | grep -Eq '^(ssh-ed25519|ssh-rsa|ecdsa-sha2-nistp[0-9]+|sk-ssh-ed25519@openssh\.com|sk-ecdsa-sha2-nistp256@openssh\.com) '; then
    err "control public key does not look like an OpenSSH public key"
    exit 2
  fi
fi

log "Hermes FleetOps remote bootstrap v$VERSION mode=$MODE"
log "identity: user=$(whoami 2>/dev/null || true) uid=$(id -u 2>/dev/null || true) host=$(hostname 2>/dev/null || true)"

# os_release_field NAME
# Print the value of NAME from /etc/os-release, or nothing when the file is
# unreadable or does not define it. The file is sourced inside a subshell
# because it defines generic names (VERSION among them) that would otherwise
# overwrite this script's own globals. The three fields read here are unset
# first so a stray environment variable cannot stand in for a missing entry.
os_release_field(){
  (
    unset ID VERSION_CODENAME PRETTY_NAME
    if [ -r /etc/os-release ]; then
      # shellcheck disable=SC1091
      . /etc/os-release
    fi
    printf '%s' "${!1:-}"
  )
}

DISTRO="$(os_release_field ID)"
DISTRO="${DISTRO:-unknown}"
CODENAME="$(os_release_field VERSION_CODENAME)"
OS_PRETTY_NAME="$(os_release_field PRETTY_NAME)"
log "os: distro=$DISTRO codename=$CODENAME pretty=${OS_PRETTY_NAME:-unknown}"

# Any Ubuntu or Debian release with a codename maps onto the same URL scheme.
# Whether Tailscale actually publishes that codename is verified later by the
# dry-run probe or by the download itself.
case "$DISTRO" in
  ubuntu|debian)
    if [ -z "$CODENAME" ]; then
      err "unsupported or unrecognized distro/codename: $DISTRO/$CODENAME"
      exit 3
    fi
    REPO_BASE="https://pkgs.tailscale.com/stable/$DISTRO/$CODENAME"
    ;;
  *)
    err "unsupported or unrecognized distro/codename: $DISTRO/$CODENAME"
    exit 3
    ;;
esac

# have NAME — succeed when NAME resolves to a command; prints nothing.
have(){
  command -v "$1" >/dev/null 2>&1
}

# need NAME — like have, but reports a missing command through err and
# returns 1 so the caller (or `set -e`) can react.
need(){
  if have "$1"; then
    return 0
  fi
  err "missing command: $1"
  return 1
}

# Print a snapshot of bootstrap-relevant machine state on stdout: identity,
# Tailscale binary/version/status/IPv4 (passed through redact), tailscaled
# unit state, the diagnostics account, and which optional artefacts exist.
# Every probe is best-effort, so a missing tool or unit never aborts the
# caller and the function always returns 0.
print_status(){
  local host_name user_name user_id ts_path unit_query

  log "status summary"

  host_name="$(hostname 2>/dev/null || true)"
  printf 'hostname=%s\n' "$host_name"

  user_name="$(whoami 2>/dev/null || true)"
  user_id="$(id -u 2>/dev/null || true)"
  printf 'whoami=%s uid=%s\n' "$user_name" "$user_id"

  ts_path="$(command -v tailscale 2>/dev/null || true)"
  printf 'tailscale_bin=%s\n' "$ts_path"

  if have tailscale; then
    tailscale version 2>&1 | redact || true
    tailscale status 2>&1 | redact || true
    tailscale ip -4 2>&1 | redact || true
  fi

  for unit_query in is-active is-enabled; do
    systemctl "$unit_query" tailscaled 2>/dev/null || true
  done

  if [ -n "$DIAG_USER" ]; then
    id "$DIAG_USER" 2>/dev/null || true
  fi
  if [ -x "$REMOTE_DIR/diagnose.sh" ]; then
    printf 'root_diag=%s\n' "$REMOTE_DIR/diagnose.sh" || true
  fi
  if [ -f "$SUDOERS_PATH" ]; then
    printf 'sudoers=%s\n' "$SUDOERS_PATH" || true
  fi
  return 0
}

log "repo base: $REPO_BASE"

# Status mode neither downloads nor installs anything, so it is handled
# before the curl/apt-get checks; previously `--status` aborted on hosts that
# lacked those tools.
if [ "$MODE" = "status" ]; then
  print_status
  exit 0
fi

need sh
need curl
need apt-get

if [ "$MODE" = "dry-run" ]; then
  log "dry-run: checking Tailscale package endpoints"
  # HEAD-probe the two files the install step downloads. The timeouts keep a
  # black-holed network from hanging the dry run indefinitely.
  for endpoint in "$REPO_BASE.noarmor.gpg" "$REPO_BASE.tailscale-keyring.list"; do
    if ! curl -fsI --connect-timeout 10 --max-time 30 "$endpoint" >/dev/null; then
      err "cannot reach $endpoint"
      exit 4
    fi
    log "OK: $endpoint"
  done

  if have tailscale; then
    log "tailscale already present: $(command -v tailscale)"
    tailscale version 2>&1 | redact || true
  else
    log "tailscale currently not installed"
  fi

  # Only the presence (and length) of the secrets is reported, never the value.
  if [ -n "$AUTHKEY" ]; then
    log "authkey provided: yes length=${#AUTHKEY}"
  else
    log "authkey provided: no"
  fi
  if [ -n "$CONTROL_PUBKEY" ]; then
    log "control pubkey provided: yes"
  else
    log "control pubkey provided: no"
  fi

  log "dry-run complete; no system changes made"
  exit 0
fi

# ---- Install mode: everything below changes the system and needs root ----
if [ "$(id -u)" -ne 0 ]; then err "install mode must run as root; re-run with sudo"; exit 5; fi
# An auth key is only optional when `tailscale status` already succeeds.
if [ -z "$AUTHKEY" ] && ! tailscale status >/dev/null 2>&1; then err "--authkey is required when node is not already logged in"; exit 6; fi

# fetch_root_file URL DEST
# Download URL into a temp file beside DEST, force mode 0644, then rename it
# into place. A failed download therefore never leaves a truncated keyring or
# sources list, and the result stays readable for apt's unprivileged sandbox
# user even when root's umask is restrictive. Exits 4 on download failure.
fetch_root_file(){
  local url="$1" dest="$2" staged
  staged="$(mktemp "${dest}.XXXXXX")"
  if ! curl -fsSL "$url" -o "$staged"; then
    rm -f -- "$staged"
    err "download failed: $url"
    exit 4
  fi
  chmod 0644 "$staged"
  mv -f -- "$staged" "$dest"
}

log "installing Tailscale apt repository"
install -d -m 0755 /usr/share/keyrings
fetch_root_file "$REPO_BASE.noarmor.gpg" /usr/share/keyrings/tailscale-archive-keyring.gpg
fetch_root_file "$REPO_BASE.tailscale-keyring.list" /etc/apt/sources.list.d/tailscale.list
apt-get update
DEBIAN_FRONTEND=noninteractive apt-get install -y tailscale
systemctl enable --now tailscaled

# Build the `tailscale up` argument vector as an array so values containing
# spaces survive intact.
# NOTE(review): the auth key is passed on the command line and is therefore
# visible in the process list while `tailscale up` runs — consider whether the
# installed Tailscale version supports reading the key from a file instead.
UP_ARGS=(up "--accept-routes=$ACCEPT_ROUTES" "--ssh=$TS_SSH")
if [ -n "$AUTHKEY" ]; then UP_ARGS+=("--authkey=$AUTHKEY"); fi
if [ -n "$HOSTNAME_OVERRIDE" ]; then UP_ARGS+=("--hostname=$HOSTNAME_OVERRIDE"); fi
log "running tailscale up: authkey redacted, hostname=${HOSTNAME_OVERRIDE:-default}, ssh=$TS_SSH, accept-routes=$ACCEPT_ROUTES"
tailscale "${UP_ARGS[@]}"

# Create (if needed) the diagnostics account and authorize the control key.
if [ -n "$DIAG_USER" ]; then
  log "creating/updating diagnostics user: $DIAG_USER"
  if ! id "$DIAG_USER" >/dev/null 2>&1; then useradd -m -s /bin/bash "$DIAG_USER"; fi
  HOME_DIR="$(getent passwd "$DIAG_USER" | cut -d: -f6)"
  # An empty or "/" home would make the paths below resolve to /.ssh.
  if [ -z "$HOME_DIR" ] || [ "$HOME_DIR" = "/" ]; then
    err "diagnostics user $DIAG_USER has no usable home directory"
    exit 1
  fi
  # Use the account's real primary group; it is not always named after the
  # user (pre-existing accounts, or hosts that do not create per-user groups).
  DIAG_GROUP="$(id -gn "$DIAG_USER")"
  chmod go-w "$HOME_DIR" || true
  if [ -n "$CONTROL_PUBKEY" ]; then
    AUTH_KEYS="$HOME_DIR/.ssh/authorized_keys"
    # The home directory belongs to the diagnostics user. Refuse symlinks so
    # root's touch/chown/chmod cannot be redirected at an arbitrary file.
    if [ -L "$HOME_DIR/.ssh" ] || [ -L "$AUTH_KEYS" ]; then
      err "refusing to follow symlink at $HOME_DIR/.ssh or $AUTH_KEYS"
      exit 1
    fi
    install -d -m 0700 -o "$DIAG_USER" -g "$DIAG_GROUP" "$HOME_DIR/.ssh"
    touch "$AUTH_KEYS"
    chown -h "$DIAG_USER:$DIAG_GROUP" "$AUTH_KEYS"
    chmod 0600 "$AUTH_KEYS"
    # Append only when this exact line is not already present.
    if ! grep -qxF -- "$CONTROL_PUBKEY" "$AUTH_KEYS"; then
      # If the file does not end in a newline, add one first so the new key
      # does not get glued onto the last existing entry.
      if [ -s "$AUTH_KEYS" ] && [ -n "$(tail -c 1 "$AUTH_KEYS")" ]; then
        printf '\n' >> "$AUTH_KEYS"
      fi
      printf '%s\n' "$CONTROL_PUBKEY" >> "$AUTH_KEYS"
    fi
  fi
fi

# Install the root-mode collector at $REMOTE_DIR/diagnose.sh. The generated
# script is written verbatim (quoted here-document delimiter).
#
# Hardening of the generated script compared with the earlier version:
#   - the work directory and archive come from mktemp instead of a predictable
#     /tmp name reused through `mkdir -p`, so another local user cannot
#     pre-create the directory or plant symlinks for root to write through;
#   - umask 077 keeps the collected root-only logs private;
#   - when run through sudo, the finished archive is handed to the invoking
#     user (SUDO_UID/SUDO_GID) so it can still be fetched without root.
# The archive path is still printed on stdout as the last line.
if [ "$INSTALL_DIAG" = "true" ]; then
  log "installing diagnostics under $REMOTE_DIR"
  install -d -m 0755 "$REMOTE_DIR"
  cat > "$REMOTE_DIR/diagnose.sh" <<'DIAG'
#!/usr/bin/env bash
set -euo pipefail
umask 077
TS=$(date -u +%Y%m%dT%H%M%SZ)
HOST=$(hostname 2>/dev/null || echo unknown)
BASE="$(mktemp -d "/tmp/hermes-root-diagnose-${HOST}-${TS}.XXXXXX")"
run(){ name="$1"; shift; { echo "### $name"; echo "### command: $*"; "$@"; } >"$BASE/$name.txt" 2>&1 || true; }
run hostname hostname
run date date -Is
run uname uname -a
run os_release bash -lc 'cat /etc/os-release 2>/dev/null || true'
run uptime uptime
run who who
run users users
run last_reboot last reboot
run df df -hT
run free free -h
run lsblk lsblk -a
run mount mount
run ip_addr ip addr
run ip_route ip route
run ss_listen ss -lntup
run systemctl_failed systemctl --failed
run journal_boot journalctl -b --no-pager
run journal_prev journalctl -b -1 --no-pager
run journal_errors journalctl --since '14 days ago' -p warning..alert --no-pager
run dmesg dmesg -T
run pstore bash -lc 'ls -la /sys/fs/pstore 2>/dev/null; for f in /sys/fs/pstore/*; do [ -f "$f" ] && echo "--- $f" && head -300 "$f"; done || true'
run tailscale_status bash -lc 'command -v tailscale >/dev/null && tailscale status || echo tailscale_not_installed'
run tailscale_netcheck bash -lc 'command -v tailscale >/dev/null && tailscale netcheck || true'
run smartctl_scan bash -lc 'command -v smartctl >/dev/null && smartctl --scan || echo smartctl_missing'
run smartctl_all bash -lc 'if command -v smartctl >/dev/null; then smartctl --scan | awk "{print \$1}" | while read -r d; do echo "--- $d"; smartctl -a "$d"; done; fi'
run sensors bash -lc 'command -v sensors >/dev/null && sensors || echo sensors_missing'
OUT="$(mktemp --suffix=.tar.gz "/tmp/${BASE##*/}.XXXXXX")"
tar -C "$(dirname "$BASE")" -czf "$OUT" "$(basename "$BASE")"
if [ -n "${SUDO_UID:-}" ]; then chown "${SUDO_UID}:${SUDO_GID:-$SUDO_UID}" "$OUT"; fi
echo "$OUT"
DIAG
  chmod 0755 "$REMOTE_DIR/diagnose.sh"
fi

# Install the unprivileged collector at ~DIAG_USER/hermes-remote/diagnose-user.sh.
# The generated script is written verbatim (quoted here-document delimiter),
# stores its output under $HOME/hermes-remote/cases and prints the archive path.
if [ -n "$DIAG_USER" ] && [ "$INSTALL_USER_DIAG" = "true" ]; then
  HOME_DIR="$(getent passwd "$DIAG_USER" | cut -d: -f6)"
  # An empty or "/" home would make the install land in /hermes-remote.
  if [ -z "$HOME_DIR" ] || [ "$HOME_DIR" = "/" ]; then
    err "diagnostics user $DIAG_USER has no usable home directory"
    exit 1
  fi
  # Use the account's real primary group; it is not always named after the user.
  DIAG_GROUP="$(id -gn "$DIAG_USER")"
  USER_DIAG_DIR="$HOME_DIR/hermes-remote"
  USER_DIAG_SCRIPT="$USER_DIAG_DIR/diagnose-user.sh"
  # These paths sit inside a directory the diagnostics user controls. Refuse
  # symlinks so root's write and chown cannot be redirected at another file.
  if [ -L "$USER_DIAG_DIR" ] || [ -L "$USER_DIAG_SCRIPT" ]; then
    err "refusing to follow symlink at $USER_DIAG_DIR or $USER_DIAG_SCRIPT"
    exit 1
  fi
  log "installing user-mode diagnostics for $DIAG_USER"
  install -d -m 0755 -o "$DIAG_USER" -g "$DIAG_GROUP" "$USER_DIAG_DIR"
  cat > "$USER_DIAG_SCRIPT" <<'UDIAG'
#!/usr/bin/env bash
set -euo pipefail
TS=$(date -u +%Y%m%dT%H%M%SZ)
HOST=$(hostname 2>/dev/null || echo unknown)
BASE="$HOME/hermes-remote/cases/hermes-user-diagnose-${HOST}-${TS}"
mkdir -p "$BASE"
run(){ name="$1"; shift; { echo "### $name"; echo "### command: $*"; "$@"; } >"$BASE/$name.txt" 2>&1 || true; }
run hostname hostname
run date date -Is
run uname uname -a
run os_release bash -lc 'cat /etc/os-release 2>/dev/null || true'
run uptime uptime
run id id
run df df -hT
run free free -h
run ip_addr ip addr
run ip_route ip route
run ss_listen ss -lntup
run systemctl_failed systemctl --failed
run journal_user journalctl --user --since '14 days ago' --no-pager
run journal_errors journalctl --since '14 days ago' -p warning..alert --no-pager
run tailscale_status bash -lc 'command -v tailscale >/dev/null && tailscale status || echo tailscale_not_installed'
run tailscale_netcheck bash -lc 'command -v tailscale >/dev/null && tailscale netcheck || true'
OUT="$HOME/hermes-remote/cases/${BASE##*/}.tar.gz"
tar -C "$(dirname "$BASE")" -czf "$OUT" "$(basename "$BASE")"
echo "$OUT"
UDIAG
  chown -h "$DIAG_USER:$DIAG_GROUP" "$USER_DIAG_SCRIPT"
  chmod 0755 "$USER_DIAG_SCRIPT"
fi

# Optionally let the diagnostics user run exactly $REMOTE_DIR/diagnose.sh as
# root without a password.
if [ -n "$DIAG_USER" ] && [ "$INSTALL_SUDOERS" = "true" ]; then
  log "installing narrow sudoers rule for $DIAG_USER"

  # Both values are interpolated unescaped into sudoers syntax. Whitespace,
  # commas and similar characters would still parse but would change which
  # user or command the rule grants, so only plain names and paths are accepted.
  case "$DIAG_USER" in
    *[!A-Za-z0-9._-]*)
      err "sudoers validation failed; unsupported characters in diag user: $DIAG_USER"
      exit 7
      ;;
  esac
  case "$REMOTE_DIR" in
    /*) ;;
    *)
      err "sudoers validation failed; remote dir must be an absolute path: $REMOTE_DIR"
      exit 7
      ;;
  esac
  case "$REMOTE_DIR" in
    *[!A-Za-z0-9._/-]*)
      err "sudoers validation failed; unsupported characters in remote dir: $REMOTE_DIR"
      exit 7
      ;;
  esac

  # Stage the rule in a temp file, validate it, and only then rename it into
  # place. The live path never holds an unchecked rule, and a failed
  # validation leaves any previously installed rule untouched. The staging
  # name contains dots, which sudoers(5) documents as skipped by include
  # directories.
  SUDOERS_STAGED="$(mktemp "${SUDOERS_PATH}.tmp.XXXXXX")"
  printf '%s ALL=(root) NOPASSWD: %s/diagnose.sh\n' "$DIAG_USER" "$REMOTE_DIR" > "$SUDOERS_STAGED"
  chmod 0440 "$SUDOERS_STAGED"
  if ! visudo -cf "$SUDOERS_STAGED" >/dev/null; then
    rm -f -- "$SUDOERS_STAGED"
    err "sudoers validation failed; $SUDOERS_PATH left unchanged"
    exit 7
  fi
  mv -f -- "$SUDOERS_STAGED" "$SUDOERS_PATH"
fi

# Closing report: print the resulting machine state, passing the whole
# summary through redact once more before it reaches the terminal or a log.
log "final status"
print_status \
  | redact
log "install complete"
