From 280744ce0c482f87746cb149fc427410c18750ac Mon Sep 17 00:00:00 2001 From: joshp123 Date: Fri, 3 Apr 2026 15:38:57 +0200 Subject: [PATCH] infra: slim clawdinators aws footprint What: - bound CLAWDINATOR image artifact retention with S3 lifecycle, AMI pruning, and import provenance tags - reduce the AWS fleet to Babelfish-only and make GitHub credentials opt-in per host - disable the AMI build, nix-openclaw bump, and release workflows by moving them out of .github/workflows/ - update operator docs for the new explicit build and deploy model Why: - stop unbounded S3 and snapshot growth from image builds - remove unattended resurrection paths and shut down the unused t3.large instances - keep the remaining Babelfish host running without GitHub App credentials or sync timers Tests: - `nix shell nixpkgs#shellcheck nixpkgs#shfmt -c bash scripts/lint-shell.sh` (pass) - `nix build .#nixosConfigurations.clawdinator-babelfish.config.system.build.toplevel .#nixosConfigurations.clawdinator-1.config.system.build.toplevel .#nixosConfigurations.clawdinator-2.config.system.build.toplevel` (pass) - `AWS_PROFILE=homelab-admin TF_VAR_aws_region=eu-central-1 TF_VAR_ami_id=ami-0a9abe17feeee0079 TF_VAR_ssh_public_key="$(cat ~/.ssh/id_ed25519.pub)" nix shell nixpkgs#opentofu -c sh -lc 'tofu fmt -check && tofu validate'` (pass) - live AWS apply: destroyed `clawdinator-1` and `clawdinator-2`, replaced Babelfish, and verified only `Fleet Deploy` remains active in GitHub Actions --- .github/workflows-disabled/README.md | 7 + .../bump-nix-clawdbot.yml.disabled} | 0 .../image-build.yml.disabled} | 9 + .../release.yml.disabled} | 0 AGENTS.md | 20 +- docs/CONTROL_PLANE.md | 3 + docs/DEPLOYMENT_MODEL.md | 8 +- infra/opentofu/aws/README.md | 8 + infra/opentofu/aws/main.tf | 31 ++- nix/hosts/clawdinator-1.nix | 3 + nix/hosts/clawdinator-2.nix | 3 + nix/hosts/clawdinator-babelfish.nix | 5 +- nix/hosts/clawdinator-common.nix | 41 ++-- nix/instances.json | 12 - nix/modules/clawdinator.nix | 4 +- 
scripts/import-image.sh | 15 +- scripts/prune-clawdinator-ami-history.sh | 230 ++++++++++++++++++ scripts/upload-bootstrap-all.sh | 5 +- 18 files changed, 345 insertions(+), 59 deletions(-) create mode 100644 .github/workflows-disabled/README.md rename .github/{workflows/bump-nix-clawdbot.yml => workflows-disabled/bump-nix-clawdbot.yml.disabled} (100%) rename .github/{workflows/image-build.yml => workflows-disabled/image-build.yml.disabled} (92%) rename .github/{workflows/release.yml => workflows-disabled/release.yml.disabled} (100%) create mode 100644 scripts/prune-clawdinator-ami-history.sh diff --git a/.github/workflows-disabled/README.md b/.github/workflows-disabled/README.md new file mode 100644 index 0000000..65b6d57 --- /dev/null +++ b/.github/workflows-disabled/README.md @@ -0,0 +1,7 @@ +Disabled GitHub Actions live here on purpose. + +Moving a file out of `.github/workflows/` fully disables it: no schedule, no manual dispatch button, no runnable workflow at all. + +The disabled set currently includes the old AMI build, flake bump, and push-triggered release/deploy workflows. + +To reactivate one of these workflows, move it back into `.github/workflows/` in a code change and review whether that would recreate infrastructure or resume unattended mutation. 
diff --git a/.github/workflows/bump-nix-clawdbot.yml b/.github/workflows-disabled/bump-nix-clawdbot.yml.disabled similarity index 100% rename from .github/workflows/bump-nix-clawdbot.yml rename to .github/workflows-disabled/bump-nix-clawdbot.yml.disabled diff --git a/.github/workflows/image-build.yml b/.github/workflows-disabled/image-build.yml.disabled similarity index 92% rename from .github/workflows/image-build.yml rename to .github/workflows-disabled/image-build.yml.disabled index 15cc68d..c4b87e5 100644 --- a/.github/workflows/image-build.yml +++ b/.github/workflows-disabled/image-build.yml.disabled @@ -124,3 +124,12 @@ jobs: run: | ami_id="$(scripts/import-image.sh)" echo "AMI_ID=${ami_id}" >> "${GITHUB_ENV}" + + - name: Prune old AMIs + env: + AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }} + AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }} + AWS_REGION: ${{ secrets.AWS_REGION }} + APPLY: "true" + run: | + bash scripts/prune-clawdinator-ami-history.sh diff --git a/.github/workflows/release.yml b/.github/workflows-disabled/release.yml.disabled similarity index 100% rename from .github/workflows/release.yml rename to .github/workflows-disabled/release.yml.disabled diff --git a/AGENTS.md b/AGENTS.md index 347190b..d79d831 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -62,7 +62,8 @@ Deploy flow (automation-first): - Use `devenv.nix` for tooling (nixos-generators, awscli2). - Build a bootstrap NixOS image with nixos-generators (raw) and upload it to S3. - Use `nix/hosts/clawdinator-1-image.nix` for image builds. -- CI is preferred: `.github/workflows/image-build.yml` runs build → S3 upload → AMI import. +- The old CI AMI/update/release workflows are intentionally disabled under `.github/workflows-disabled/`; AMI builds and deploys now require an explicit code change or a local operator run. 
+- Image history is bounded on purpose: raw `clawdinator-nixos-*` uploads expire automatically, and old CLAWDINATOR AMIs/snapshots are pruned after successful builds while keeping the live fleet AMI plus a short rollback window. - Resume AMI pipeline work immediately if it stalls; do not use rsync as a workaround. Host edits are allowed but must be committed and baked into a new AMI to persist. - CI must provide `CLAWDINATOR_AGE_KEY` to build + upload the runtime bootstrap bundle to S3. - Bootstrap bundle location: `s3://${S3_BUCKET}/bootstrap//` (secrets + repo seeds). @@ -74,7 +75,7 @@ Deploy flow (automation-first): - Update `nix/hosts/.nix` (Discord allowlist, GitHub App installationId, identity name). - Discord must use `messages.queue.byChannel.discord = "interrupt"`; `queue` delays replies to heartbeat and makes the bot appear dead. - Ensure `/var/lib/clawd/repos/clawdinators` contains this repo (self-update requires it). -- Verify systemd services: `clawdinator`, `clawdinator-github-app-token`, `clawdinator-self-update`. +- Verify systemd services: `clawdinator`; `clawdinator-github-app-token` only on hosts that explicitly enable GitHub App auth. - Commit and push changes; repo is the source of truth. Bootstrap (local): @@ -102,19 +103,16 @@ End-to-end SDLC (local → AMI → host) **(verified)**: - `RULES=./secrets.nix agenix -d homelab-admin.age -i ~/.ssh/id_ed25519 > /tmp/homelab-admin.env` - `set -a; source /tmp/homelab-admin.env; set +a` - Cleanup: `trash /tmp/homelab-admin.env` -2) Push to `main` to trigger AMI build (`.github/workflows/image-build.yml`). -3) Watch CI: - - `gh run list -R openclaw/clawdinators --limit 5` - - `gh run view --log | grep AMI_ID` -4) Redeploy from the new AMI (instance replacement): +2) Build/import a new AMI explicitly. The old GitHub Actions build/deploy paths are disabled under `.github/workflows-disabled/`. 
+3) Redeploy from the new AMI (instance replacement): - `devenv shell -- bash -lc "cd infra/opentofu/aws && TF_VAR_ami_id= TF_VAR_ssh_public_key=\"$(cat ~/.ssh/id_ed25519.pub)\" TF_VAR_aws_region=eu-central-1 tofu apply -auto-approve"` -5) New IP: +4) New IP: - `tofu output -json instance_public_ips | jq -r '."clawdinator-1"'` - `ssh -o StrictHostKeyChecking=accept-new root@` -6) Post-deploy sanity: +5) Post-deploy sanity: - `systemctl is-active clawdinator` - - `systemctl is-active clawdinator-github-app-token.timer` - - `GH_CONFIG_DIR=/var/lib/clawd/gh gh auth status -h github.com` + - `systemctl is-active clawdinator-github-app-token.timer` only if the target host explicitly enables `githubApp` + - `GH_CONFIG_DIR=/var/lib/clawd/gh gh auth status -h github.com` only if the target host explicitly enables GitHub auth Important: - Repo/workspace on host is seeded from the **AMI snapshot**. `git pull` is ephemeral; rebuild AMI for persistent changes. diff --git a/docs/CONTROL_PLANE.md b/docs/CONTROL_PLANE.md index d99e3f7..4bb3b65 100644 --- a/docs/CONTROL_PLANE.md +++ b/docs/CONTROL_PLANE.md @@ -105,6 +105,7 @@ Example: ### Roll the fleet - `/fleet deploy all` replaces every host with latest AMI. +- Old AMI history is intentionally bounded. Normal operations keep the currently used fleet AMI plus a small recent rollback window; deeper rollback requires an explicit preserved AMI id. ## Self‑Recycle (Out‑of‑Band) - Agents call the Control API (no AWS creds) via the fleet-control skill. @@ -118,6 +119,7 @@ Example: ## AMI Selection (KISS) - Use latest AMI tagged `clawdinator=true`. - Optional override via workflow input `ami_override` for rollback. +- Automatic retention keeps the newest few tagged AMIs plus any AMI still backing a live CLAWDINATOR instance. ## Deploy Execution (Workflow) - Single workflow `fleet-deploy.yml`. @@ -157,6 +159,7 @@ Example: - `/fleet deploy clawdinator-2` → bring up new host. - `/fleet deploy all` → roll the fleet to latest AMI. 
- If rollback needed: rerun deploy with `ami_override` (exact AMI id). +- If the exact rollback AMI is older than the bounded retention window, preserve it intentionally before relying on it. ## Implementation Checklist (From Design → Works) 1) Add `nix/instances.json` (clawdinator‑1 + clawdinator‑2). diff --git a/docs/DEPLOYMENT_MODEL.md b/docs/DEPLOYMENT_MODEL.md index 88a1149..45f8e21 100644 --- a/docs/DEPLOYMENT_MODEL.md +++ b/docs/DEPLOYMENT_MODEL.md @@ -4,12 +4,12 @@ This repo uses a **two-lane** delivery model: - **Lane A: Base AMI** (slow path, rare) - Purpose: reliable boot substrate (Nix + systemd + networking + EFS + SSM + bootstrap services). - - Built by: `.github/workflows/image-build.yml` (manual or scheduled). + - Built by: explicit operator flow. The old `.github/workflows/image-build.yml` workflow is intentionally disabled under `.github/workflows-disabled/`. - Tradeoff: EC2 VM Import is slow/variable; do not run per-commit. -- **Lane B: Release + Fleet switch** (fast path, every merge) +- **Lane B: Release + Fleet switch** (fast path, manual) - Purpose: ship config/app changes quickly while staying reproducible. - - Built by: `.github/workflows/release.yml`. + - Built by: explicit operator flow. The old `.github/workflows/release.yml` workflow is intentionally disabled under `.github/workflows-disabled/`. - Steps: 1) **Fail-fast eval** of NixOS configs. 2) Upload **bootstrap bundles** to S3 (repo seeds, workspace, secrets references). @@ -38,7 +38,7 @@ This repo uses a **two-lane** delivery model: ## Infra requirement: CI SSM permissions -`release.yml` uses `aws ssm send-command`. +The old `release.yml` workflow used `aws ssm send-command`; that path is intentionally disabled now. After pulling these changes, run `tofu apply` in `infra/opentofu/aws` (with admin creds) so the CI IAM policy includes the `FleetDeploySSM` statement. 
diff --git a/infra/opentofu/aws/README.md b/infra/opentofu/aws/README.md index f96a161..2169889 100644 --- a/infra/opentofu/aws/README.md +++ b/infra/opentofu/aws/README.md @@ -2,6 +2,8 @@ Goal: manage the CLAWDINATOR fleet infrastructure (S3 image bucket, VM import role, EFS, EC2 instances, and control-plane Lambda). +The shared image bucket is not image-only. It also stores bootstrap bundles, age-encrypted secrets, and Terraform remote state. Raw image uploads therefore use a prefix-scoped lifecycle rule: only top-level `clawdinator-nixos-*` objects expire automatically. Bootstrap, secrets, and state are intentionally retained. + ## Prereqs - AWS credentials with permissions to manage IAM (use your homelab-admin key locally). - Fleet registry: `nix/instances.json` (authoritative instance list). @@ -73,3 +75,9 @@ export TF_VAR_github_token=... ## Runtime bootstrap - Instances get an IAM role with read access to `s3://${S3_BUCKET}/bootstrap/*` for secrets + repo seeds. + +## Retention contract +- Raw image uploads whose keys start with `clawdinator-nixos-` expire automatically after 14 days. +- Because bucket versioning is enabled, noncurrent raw-image versions are also expired so the bytes actually disappear. +- The CI IAM user can prune old CLAWDINATOR AMIs and their backing snapshots. +- Normal deploys still use the latest self-owned AMI tagged `clawdinator=true`. 
diff --git a/infra/opentofu/aws/main.tf b/infra/opentofu/aws/main.tf index bdfbd83..6daca36 100644 --- a/infra/opentofu/aws/main.tf +++ b/infra/opentofu/aws/main.tf @@ -49,6 +49,33 @@ resource "aws_s3_bucket_versioning" "image_bucket" { } } +resource "aws_s3_bucket_lifecycle_configuration" "image_bucket" { + bucket = aws_s3_bucket.image_bucket.id + + rule { + id = "expire-clawdinator-raw-images" + status = "Enabled" + + filter { + prefix = "clawdinator-nixos-" + } + + expiration { + days = 14 + } + + # Versioning is enabled on the shared bucket, so expiring the current object + # alone would leave the bytes behind as noncurrent versions. + noncurrent_version_expiration { + noncurrent_days = 1 + } + + abort_incomplete_multipart_upload { + days_after_initiation = 1 + } + } +} + resource "aws_dynamodb_table" "terraform_lock" { name = var.terraform_lock_table_name billing_mode = "PAY_PER_REQUEST" @@ -187,7 +214,9 @@ data "aws_iam_policy_document" "ami_importer" { "ec2:DescribeImages", "ec2:DescribeSnapshots", "ec2:RegisterImage", - "ec2:CreateTags" + "ec2:CreateTags", + "ec2:DeregisterImage", + "ec2:DeleteSnapshot" ] resources = ["*"] } diff --git a/nix/hosts/clawdinator-1.nix b/nix/hosts/clawdinator-1.nix index 227908e..1277255 100644 --- a/nix/hosts/clawdinator-1.nix +++ b/nix/hosts/clawdinator-1.nix @@ -21,6 +21,9 @@ networking.firewall.allowedTCPPorts = [ 22 ]; + clawdinator.bootstrapPrefix = "bootstrap/clawdinator-1"; + clawdinator.discordTokenSecret = "clawdinator-discord-token-1"; + # Publish PR intent artifacts from EFS to the public bucket. # (Timer + oneshot service; safe to run without stopping the gateway.) 
services.clawdinator.publicS3 = { diff --git a/nix/hosts/clawdinator-2.nix b/nix/hosts/clawdinator-2.nix index 0ee22c6..d7f2517 100644 --- a/nix/hosts/clawdinator-2.nix +++ b/nix/hosts/clawdinator-2.nix @@ -21,6 +21,9 @@ networking.firewall.allowedTCPPorts = [ 22 ]; + clawdinator.bootstrapPrefix = "bootstrap/clawdinator-2"; + clawdinator.discordTokenSecret = "clawdinator-discord-token-2"; + # Discord-only instance: disable Telegram. services.clawdinator.config.plugins.entries.telegram.enabled = false; services.clawdinator.config.channels.telegram.enabled = false; diff --git a/nix/hosts/clawdinator-babelfish.nix b/nix/hosts/clawdinator-babelfish.nix index b7e635b..188c6dd 100644 --- a/nix/hosts/clawdinator-babelfish.nix +++ b/nix/hosts/clawdinator-babelfish.nix @@ -21,8 +21,11 @@ networking.firewall.allowedTCPPorts = [ 22 ]; + clawdinator.bootstrapPrefix = "bootstrap/clawdinator-babelfish"; + clawdinator.discordTokenSecret = "clawdinator-discord-token-babelfish"; + services.clawdinator = { - githubApp.enable = lib.mkForce true; + githubApp.enable = lib.mkForce false; githubSync.enable = lib.mkForce false; cronJobsFile = lib.mkForce null; diff --git a/nix/hosts/clawdinator-common.nix b/nix/hosts/clawdinator-common.nix index 93c8057..be6ac34 100644 --- a/nix/hosts/clawdinator-common.nix +++ b/nix/hosts/clawdinator-common.nix @@ -2,14 +2,9 @@ let cfg = config.services.clawdinator; secretsPath = config.clawdinator.secretsPath; - instancesFile = ../instances.json; - instances = builtins.fromJSON (builtins.readFile instancesFile); hostName = config.networking.hostName; - instance = - if builtins.hasAttr hostName instances - then instances.${hostName} - else throw "clawdinator: missing instance ${hostName} in ${instancesFile}"; - discordTokenSecret = instance.discordTokenSecret; + bootstrapPrefix = config.clawdinator.bootstrapPrefix; + discordTokenSecret = config.clawdinator.discordTokenSecret; repoSeedsFile = ../../clawdinator/repos.tsv; repoSeedLines = lib.filter @@ 
-34,17 +29,22 @@ in description = "Path to encrypted age secrets for CLAWDINATOR."; }; + options.clawdinator.bootstrapPrefix = lib.mkOption { + type = lib.types.str; + description = "Bootstrap S3 prefix for this host."; + }; + + options.clawdinator.discordTokenSecret = lib.mkOption { + type = lib.types.str; + description = "Encrypted Discord token secret name for this host."; + }; + config = { clawdinator.secretsPath = "/var/lib/clawd/nix-secrets"; swapDevices = [ { device = "/swapfile"; size = 8192; } ]; age.identityPaths = [ "/etc/agenix/keys/clawdinator.agekey" ]; - age.secrets."clawdinator-github-app.pem" = { - file = "${secretsPath}/clawdinator-github-app.pem.age"; - owner = "clawdinator"; - group = "clawdinator"; - }; age.secrets."clawdinator-anthropic-api-key" = { file = "${secretsPath}/clawdinator-anthropic-api-key.age"; owner = "clawdinator"; @@ -97,7 +97,7 @@ in bootstrap = { enable = true; s3Bucket = "clawdinator-images-eu1-20260107165216"; - s3Prefix = instance.bootstrapPrefix; + s3Prefix = bootstrapPrefix; region = "eu-central-1"; secretsDir = "/var/lib/clawd/nix-secrets"; repoSeedsDir = "/var/lib/clawd/repo-seeds"; @@ -205,21 +205,10 @@ in discordTokenFile = "/run/agenix/${discordTokenSecret}"; telegramAllowFromFile = "/run/agenix/clawdinator-telegram-allow-from"; - githubApp = { - enable = true; - appId = "2607181"; - installationId = "102951645"; - privateKeyFile = "/run/agenix/clawdinator-github-app.pem"; - schedule = "*:0/45"; # every 45 min — tokens expire after 1h - }; - - # We deploy via CI (release.yml) pinned to a git SHA; avoid host-local - # `nix flake update` drift. + # Hosts do not self-mutate. Replacements and switches are explicit operator + # actions, which avoids host-local `nix flake update` drift. 
selfUpdate.enable = false; - githubSync.enable = true; - githubSync.org = "openclaw"; - cronJobsFile = ../../clawdinator/cron-jobs.json; }; }; diff --git a/nix/instances.json b/nix/instances.json index 1a6c168..6e80382 100644 --- a/nix/instances.json +++ b/nix/instances.json @@ -1,16 +1,4 @@ { - "clawdinator-1": { - "host": "clawdinator-1", - "instanceType": "t3.large", - "bootstrapPrefix": "bootstrap/clawdinator-1", - "discordTokenSecret": "clawdinator-discord-token-1" - }, - "clawdinator-2": { - "host": "clawdinator-2", - "instanceType": "t3.large", - "bootstrapPrefix": "bootstrap/clawdinator-2", - "discordTokenSecret": "clawdinator-discord-token-2" - }, "clawdinator-babelfish": { "host": "clawdinator-babelfish", "instanceType": "t3.small", diff --git a/nix/modules/clawdinator.nix b/nix/modules/clawdinator.nix index 922fcdb..3c2c661 100644 --- a/nix/modules/clawdinator.nix +++ b/nix/modules/clawdinator.nix @@ -514,8 +514,8 @@ in message = "services.clawdinator requires nix-openclaw overlay (pkgs.openclaw-gateway)."; } { - assertion = cfg.githubApp.enable || cfg.githubPatFile != null; - message = "services.clawdinator requires a GitHub token (enable githubApp or set githubPatFile)."; + assertion = (!cfg.githubSync.enable) || cfg.githubApp.enable || cfg.githubPatFile != null; + message = "services.clawdinator.githubSync requires GitHub auth (enable githubApp or set githubPatFile)."; } { assertion = (!cfg.githubApp.enable) || (cfg.githubApp.appId != "" && cfg.githubApp.installationId != ""); diff --git a/scripts/import-image.sh b/scripts/import-image.sh index 7f64a8c..b4bf34e 100755 --- a/scripts/import-image.sh +++ b/scripts/import-image.sh @@ -87,7 +87,20 @@ for _ in {1..120}; do aws ec2 create-tags \ --region "${region}" \ --resources "${image_id}" \ - --tags "Key=Name,Value=${ami_name}" "Key=clawdinator,Value=true" + --tags \ + "Key=Name,Value=${ami_name}" \ + "Key=clawdinator,Value=true" \ + "Key=artifact-kind,Value=ami" \ + "Key=source-s3-key,Value=${key}" + + 
aws ec2 create-tags \ + --region "${region}" \ + --resources "${snapshot_id}" \ + --tags \ + "Key=Name,Value=${ami_name}-root-snapshot" \ + "Key=clawdinator,Value=true" \ + "Key=artifact-kind,Value=ami-root-snapshot" \ + "Key=source-s3-key,Value=${key}" echo "AMI_ID=${image_id}" >&2 echo "${image_id}" exit 0
diff --git a/scripts/prune-clawdinator-ami-history.sh b/scripts/prune-clawdinator-ami-history.sh
new file mode 100644
index 0000000..ef31cf6
--- /dev/null
+++ b/scripts/prune-clawdinator-ami-history.sh
@@ -0,0 +1,267 @@
+#!/usr/bin/env bash
+#
+# Prune old CLAWDINATOR AMIs and the root snapshots behind them.
+#
+# Keep-set: the KEEP_COUNT newest self-owned AMIs tagged clawdinator=true, plus
+# every AMI backing a pending, running, stopping, or stopped instance tagged
+# app=clawdinator. All other tagged AMIs form the prune-set.
+#
+# Environment:
+#   AWS_REGION  required; region to inspect and prune.
+#   KEEP_COUNT  optional; newest AMIs to keep (positive integer, default 6).
+#   APPLY       optional; "true" deregisters/deletes, anything else only
+#               prints the plan (default).
+#
+# Requires the aws CLI and jq on PATH.
+set -euo pipefail
+
+region="${AWS_REGION:?AWS_REGION required}"
+keep_count="${KEEP_COUNT:-6}"
+apply="${APPLY:-false}"
+
+if ! [[ "${keep_count}" =~ ^[0-9]+$ ]] || [ "${keep_count}" -lt 1 ]; then
+  echo "KEEP_COUNT must be a positive integer." >&2
+  exit 1
+fi
+
+# Deregister AMI $1. An AMI that is already gone counts as success so a re-run
+# after a partial prune can finish; any other failure is printed and returns 1.
+aws_deregister_image() {
+  local image_id="$1"
+  local output
+
+  if ! output="$(
+    aws ec2 deregister-image \
+      --region "${region}" \
+      --image-id "${image_id}" \
+      2>&1
+  )"; then
+    if [[ "${output}" == *"InvalidAMIID.NotFound"* ]] || [[ "${output}" == *"InvalidAMIID.Unavailable"* ]]; then
+      echo "AMI already gone: ${image_id}" >&2
+      return 0
+    fi
+    echo "${output}" >&2
+    return 1
+  fi
+}
+
+# Delete snapshot $1 (no-op for an empty id). A snapshot that no longer exists
+# counts as success; any other failure is printed and returns 1.
+aws_delete_snapshot() {
+  local snapshot_id="$1"
+  local output
+
+  if [ -z "${snapshot_id}" ]; then
+    return 0
+  fi
+
+  if ! output="$(
+    aws ec2 delete-snapshot \
+      --region "${region}" \
+      --snapshot-id "${snapshot_id}" \
+      2>&1
+  )"; then
+    if [[ "${output}" == *"InvalidSnapshot.NotFound"* ]]; then
+      echo "Snapshot already gone: ${snapshot_id}" >&2
+      return 0
+    fi
+    echo "${output}" >&2
+    return 1
+  fi
+}
+
+# Return 0 when $1 equals one of the remaining arguments, 1 otherwise.
+array_contains() {
+  local needle="$1"
+  shift
+  local item
+
+  for item in "$@"; do
+    if [ "${item}" = "${needle}" ]; then
+      return 0
+    fi
+  done
+
+  return 1
+}
+
+# Print the image_rows entry whose first field is AMI id $1; return 1 if none.
+find_image_row() {
+  local needle="$1"
+  local row
+  local image_id _rest
+
+  for row in "${image_rows[@]}"; do
+    IFS=$'\t' read -r image_id _rest <<< "${row}"
+    if [ "${image_id}" = "${needle}" ]; then
+      printf '%s\n' "${row}"
+      return 0
+    fi
+  done
+
+  return 1
+}
+
+# Look this up in a plain command substitution so that set -e aborts the run
+# when it fails. Inside "< <(...)" a failed describe-instances is ignored and
+# reads as "nothing in use", which could put a live instance's AMI in the
+# prune-set.
+in_use_raw="$(
+  aws ec2 describe-instances \
+    --region "${region}" \
+    --filters \
+    "Name=tag:app,Values=clawdinator" \
+    "Name=instance-state-name,Values=pending,running,stopping,stopped" \
+    --query 'Reservations[].Instances[].ImageId' \
+    --output text
+)"
+
+in_use_ami_ids=()
+while IFS= read -r image_id; do
+  if [ -n "${image_id}" ]; then
+    in_use_ami_ids+=("${image_id}")
+  fi
+done < <(
+  printf '%s\n' "${in_use_raw}" |
+    tr '\t' '\n' |
+    sed '/^None$/d;/^$/d' |
+    sort -u
+)
+
+images_json="$(
+  aws ec2 describe-images \
+    --region "${region}" \
+    --owners self \
+    --filters "Name=tag:clawdinator,Values=true" \
+    --output json
+)"
+
+# One TSV row per AMI, newest first: id, name, creation date, root snapshot id.
+# An empty name is replaced with a placeholder because tab is IFS whitespace:
+# "read" collapses consecutive tabs, so an empty middle field would shift the
+# date and snapshot id into the wrong variables. Captured with a command
+# substitution so a jq failure aborts the run instead of reading as "no AMIs".
+image_rows_tsv="$(
+  printf '%s\n' "${images_json}" | jq -r '
+    .Images
+    | sort_by(.CreationDate)
+    | reverse[]
+    | [
+      .ImageId,
+      (if (.Name // "") == "" then "(unnamed)" else .Name end),
+      .CreationDate,
+      ((.RootDeviceName // "/dev/xvda") as $root
+        | ([.BlockDeviceMappings[]? | select(.DeviceName == $root) | .Ebs.SnapshotId][0] // ""))
+    ]
+    | @tsv
+  '
+)"
+
+image_rows=()
+while IFS= read -r row; do
+  if [ -n "${row}" ]; then
+    image_rows+=("${row}")
+  fi
+done <<< "${image_rows_tsv}"
+
+if [ "${#image_rows[@]}" -eq 0 ]; then
+  echo "No CLAWDINATOR AMIs found."
+  exit 0
+fi
+
+declare -a newest_ids=()
+declare -a keep_ids=()
+declare -a prune_rows=()
+
+# ${arr[@]+"${arr[@]}"} expands to nothing for an empty array. A bare
+# "${arr[@]}" on an empty array is an unbound-variable error under set -u in
+# bash releases before 4.4.
+for image_id in ${in_use_ami_ids[@]+"${in_use_ami_ids[@]}"}; do
+  keep_ids+=("${image_id}")
+done
+
+recent_index=0
+for row in "${image_rows[@]}"; do
+  IFS=$'\t' read -r image_id name creation_date snapshot_id <<< "${row}"
+
+  if [ "${recent_index}" -lt "${keep_count}" ]; then
+    newest_ids+=("${image_id}")
+    if ! array_contains "${image_id}" ${keep_ids[@]+"${keep_ids[@]}"}; then
+      keep_ids+=("${image_id}")
+    fi
+    recent_index=$((recent_index + 1))
+  fi
+
+  if ! array_contains "${image_id}" ${keep_ids[@]+"${keep_ids[@]}"}; then
+    prune_rows+=("${row}")
+  fi
+done
+
+echo "CLAWDINATOR AMI retention"
+echo "Mode: $(printf '%s' "${apply}" | tr '[:lower:]' '[:upper:]')"
+echo "Region: ${region}"
+echo
+
+echo "In-use AMIs (${#in_use_ami_ids[@]}):"
+if [ "${#in_use_ami_ids[@]}" -eq 0 ]; then
+  echo " (none)"
+else
+  for image_id in "${in_use_ami_ids[@]}"; do
+    echo " ${image_id}"
+  done
+fi
+echo
+
+echo "Newest ${keep_count} AMIs by age:"
+for image_id in "${newest_ids[@]}"; do
+  row="$(find_image_row "${image_id}")"
+  IFS=$'\t' read -r _image_id name creation_date snapshot_id <<< "${row}"
+  echo " ${image_id} ${creation_date} ${name}"
+done
+echo
+
+echo "Keep-set (${#keep_ids[@]} total):"
+for row in "${image_rows[@]}"; do
+  reasons=()
+  IFS=$'\t' read -r image_id name creation_date snapshot_id <<< "${row}"
+  if array_contains "${image_id}" "${keep_ids[@]}"; then
+    if array_contains "${image_id}" ${in_use_ami_ids[@]+"${in_use_ami_ids[@]}"}; then
+      reasons+=("in-use")
+    fi
+    if array_contains "${image_id}" "${newest_ids[@]}"; then
+      reasons+=("recent")
+    fi
+    reason="$(
+      IFS=,
+      printf '%s' "${reasons[*]}"
+    )"
+    echo " keep ${image_id} ${creation_date} ${reason} ${name}"
+  fi
+done
+echo
+
+echo "Prune-set (${#prune_rows[@]} total):"
+if [ "${#prune_rows[@]}" -eq 0 ]; then
+  echo " (none)"
+else
+  for row in "${prune_rows[@]}"; do
+    IFS=$'\t' read -r image_id name creation_date snapshot_id <<< "${row}"
+    echo " prune ${image_id} ${creation_date} snapshot=${snapshot_id:-none} ${name}"
+  done
+fi
+echo
+
+if [ "${apply}" != "true" ]; then
+  echo "Dry-run only. Re-run with APPLY=true to prune old CLAWDINATOR AMIs."
+  exit 0
+fi
+
+for row in ${prune_rows[@]+"${prune_rows[@]}"}; do
+  IFS=$'\t' read -r image_id name creation_date snapshot_id <<< "${row}"
+  echo "Deregistering ${image_id} (${name})"
+  aws_deregister_image "${image_id}"
+
+  if [ -n "${snapshot_id}" ]; then
+    echo "Deleting snapshot ${snapshot_id}"
+    aws_delete_snapshot "${snapshot_id}"
+  fi
+done
diff --git a/scripts/upload-bootstrap-all.sh b/scripts/upload-bootstrap-all.sh index a6cc2a9..640243e 100755 --- a/scripts/upload-bootstrap-all.sh +++ b/scripts/upload-bootstrap-all.sh @@ -33,7 +33,10 @@ while IFS= read -r instance_name; do instance_secrets="${workdir}/${instance_name}/secrets" mkdir -p "${instance_secrets}" - rsync -a --exclude 'clawdinator-discord-token-*.age' "${secrets_dir}/" "${instance_secrets}/" + rsync -a \ + --exclude 'clawdinator-discord-token-*.age' \ + --exclude 'clawdinator-github-app.pem.age' \ + "${secrets_dir}/" "${instance_secrets}/" if [ ! -f "${secrets_dir}/${token_secret}.age" ]; then echo "Missing instance token ${secrets_dir}/${token_secret}.age" >&2