Add fast release pipeline (bootstrap + SSM nixos-rebuild)
- Add release.yml: eval -> upload bootstrap -> deploy via SSM (canary order) - Make image-build manual/weekly (base AMI lane) - Add SSM permissions to CI IAM policy (requires tofu apply) - Add scripts for SSM-based nixos-rebuild and docs for the two-lane model
This commit is contained in:
parent
d7df4f0e13
commit
9245311395
5
.github/workflows/image-build.yml
vendored
5
.github/workflows/image-build.yml
vendored
@ -6,9 +6,8 @@ concurrency:
|
||||
|
||||
on:
|
||||
workflow_dispatch:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
- cron: "17 3 * * 0" # weekly (Sunday)
|
||||
|
||||
jobs:
|
||||
build-image:
|
||||
|
||||
134
.github/workflows/release.yml
vendored
Normal file
134
.github/workflows/release.yml
vendored
Normal file
@ -0,0 +1,134 @@
|
||||
# Fast release lane: evaluate the NixOS host configs, upload the bootstrap
# bundles to S3, then switch the fleet in canary order via AWS SSM.
name: Release (bootstrap + fast deploy)

# Every run writes to the same S3 bucket and the same hosts, so all releases
# share one group regardless of the triggering ref. In-progress runs are not
# cancelled: killing the runner in the middle of a fleet switch would leave the
# rollout half-applied with nobody watching the SSM result. A newer run waits
# in the queue instead.
concurrency:
  group: release
  cancel-in-progress: false

on:
  push:
    branches:
      - main
    paths:
      - flake.nix
      - flake.lock
      - nix/**
      - scripts/**
      - clawdinator/**
      - infra/opentofu/aws/**
  workflow_dispatch:

jobs:
  # Cheap gate: make sure both host configurations still evaluate before any
  # AWS credentials are used.
  eval:
    runs-on: ubuntu-latest
    permissions:
      contents: read
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install Nix
        uses: cachix/install-nix-action@v27
        with:
          nix_path: nixpkgs=channel:nixos-unstable
          extra_nix_config: |
            extra-substituters = https://cache.garnix.io
            extra-trusted-public-keys = cache.garnix.io:CTFPyKSLcx5RMJKfLo5EEPUObbA78b0YQ2DTCJXqr9g=

      - name: Evaluate host configs (fail fast)
        run: |
          nix eval --raw .#nixosConfigurations.clawdinator-1.config.system.build.toplevel.drvPath --accept-flake-config >/dev/null
          nix eval --raw .#nixosConfigurations.clawdinator-2.config.system.build.toplevel.drvPath --accept-flake-config >/dev/null

  # Build the bootstrap bundles (age secrets, repo seeds) and push them to S3.
  upload-bootstrap:
    needs: eval
    runs-on: ubuntu-latest
    permissions:
      contents: read
    env:
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_REGION: ${{ secrets.AWS_REGION }}
      S3_BUCKET: ${{ secrets.S3_BUCKET }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install Nix
        uses: cachix/install-nix-action@v27
        with:
          nix_path: nixpkgs=channel:nixos-unstable
          extra_nix_config: |
            extra-substituters = https://cache.garnix.io
            extra-trusted-public-keys = cache.garnix.io:CTFPyKSLcx5RMJKfLo5EEPUObbA78b0YQ2DTCJXqr9g=

      - name: Install tooling
        run: |
          nix profile install \
            nixpkgs#awscli2 \
            nixpkgs#age \
            nixpkgs#jq \
            nixpkgs#zstd

      - name: Write agenix image key
        env:
          CLAWDINATOR_AGE_KEY: ${{ secrets.CLAWDINATOR_AGE_KEY }}
        run: |
          mkdir -p nix/keys
          printf '%s' "${CLAWDINATOR_AGE_KEY}" > nix/keys/clawdinator.agekey
          chmod 600 nix/keys/clawdinator.agekey

      - name: Fetch age secrets
        run: |
          mkdir -p nix/age-secrets
          aws s3 sync "s3://${S3_BUCKET}/age-secrets" nix/age-secrets
          bash scripts/validate-age-secrets.sh

      - name: Mint GitHub App token
        env:
          GITHUB_APP_ID: "2607181"
          GITHUB_APP_INSTALLATION_ID: "102951645"
        run: |
          age -d -i nix/keys/clawdinator.agekey \
            -o /tmp/clawdinator-github-app.pem \
            nix/age-secrets/clawdinator-github-app.pem.age
          export GITHUB_APP_PEM_FILE=/tmp/clawdinator-github-app.pem
          token="$(scripts/mint-github-app-token.sh)"
          # The token is not a registered secret, so mask it explicitly before
          # it is exported to later steps; otherwise it can appear in logs.
          echo "::add-mask::${token}"
          echo "GITHUB_TOKEN=${token}" >> "${GITHUB_ENV}"

      - name: Prepare repo seeds
        env:
          GITHUB_TOKEN: ${{ env.GITHUB_TOKEN }}
        run: |
          scripts/prepare-repo-seeds.sh repo-seeds

      - name: Upload bootstrap bundles
        run: |
          bash scripts/upload-bootstrap-all.sh

  # Switch each host to the commit that triggered this run.
  deploy:
    needs: upload-bootstrap
    runs-on: ubuntu-latest
    permissions:
      contents: read
    env:
      AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_ID }}
      AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
      AWS_REGION: ${{ secrets.AWS_REGION }}
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Install Nix
        uses: cachix/install-nix-action@v27
        with:
          nix_path: nixpkgs=channel:nixos-unstable

      - name: Install tooling
        run: |
          nix profile install \
            nixpkgs#awscli2 \
            nixpkgs#jq

      - name: Switch fleet via SSM (canary then full)
        env:
          REV: ${{ github.sha }}
        run: |
          bash scripts/fleet-switch-nixos.sh "${REV}"
|
||||
44
docs/DEPLOYMENT_MODEL.md
Normal file
44
docs/DEPLOYMENT_MODEL.md
Normal file
@ -0,0 +1,44 @@
|
||||
# Deployment model (fast + declarative)
|
||||
|
||||
This repo uses a **two-lane** delivery model:
|
||||
|
||||
- **Lane A: Base AMI** (slow path, rare)
|
||||
- Purpose: reliable boot substrate (Nix + systemd + networking + EFS + SSM + bootstrap services).
|
||||
- Built by: `.github/workflows/image-build.yml` (manual or scheduled).
|
||||
- Tradeoff: EC2 VM Import is slow/variable; do not run per-commit.
|
||||
|
||||
- **Lane B: Release + Fleet switch** (fast path, every merge)
|
||||
- Purpose: ship config/app changes quickly while staying reproducible.
|
||||
- Built by: `.github/workflows/release.yml`.
|
||||
- Steps:
|
||||
1) **Fail-fast eval** of NixOS configs.
|
||||
2) Upload **bootstrap bundles** to S3 (repo seeds, workspace, secrets references).
|
||||
3) Deploy via **SSM**: `nixos-rebuild switch --flake github:openclaw/clawdinators/<rev>#<host>`.
|
||||
|
||||
## Primitives
|
||||
|
||||
- **Source of truth**: git SHA + `flake.lock`.
|
||||
- **Artifact**: NixOS system closure for each host config.
|
||||
- **Distribution**: Nix substituters + S3 bootstrap bundle.
|
||||
- **Activation**: `nixos-rebuild switch`.
|
||||
- **Rollout**: canary order (clawdinator-1, then clawdinator-2). If the switch fails on the canary, the rollout stops before the next host is touched.
|
||||
- **Rollback**: redeploy an older git SHA, e.g. `bash scripts/fleet-switch-nixos.sh <older-sha>`.
|
||||
|
||||
## Tradeoffs
|
||||
|
||||
- Pros:
|
||||
- Fast deploys (minutes) vs AMI import (tens of minutes).
|
||||
- Cattle-friendly: hosts stay disposable; state lives on EFS.
|
||||
- Reproducible: deploys are pinned to a git SHA.
|
||||
|
||||
- Cons:
|
||||
- `nixos-rebuild switch` restarts services; expect brief bot downtime per release.
|
||||
- Requires AWS SSM permissions for the CI user (see `infra/opentofu/aws/main.tf`).
|
||||
- If Nix caches miss, deploys can be slower (still typically faster than AMI import).
|
||||
|
||||
## Infra requirement: CI SSM permissions
|
||||
|
||||
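`release.yml` runs `scripts/fleet-switch-nixos.sh`, which issues `aws ssm send-command` (and polls for the result) through `scripts/aws-ssm-run.sh`.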
|
||||
|
||||
After pulling these changes, run `tofu apply` in `infra/opentofu/aws` (with admin creds)
|
||||
so the CI IAM policy includes the `FleetDeploySSM` statement.
|
||||
@ -188,6 +188,21 @@ data "aws_iam_policy_document" "ami_importer" {
|
||||
resources = ["*"]
|
||||
}
|
||||
|
||||
# Allow CI to do fast, declarative deploys via AWS Systems Manager (SSM)
# instead of slow AMI replacement.
#
# ssm:SendCommand is effectively remote root, so it is granted only for the
# AWS-RunShellScript document and only against instances tagged app=clawdinator
# (the same tag scripts/aws-resolve-instance-id.sh filters on). SendCommand is
# authorized against both the document and the instance resource, and the tag
# condition only applies to instances, hence two statements.
# NOTE(review): scripts/aws-resolve-instance-id.sh also calls
# ec2:DescribeInstances; confirm another statement in this policy grants it.
statement {
  sid       = "FleetDeploySSMDocument"
  actions   = ["ssm:SendCommand"]
  resources = ["arn:aws:ssm:*:*:document/AWS-RunShellScript"]
}

statement {
  sid       = "FleetDeploySSM"
  actions   = ["ssm:SendCommand"]
  resources = ["arn:aws:ec2:*:*:instance/*"]

  condition {
    test     = "StringEquals"
    variable = "ssm:resourceTag/app"
    values   = ["clawdinator"]
  }
}

# Read-only calls used to poll command status and collect output.
statement {
  sid = "FleetDeploySSMRead"
  actions = [
    "ssm:GetCommandInvocation",
    "ssm:ListCommands",
    "ssm:ListCommandInvocations",
    "ssm:DescribeInstanceInformation",
    "ssm:GetDocument"
  ]
  resources = ["*"]
}
|
||||
|
||||
statement {
|
||||
sid = "TerraformLockTable"
|
||||
actions = [
|
||||
|
||||
31
scripts/aws-resolve-instance-id.sh
Executable file
31
scripts/aws-resolve-instance-id.sh
Executable file
@ -0,0 +1,31 @@
|
||||
#!/usr/bin/env bash
# Resolve a host's Name tag to the id of its single running EC2 instance.
#
# usage: aws-resolve-instance-id.sh <host-tag-Name>
#
# Only instances tagged app=clawdinator in the "running" state are considered.
# Prints the instance id on stdout. Exits 1 when zero or more than one instance
# matches, and 2 on bad usage.
set -euo pipefail

if (( $# != 1 )); then
  printf 'usage: %s <host-tag-Name>\n' "$0" >&2
  exit 2
fi

host="$1"

filters=(
  "Name=tag:app,Values=clawdinator"
  "Name=tag:Name,Values=${host}"
  "Name=instance-state-name,Values=running"
)

ids="$(aws ec2 describe-instances \
  --filters "${filters[@]}" \
  --query 'Reservations[].Instances[].InstanceId' \
  --output text)"

# Both an empty result and the literal "None" are treated as "nothing found".
case "${ids}" in
  "" | "None")
    printf 'no running instance found for Name tag: %s\n' "${host}" >&2
    exit 1
    ;;
esac

# If multiple instances match, fail loudly.
# Split on any whitespace (spaces, tabs, newlines) to count the ids returned.
read -r -d '' -a matches <<<"${ids}" || true
count="${#matches[@]}"
if (( count != 1 )); then
  printf 'expected 1 instance for %s, got %s: %s\n' "${host}" "${count}" "${ids}" >&2
  exit 1
fi

printf '%s\n' "${ids}"
|
||||
68
scripts/aws-ssm-run.sh
Executable file
68
scripts/aws-ssm-run.sh
Executable file
@ -0,0 +1,68 @@
|
||||
#!/usr/bin/env bash
# Run a shell command on one instance through AWS Systems Manager (document
# AWS-RunShellScript), wait for it to finish, and relay its output.
#
# usage: aws-ssm-run.sh <instance-id> <command...>
#
# environment (optional):
#   SSM_POLL_ATTEMPTS          number of status polls before giving up (default 300)
#   SSM_POLL_INTERVAL_SECONDS  sleep between polls (default 2)
#
# stdout: the command's StandardOutputContent as returned by SSM
# stderr: progress messages and the command's StandardErrorContent
# exit:   0 when the final SSM status is "Success", 1 otherwise, 2 on bad usage
set -euo pipefail

if [ "$#" -lt 2 ]; then
  echo "usage: $0 <instance-id> <command...>" >&2
  exit 2
fi

instance_id="$1"
shift

# Join remaining args into a single shell command.
cmd="$*"

poll_attempts="${SSM_POLL_ATTEMPTS:-300}"
poll_interval="${SSM_POLL_INTERVAL_SECONDS:-2}"

# Build the parameters as JSON rather than the CLI shorthand
# (commands=...): shorthand parsing splits on commas and reinterprets quotes,
# which can silently alter the command that runs on the instance.
parameters="$(jq -cn --arg cmd "${cmd}" '{commands: [$cmd]}')"

command_id="$(aws ssm send-command \
  --instance-ids "${instance_id}" \
  --document-name "AWS-RunShellScript" \
  --comment "clawdinators deploy" \
  --parameters "${parameters}" \
  --query 'Command.CommandId' \
  --output text)"

echo "ssm command id: ${command_id} (instance: ${instance_id})" >&2

status=""
finished=0
# Wait for invocation to exist + finish. Lookup errors are tolerated here
# because the invocation may not be visible immediately after send-command.
for _ in $(seq 1 "${poll_attempts}"); do
  status="$(aws ssm list-command-invocations \
    --command-id "${command_id}" \
    --details \
    --query 'CommandInvocations[0].Status' \
    --output text 2>/dev/null || true)"

  case "${status}" in
    Success|Cancelled|TimedOut|Failed)
      finished=1
      break
      ;;
    Pending|InProgress|Delayed|Cancelling|None|"")
      sleep "${poll_interval}"
      ;;
    *)
      echo "unknown SSM status: ${status}" >&2
      sleep "${poll_interval}"
      ;;
  esac
done

if [ "${finished}" -ne 1 ]; then
  # Make the timeout visible; the final status check below still decides the
  # exit code, so a command that completes right now is not misreported.
  echo "gave up waiting for ssm command ${command_id} after ${poll_attempts} polls (last status: ${status:-unknown})" >&2
fi

invocation_json="$(aws ssm get-command-invocation \
  --command-id "${command_id}" \
  --instance-id "${instance_id}" \
  --output json)"

# NOTE(review): SSM is documented to truncate these output fields; long
# nixos-rebuild logs may be cut off here -- confirm against the API reference.
stdout="$(jq -r '.StandardOutputContent // ""' <<<"${invocation_json}")"
stderr="$(jq -r '.StandardErrorContent // ""' <<<"${invocation_json}")"
final_status="$(jq -r '.Status' <<<"${invocation_json}")"

if [ -n "${stdout}" ]; then
  echo "${stdout}"
fi
if [ -n "${stderr}" ]; then
  echo "--- stderr ---" >&2
  echo "${stderr}" >&2
fi

if [ "${final_status}" != "Success" ]; then
  echo "ssm command failed: status=${final_status}" >&2
  exit 1
fi
|
||||
fi
|
||||
29
scripts/fleet-switch-nixos.sh
Executable file
29
scripts/fleet-switch-nixos.sh
Executable file
@ -0,0 +1,29 @@
|
||||
#!/usr/bin/env bash
# Switch hosts to a given git revision of the flake, one at a time, via SSM.
#
# usage: fleet-switch-nixos.sh <git-rev> [host1 host2 ...]
#
# With no hosts given, deploys in canary order: clawdinator-1, then
# clawdinator-2. For each host it resolves the instance id from the Name tag,
# runs `nixos-rebuild switch` against the pinned revision, and checks that the
# clawdinator unit is active. Because of `set -e`, the first failing host
# aborts the rollout and the remaining hosts are left untouched.
set -euo pipefail

if [ "$#" -lt 1 ]; then
  echo "usage: $0 <git-rev> [host1 host2 ...]" >&2
  echo "example: $0 ${GITHUB_SHA:-<sha>} clawdinator-1 clawdinator-2" >&2
  exit 2
fi

rev="$1"
shift

if [ "$#" -eq 0 ]; then
  # Canary order.
  hosts=(clawdinator-1 clawdinator-2)
else
  hosts=("$@")
fi

# Locate the sibling helper scripts relative to this file so the script also
# works when invoked from outside the repository root.
script_dir="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"

# rev and host are spliced into a command line that runs as root on the remote
# host, so refuse anything containing shell metacharacters or quotes.
safe_rev='^[A-Za-z0-9._/-]+$'
safe_host='^[A-Za-z0-9._-]+$'

if [[ ! "${rev}" =~ ${safe_rev} ]]; then
  echo "invalid git rev: ${rev}" >&2
  exit 2
fi

for host in "${hosts[@]}"; do
  if [[ ! "${host}" =~ ${safe_host} ]]; then
    echo "invalid host name: ${host}" >&2
    exit 2
  fi
done

for host in "${hosts[@]}"; do
  echo "== deploy: ${host} @ ${rev} ==" >&2
  instance_id="$(bash "${script_dir}/aws-resolve-instance-id.sh" "${host}")"

  # Run everything under bash -lc so PATH + profiles behave similarly to an interactive session.
  # We also force flakes enabled for safety.
  bash "${script_dir}/aws-ssm-run.sh" "${instance_id}" \
    "bash -lc 'set -euo pipefail; export NIX_CONFIG=\"experimental-features = nix-command flakes\"; nixos-rebuild switch --accept-flake-config --flake github:openclaw/clawdinators/${rev}#${host}; systemctl is-active clawdinator'"

done
|
||||
Loading…
Reference in New Issue
Block a user