infra: restore ec2 user-data fetch

- add fetch-ec2-metadata service for AMI bootstrap

- set git safe.directory for nixos-rebuild

- note clawdinator-2 recovery in ops
This commit is contained in:
Josh Palmer 2026-02-03 16:36:21 -08:00
parent b54453c593
commit 1384ee7b47
4 changed files with 94 additions and 1 deletions

View File

@ -26,3 +26,8 @@ Update with incidents, fixes, and operational lessons.
- Instances:
- clawdinator-1: i-0b6060699bb413d82 (IP 18.198.25.107, DNS ec2-18-198-25-107.eu-central-1.compute.amazonaws.com).
- clawdinator-2: i-07bcba2bb924dfc93 (IP 3.66.165.141, DNS ec2-3-66-165-141.eu-central-1.compute.amazonaws.com).
## 2026-02-04
- clawdinator-2 booted without /etc/ec2-metadata/user-data, so amazon-init skipped user-data and clawdinator stayed inactive.
- Manual recovery: fetch IMDS user-data, rerun user-data script, set git safe.directory, set transient hostname.
- Fix: add fetch-ec2-metadata systemd unit to AMI config + git safe.directory in programs.git.

View File

@ -1,4 +1,4 @@
{ modulesPath, config, ... }: {
{ modulesPath, config, pkgs, ... }: {
imports = [
(modulesPath + "/virtualisation/ec2-data.nix")
(modulesPath + "/virtualisation/amazon-init.nix")
@ -29,4 +29,20 @@
device = "/dev/disk/by-label/nixos";
fsType = "ext4";
};
# One-shot unit that runs scripts/fetch-ec2-metadata.sh under bash at boot.
# NOTE(review): presumably that script is what populates /etc/ec2-metadata
# (including user-data) for amazon-init — confirm against the script.
systemd.services.fetch-ec2-metadata = {
description = "Fetch EC2 metadata";
wantedBy = [ "multi-user.target" ];
# Pull in and order after network-online.target so the unit starts once the
# network is reported online.
wants = [ "network-online.target" ];
after = [ "network-online.target" ];
# curl is placed on the unit's PATH for the script.
path = [ pkgs.curl ];
serviceConfig = {
# oneshot: units ordered after this one wait for the script to exit.
Type = "oneshot";
# Send the script's output to both the journal and the console.
StandardOutput = "journal+console";
# The script path is interpolated as a Nix path, so it is copied to the store.
ExecStart = "${pkgs.bash}/bin/bash ${../../scripts/fetch-ec2-metadata.sh}";
};
};
# Make amazon-init pull in the fetch unit and start only after it has run.
systemd.services.amazon-init.after = [ "fetch-ec2-metadata.service" ];
systemd.services.amazon-init.wants = [ "fetch-ec2-metadata.service" ];
}

View File

@ -486,6 +486,9 @@ in
name = "CLAWDINATOR Bot";
email = "clawdinator[bot]@users.noreply.github.com";
};
safe = {
directory = [ "/var/lib/clawd/repos/clawdinators" ];
};
};
};

View File

@ -0,0 +1,69 @@
#!/usr/bin/env bash
# Downloads EC2 instance metadata (AMI manifest path, user-data, hostname and
# the first SSH public key) from the instance metadata service into $metaDir.

# Directory that receives one file per metadata item.
metaDir=/etc/ec2-metadata
mkdir -p "$metaDir"
chmod 0755 "$metaDir"

# Drop files left over from a previous run. The glob has to stay outside the
# quotes: inside double quotes "*" is literal and rm would only look for a file
# actually named "*". ${metaDir:?} aborts instead of expanding to "/*" should
# the variable ever be empty.
rm -f -- "${metaDir:?}"/*
get_imds_token() {
  # Request a metadata-service session token (TTL 600 seconds) with an HTTP PUT.
  # curl writes the response body, i.e. the token, to stdout and its exit
  # status becomes the function's status.
  #
  # A 1-second retry delay gives the system a moment to get going without
  # adding much to the boot time.
  local -a request=(
    --silent
    --show-error
    --retry 3
    --retry-delay 1
    --fail
    -X PUT
    --connect-timeout 1
    -H "X-aws-ec2-metadata-token-ttl-seconds: 600"
  )
  curl "${request[@]}" http://169.254.169.254/latest/api/token
}
preflight_imds_token() {
  # Probe the instance-id endpoint using the token held in $IMDS_TOKEN.
  # The response body is discarded; only curl's exit status matters, and it is
  # returned to the caller.
  #
  # A 1-second retry delay gives the system a moment to get going without
  # adding much to the boot time.
  local -a request=(
    --silent
    --show-error
    --retry 3
    --retry-delay 1
    --fail
    --connect-timeout 1
    -H "X-aws-ec2-metadata-token: $IMDS_TOKEN"
    -o /dev/null
  )
  curl "${request[@]}" http://169.254.169.254/1.0/meta-data/instance-id
}
# Step 1: obtain a session token, trying up to 3 times one second apart.
try=1
while [ "$try" -le 3 ]; do
  echo "(attempt $try/3) getting an EC2 instance metadata service v2 token..."
  IMDS_TOKEN=$(get_imds_token) && break
  try=$((try + 1))
  sleep 1
done
if [ -z "$IMDS_TOKEN" ]; then
  # Deliberately not fatal: the script carries on and still attempts the
  # downloads below.
  # NOTE(review): presumably those requests then depend on token-less (IMDSv1)
  # access being enabled on the instance — confirm.
  echo "failed to fetch an IMDSv2 token." >&2
fi

# Step 2: wait (up to 10 attempts) until the metadata service answers a request
# made with the token.
try=1
while [ "$try" -le 10 ]; do
  echo "(attempt $try/10) validating the EC2 instance metadata service v2 token..."
  preflight_imds_token && break
  try=$((try + 1))
  sleep 1
done
if [ "$try" -gt 10 ]; then
  # The loop ran out of attempts without a successful probe. Keep going, since
  # the downloads are best-effort, but leave a trace in the log.
  echo "failed to validate the IMDSv2 token; continuing anyway." >&2
fi

echo "getting EC2 instance metadata..."
get_imds() {
  # Best-effort GET against the metadata service, authenticated with
  # $IMDS_TOKEN. All arguments are passed straight through to curl.
  local token_header="X-aws-ec2-metadata-token: $IMDS_TOKEN"

  # --fail keeps a 404 HTML response body out of the destination file.
  if ! curl --silent --show-error --fail --header "$token_header" "$@"; then
    # A missing item (e.g. a 404) must not stop the script, so report success.
    return 0
  fi
  return 0
}
# Download each metadata item into its own file under $metaDir. get_imds never
# fails, so an item that is missing simply produces no file.
get_imds \
  -o "$metaDir/ami-manifest-path" \
  http://169.254.169.254/1.0/meta-data/ami-manifest-path

# user-data is fetched in a subshell with a restrictive umask so the file is
# created without group/other permissions; the umask change does not leak out.
(
  umask 077
  get_imds -o "$metaDir/user-data" http://169.254.169.254/1.0/user-data
)

get_imds \
  -o "$metaDir/hostname" \
  http://169.254.169.254/1.0/meta-data/hostname

get_imds \
  -o "$metaDir/public-keys-0-openssh-key" \
  http://169.254.169.254/1.0/meta-data/public-keys/0/openssh-key