diff --git a/docs/backup.md b/docs/backup.md index 6142e5b..5b796f7 100644 --- a/docs/backup.md +++ b/docs/backup.md @@ -281,12 +281,13 @@ recipients as normal backup shards, and are committed with messages like to use the root `manifest.json` as the authoritative completed backup. This keeps long runs crash-tolerant without pretending partial data is a finished snapshot. A checkpoint commit can cover many messages, but its encrypted files -are split into smaller shard files to stay below normal GitHub blob limits. Tune -the commit cadence with `--gmail-checkpoint-rows` / `--gmail-checkpoint-interval` -on `gog backup push`, or `--checkpoint-rows` / `--checkpoint-interval` on -`gog backup gmail push`; set the interval or rows to `0` to disable that -trigger, or use `--no-gmail-checkpoints` / `--no-checkpoints` to disable -checkpoint pushes entirely. +are split by both row count and a conservative plaintext byte ceiling so large +messages do not create GitHub-rejected blobs. Tune the commit cadence with +`--gmail-checkpoint-rows` / `--gmail-checkpoint-interval` on `gog backup push`, +or `--checkpoint-rows` / `--checkpoint-interval` on `gog backup gmail push`; +set the interval or rows to `0` to disable that trigger, or use +`--no-gmail-checkpoints` / `--no-checkpoints` to disable checkpoint pushes +entirely. `--include-spam-trash` defaults to true. Use `--query` and `--max` for bounded test exports; omit them for a full mailbox scan. diff --git a/internal/cmd/backup_gmail.go b/internal/cmd/backup_gmail.go index c314487..b6569ca 100644 --- a/internal/cmd/backup_gmail.go +++ b/internal/cmd/backup_gmail.go @@ -353,6 +353,8 @@ type gmailBackupCheckpointer struct { const gmailCheckpointShardMaxRows = 250 +var gmailCheckpointShardMaxPlaintextBytes int64 = 32 * 1024 * 1024 + func newGmailBackupCheckpointer(ctx context.Context, opts gmailBackupOptions, total int) *gmailBackupCheckpointer { enabled := opts.Checkpoints && opts.CacheMessages && @@ -876,21 +878,57 @@ func buildGmailCheckpointShardsFromCache(accountHash, runID string, firstPart in return nil, nil } shards := make([]backup.PlainShard, 0, (len(ids)+gmailCheckpointShardMaxRows-1)/gmailCheckpointShardMaxRows) - for start := 0; start < len(ids); start += gmailCheckpointShardMaxRows { - end := start + gmailCheckpointShardMaxRows - if end > len(ids) { - end = len(ids) - } - shard, err := buildGmailCheckpointShardFromCache(accountHash, runID, firstPart+len(shards), ids[start:end]) - if err != nil { - for _, shard := range shards { - if strings.TrimSpace(shard.PlaintextPath) != "" { - _ = os.Remove(shard.PlaintextPath) - } + chunk := make([]string, 0, gmailCheckpointShardMaxRows) + var chunkBytes int64 + cleanup := func() { + for _, shard := range shards { + if strings.TrimSpace(shard.PlaintextPath) != "" { + _ = os.Remove(shard.PlaintextPath) } - return nil, err + } + } + flush := func() error { + if len(chunk) == 0 { + return nil + } + shard, err := buildGmailCheckpointShardFromCache(accountHash, runID, firstPart+len(shards), chunk) + if err != nil { + cleanup() + return err } shards = append(shards, shard) + chunk = chunk[:0] + chunkBytes = 0 + return nil + } + for _, id := range ids { + msg, ok, err := readGmailBackupMessageCache(accountHash, id) + if err != nil { + cleanup() + return nil, err + } + if !ok { + cleanup() + return nil, fmt.Errorf("gmail message %s missing from backup cache", id) + } + line, err := json.Marshal(msg) + if err != nil { + cleanup() + return nil, fmt.Errorf("encode gmail backup checkpoint shard estimate: %w", err) + } + lineBytes := int64(len(line) + 1) + overRows := len(chunk) >= gmailCheckpointShardMaxRows + overBytes := gmailCheckpointShardMaxPlaintextBytes > 0 && len(chunk) > 0 && chunkBytes+lineBytes > gmailCheckpointShardMaxPlaintextBytes + if overRows || overBytes { + if err := flush(); err != nil { + return nil, err + } + } + chunk = append(chunk, id) + chunkBytes += lineBytes + } + if err := flush(); err != nil { + return nil, err } return shards, nil } diff --git a/internal/cmd/backup_test.go b/internal/cmd/backup_test.go index 420c45b..f8b3b66 100644 --- a/internal/cmd/backup_test.go +++ b/internal/cmd/backup_test.go @@ -454,6 +454,36 @@ func TestBuildGmailCheckpointShardsFromCacheSplitsLargeChunks(t *testing.T) { } } +func TestBuildGmailCheckpointShardsFromCacheSplitsByPlaintextSize(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + oldLimit := gmailCheckpointShardMaxPlaintextBytes + gmailCheckpointShardMaxPlaintextBytes = 1 + t.Cleanup(func() { gmailCheckpointShardMaxPlaintextBytes = oldLimit }) + accountHash := "accthash" + ids := []string{"m1", "m2", "m3"} + for _, id := range ids { + if err := writeGmailBackupMessageCache(accountHash, gmailBackupMessage{ID: id, Raw: strings.Repeat("raw-"+id, 8)}); err != nil { + t.Fatalf("writeGmailBackupMessageCache: %v", err) + } + } + shards, err := buildGmailCheckpointShardsFromCache(accountHash, "run-test", 11, ids) + if err != nil { + t.Fatalf("buildGmailCheckpointShardsFromCache: %v", err) + } + if len(shards) != 3 { + t.Fatalf("len(shards) = %d, want 3", len(shards)) + } + for i, shard := range shards { + if shard.Rows != 1 { + t.Fatalf("shards[%d].Rows = %d, want 1", i, shard.Rows) + } + want := fmt.Sprintf("part-%06d.jsonl.gz.age", 11+i) + if !strings.HasSuffix(shard.Path, want) { + t.Fatalf("shards[%d].Path = %q, want suffix %q", i, shard.Path, want) + } + } +} + func TestBuildGmailMessageShardsFromCacheWritesPlaintextPaths(t *testing.T) { t.Setenv("HOME", t.TempDir()) accountHash := "accthash"