fix(backup): split gmail checkpoints by plaintext size

This commit is contained in:
Peter Steinberger 2026-04-28 04:01:00 +01:00
parent 7b9197ebd1
commit d6111be738
No known key found for this signature in database
3 changed files with 87 additions and 18 deletions

View File

@ -281,12 +281,13 @@ recipients as normal backup shards, and are committed with messages like
to use the root `manifest.json` as the authoritative completed backup. This
keeps long runs crash-tolerant without pretending partial data is a finished
snapshot. A checkpoint commit can cover many messages, but its encrypted files
are split into smaller shard files to stay below normal GitHub blob limits. Tune
the commit cadence with `--gmail-checkpoint-rows` / `--gmail-checkpoint-interval`
on `gog backup push`, or `--checkpoint-rows` / `--checkpoint-interval` on
`gog backup gmail push`; set the interval or rows to `0` to disable that
trigger, or use `--no-gmail-checkpoints` / `--no-checkpoints` to disable
checkpoint pushes entirely.
are split by both row count and a conservative plaintext byte ceiling so large
messages do not create GitHub-rejected blobs. Tune the commit cadence with
`--gmail-checkpoint-rows` / `--gmail-checkpoint-interval` on `gog backup push`,
or `--checkpoint-rows` / `--checkpoint-interval` on `gog backup gmail push`;
set the interval or rows to `0` to disable that trigger, or use
`--no-gmail-checkpoints` / `--no-checkpoints` to disable checkpoint pushes
entirely.
`--include-spam-trash` defaults to true. Use `--query` and `--max` for bounded
test exports; omit them for a full mailbox scan.

View File

@ -353,6 +353,8 @@ type gmailBackupCheckpointer struct {
// gmailCheckpointShardMaxRows caps how many cached messages are grouped into
// a single checkpoint shard.
const gmailCheckpointShardMaxRows = 250
// gmailCheckpointShardMaxPlaintextBytes is the ceiling (32 MiB) on the
// estimated plaintext size of a single checkpoint shard, measured as the sum
// of each message's JSON encoding plus one byte per message (presumably the
// JSONL line terminator). A message that exceeds the ceiling on its own is
// still written as a one-row shard, and a non-positive value disables the byte
// limit. It is a variable rather than a constant so tests can lower it.
var gmailCheckpointShardMaxPlaintextBytes int64 = 32 * 1024 * 1024
func newGmailBackupCheckpointer(ctx context.Context, opts gmailBackupOptions, total int) *gmailBackupCheckpointer {
enabled := opts.Checkpoints &&
opts.CacheMessages &&
@ -876,21 +878,57 @@ func buildGmailCheckpointShardsFromCache(accountHash, runID string, firstPart in
return nil, nil
}
shards := make([]backup.PlainShard, 0, (len(ids)+gmailCheckpointShardMaxRows-1)/gmailCheckpointShardMaxRows)
for start := 0; start < len(ids); start += gmailCheckpointShardMaxRows {
end := start + gmailCheckpointShardMaxRows
if end > len(ids) {
end = len(ids)
}
shard, err := buildGmailCheckpointShardFromCache(accountHash, runID, firstPart+len(shards), ids[start:end])
if err != nil {
for _, shard := range shards {
if strings.TrimSpace(shard.PlaintextPath) != "" {
_ = os.Remove(shard.PlaintextPath)
}
chunk := make([]string, 0, gmailCheckpointShardMaxRows)
var chunkBytes int64
cleanup := func() {
for _, shard := range shards {
if strings.TrimSpace(shard.PlaintextPath) != "" {
_ = os.Remove(shard.PlaintextPath)
}
return nil, err
}
}
flush := func() error {
if len(chunk) == 0 {
return nil
}
shard, err := buildGmailCheckpointShardFromCache(accountHash, runID, firstPart+len(shards), chunk)
if err != nil {
cleanup()
return err
}
shards = append(shards, shard)
chunk = chunk[:0]
chunkBytes = 0
return nil
}
for _, id := range ids {
msg, ok, err := readGmailBackupMessageCache(accountHash, id)
if err != nil {
cleanup()
return nil, err
}
if !ok {
cleanup()
return nil, fmt.Errorf("gmail message %s missing from backup cache", id)
}
line, err := json.Marshal(msg)
if err != nil {
cleanup()
return nil, fmt.Errorf("encode gmail backup checkpoint shard estimate: %w", err)
}
lineBytes := int64(len(line) + 1)
overRows := len(chunk) >= gmailCheckpointShardMaxRows
overBytes := gmailCheckpointShardMaxPlaintextBytes > 0 && len(chunk) > 0 && chunkBytes+lineBytes > gmailCheckpointShardMaxPlaintextBytes
if overRows || overBytes {
if err := flush(); err != nil {
return nil, err
}
}
chunk = append(chunk, id)
chunkBytes += lineBytes
}
if err := flush(); err != nil {
return nil, err
}
return shards, nil
}

View File

@ -454,6 +454,36 @@ func TestBuildGmailCheckpointShardsFromCacheSplitsLargeChunks(t *testing.T) {
}
}
// TestBuildGmailCheckpointShardsFromCacheSplitsByPlaintextSize checks that the
// plaintext byte ceiling alone forces a split: with the ceiling lowered to a
// single byte, three cached messages must come back as three one-row shards
// whose part numbers continue from the requested first part.
func TestBuildGmailCheckpointShardsFromCacheSplitsByPlaintextSize(t *testing.T) {
	t.Setenv("HOME", t.TempDir())

	// Lower the package-level ceiling for this test only and restore it on exit.
	savedLimit := gmailCheckpointShardMaxPlaintextBytes
	t.Cleanup(func() { gmailCheckpointShardMaxPlaintextBytes = savedLimit })
	gmailCheckpointShardMaxPlaintextBytes = 1

	const firstPart = 11
	accountHash := "accthash"
	messageIDs := []string{"m1", "m2", "m3"}

	// Seed the cache with one small message per ID.
	for _, id := range messageIDs {
		msg := gmailBackupMessage{ID: id, Raw: strings.Repeat("raw-"+id, 8)}
		if err := writeGmailBackupMessageCache(accountHash, msg); err != nil {
			t.Fatalf("writeGmailBackupMessageCache: %v", err)
		}
	}

	shards, err := buildGmailCheckpointShardsFromCache(accountHash, "run-test", firstPart, messageIDs)
	if err != nil {
		t.Fatalf("buildGmailCheckpointShardsFromCache: %v", err)
	}
	if got := len(shards); got != 3 {
		t.Fatalf("len(shards) = %d, want 3", got)
	}

	// Every shard holds exactly one row and is numbered consecutively.
	for i := range shards {
		if rows := shards[i].Rows; rows != 1 {
			t.Fatalf("shards[%d].Rows = %d, want 1", i, rows)
		}
		wantSuffix := fmt.Sprintf("part-%06d.jsonl.gz.age", firstPart+i)
		if path := shards[i].Path; !strings.HasSuffix(path, wantSuffix) {
			t.Fatalf("shards[%d].Path = %q, want suffix %q", i, path, wantSuffix)
		}
	}
}
func TestBuildGmailMessageShardsFromCacheWritesPlaintextPaths(t *testing.T) {
t.Setenv("HOME", t.TempDir())
accountHash := "accthash"