fix(backup): split gmail checkpoints by plaintext size
This commit is contained in:
parent
7b9197ebd1
commit
d6111be738
@ -281,12 +281,13 @@ recipients as normal backup shards, and are committed with messages like
|
||||
to use the root `manifest.json` as the authoritative completed backup. This
|
||||
keeps long runs crash-tolerant without pretending partial data is a finished
|
||||
snapshot. A checkpoint commit can cover many messages, but its encrypted files
|
||||
are split into smaller shard files to stay below normal GitHub blob limits. Tune
|
||||
the commit cadence with `--gmail-checkpoint-rows` / `--gmail-checkpoint-interval`
|
||||
on `gog backup push`, or `--checkpoint-rows` / `--checkpoint-interval` on
|
||||
`gog backup gmail push`; set the interval or rows to `0` to disable that
|
||||
trigger, or use `--no-gmail-checkpoints` / `--no-checkpoints` to disable
|
||||
checkpoint pushes entirely.
|
||||
are split by both row count and a conservative plaintext byte ceiling so large
|
||||
messages do not create GitHub-rejected blobs. Tune the commit cadence with
|
||||
`--gmail-checkpoint-rows` / `--gmail-checkpoint-interval` on `gog backup push`,
|
||||
or `--checkpoint-rows` / `--checkpoint-interval` on `gog backup gmail push`;
|
||||
set the interval or rows to `0` to disable that trigger, or use
|
||||
`--no-gmail-checkpoints` / `--no-checkpoints` to disable checkpoint pushes
|
||||
entirely.
|
||||
|
||||
`--include-spam-trash` defaults to true. Use `--query` and `--max` for bounded
|
||||
test exports; omit them for a full mailbox scan.
|
||||
|
||||
@ -353,6 +353,8 @@ type gmailBackupCheckpointer struct {
|
||||
|
||||
// gmailCheckpointShardMaxRows is the maximum number of cached Gmail messages
// grouped into one checkpoint shard; once a pending chunk holds this many
// message IDs it is flushed into its own shard before more are added.
const gmailCheckpointShardMaxRows = 250
|
||||
|
||||
// gmailCheckpointShardMaxPlaintextBytes is the plaintext byte ceiling (32 MiB)
// applied to a single checkpoint shard when splitting cached messages. The
// splitter only enforces it while the value is positive, and it is a variable
// rather than a constant so tests can temporarily lower it.
var gmailCheckpointShardMaxPlaintextBytes int64 = 32 << 20
|
||||
|
||||
func newGmailBackupCheckpointer(ctx context.Context, opts gmailBackupOptions, total int) *gmailBackupCheckpointer {
|
||||
enabled := opts.Checkpoints &&
|
||||
opts.CacheMessages &&
|
||||
@ -876,21 +878,57 @@ func buildGmailCheckpointShardsFromCache(accountHash, runID string, firstPart in
|
||||
return nil, nil
|
||||
}
|
||||
shards := make([]backup.PlainShard, 0, (len(ids)+gmailCheckpointShardMaxRows-1)/gmailCheckpointShardMaxRows)
|
||||
for start := 0; start < len(ids); start += gmailCheckpointShardMaxRows {
|
||||
end := start + gmailCheckpointShardMaxRows
|
||||
if end > len(ids) {
|
||||
end = len(ids)
|
||||
}
|
||||
shard, err := buildGmailCheckpointShardFromCache(accountHash, runID, firstPart+len(shards), ids[start:end])
|
||||
if err != nil {
|
||||
for _, shard := range shards {
|
||||
if strings.TrimSpace(shard.PlaintextPath) != "" {
|
||||
_ = os.Remove(shard.PlaintextPath)
|
||||
}
|
||||
chunk := make([]string, 0, gmailCheckpointShardMaxRows)
|
||||
var chunkBytes int64
|
||||
cleanup := func() {
|
||||
for _, shard := range shards {
|
||||
if strings.TrimSpace(shard.PlaintextPath) != "" {
|
||||
_ = os.Remove(shard.PlaintextPath)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
flush := func() error {
|
||||
if len(chunk) == 0 {
|
||||
return nil
|
||||
}
|
||||
shard, err := buildGmailCheckpointShardFromCache(accountHash, runID, firstPart+len(shards), chunk)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return err
|
||||
}
|
||||
shards = append(shards, shard)
|
||||
chunk = chunk[:0]
|
||||
chunkBytes = 0
|
||||
return nil
|
||||
}
|
||||
for _, id := range ids {
|
||||
msg, ok, err := readGmailBackupMessageCache(accountHash, id)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, err
|
||||
}
|
||||
if !ok {
|
||||
cleanup()
|
||||
return nil, fmt.Errorf("gmail message %s missing from backup cache", id)
|
||||
}
|
||||
line, err := json.Marshal(msg)
|
||||
if err != nil {
|
||||
cleanup()
|
||||
return nil, fmt.Errorf("encode gmail backup checkpoint shard estimate: %w", err)
|
||||
}
|
||||
lineBytes := int64(len(line) + 1)
|
||||
overRows := len(chunk) >= gmailCheckpointShardMaxRows
|
||||
overBytes := gmailCheckpointShardMaxPlaintextBytes > 0 && len(chunk) > 0 && chunkBytes+lineBytes > gmailCheckpointShardMaxPlaintextBytes
|
||||
if overRows || overBytes {
|
||||
if err := flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
}
|
||||
chunk = append(chunk, id)
|
||||
chunkBytes += lineBytes
|
||||
}
|
||||
if err := flush(); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return shards, nil
|
||||
}
|
||||
|
||||
@ -454,6 +454,36 @@ func TestBuildGmailCheckpointShardsFromCacheSplitsLargeChunks(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildGmailCheckpointShardsFromCacheSplitsByPlaintextSize(t *testing.T) {
|
||||
t.Setenv("HOME", t.TempDir())
|
||||
oldLimit := gmailCheckpointShardMaxPlaintextBytes
|
||||
gmailCheckpointShardMaxPlaintextBytes = 1
|
||||
t.Cleanup(func() { gmailCheckpointShardMaxPlaintextBytes = oldLimit })
|
||||
accountHash := "accthash"
|
||||
ids := []string{"m1", "m2", "m3"}
|
||||
for _, id := range ids {
|
||||
if err := writeGmailBackupMessageCache(accountHash, gmailBackupMessage{ID: id, Raw: strings.Repeat("raw-"+id, 8)}); err != nil {
|
||||
t.Fatalf("writeGmailBackupMessageCache: %v", err)
|
||||
}
|
||||
}
|
||||
shards, err := buildGmailCheckpointShardsFromCache(accountHash, "run-test", 11, ids)
|
||||
if err != nil {
|
||||
t.Fatalf("buildGmailCheckpointShardsFromCache: %v", err)
|
||||
}
|
||||
if len(shards) != 3 {
|
||||
t.Fatalf("len(shards) = %d, want 3", len(shards))
|
||||
}
|
||||
for i, shard := range shards {
|
||||
if shard.Rows != 1 {
|
||||
t.Fatalf("shards[%d].Rows = %d, want 1", i, shard.Rows)
|
||||
}
|
||||
want := fmt.Sprintf("part-%06d.jsonl.gz.age", 11+i)
|
||||
if !strings.HasSuffix(shard.Path, want) {
|
||||
t.Fatalf("shards[%d].Path = %q, want suffix %q", i, shard.Path, want)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildGmailMessageShardsFromCacheWritesPlaintextPaths(t *testing.T) {
|
||||
t.Setenv("HOME", t.TempDir())
|
||||
accountHash := "accthash"
|
||||
|
||||
Loading…
Reference in New Issue
Block a user