diff --git a/CHANGELOG.md b/CHANGELOG.md index 406cb49..fdea392 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ - Backup: expand `gog backup push --services all` with Drive content export/download, Gmail settings, native Workspace Docs/Sheets/Slides/Form data, Apps Script projects, Chat, Classroom, best-effort optional service error shards, and plaintext Drive file export. - Backup: extend `--services all` with Drive permissions/comments/revisions, Calendar ACL/settings/colors, contact groups, Cloud Identity groups, Workspace Admin Directory users/groups/members, Keep notes, and local Gmail message caching for resumable full-mailbox fetches. - Backup: bound individual Drive content exports with `--drive-content-timeout` so one stuck Google export records an encrypted error row instead of blocking the full backup. +- Backup: add Gmail message-list checkpoints and stderr progress counters so full-mailbox backups can resume cleanly after interruption. ### Fixed - Gmail: auto-fill draft reply subjects from the original message when `gmail drafts create --reply-to-message-id` omits `--subject`. (#488) — thanks @jbowerbir. diff --git a/README.md b/README.md index b6f7c93..586f226 100644 --- a/README.md +++ b/README.md @@ -753,9 +753,10 @@ metadata, permissions, comments, revisions, and exported Google-native file content by default. Non-Google binary Drive files are metadata-only unless `--drive-binary-contents` is set. `--drive-content-timeout` turns a stuck per-file export into an encrypted error row instead of wedging the run. Gmail -raw-message fetches use a local cache by default so interrupted full-mailbox -backups can resume; use -`--gmail-refresh-cache` to force a refetch. Workspace inventories +raw-message fetches and message-list pages use a local cache by default so +interrupted full-mailbox backups can resume; progress is written to stderr +while stdout stays parseable. Use `--gmail-refresh-cache` to force a refetch. 
+Workspace inventories Docs/Sheets/Slides and backs up Forms/responses discovered through Drive; add `--workspace-native` for full native Docs/Sheets/Slides API JSON. Optional Workspace-only services use `--best-effort` by default, recording diff --git a/docs/backup.md b/docs/backup.md index 66cf86d..3523ee2 100644 --- a/docs/backup.md +++ b/docs/backup.md @@ -259,13 +259,15 @@ Raw message payloads stay base64url encoded inside encrypted JSONL. This preserves the RFC 2822 message content while keeping the shard format text friendly. -By default, each fetched raw message is also cached locally under the OS user -cache directory (`gogcli/backup/gmail/<account-hash>/raw-v1/`). The cache stores -the same raw message row that will be encrypted into shards and is keyed by a -SHA-256 of the Gmail message ID, so rerunning after an interruption can reuse -already fetched messages. `--gmail-refresh-cache` forces a refetch. The cache is -plaintext local data; clear it if the machine should not retain local mail -copies outside the encrypted backup/export locations. +By default, Gmail backup state is cached locally under the OS user cache +directory (`gogcli/backup/gmail/<account-hash>/`). Message-list page checkpoints +live under `list-v1/`, and fetched raw messages live under `raw-v1/`. Raw-message +cache files store the same row that will be encrypted into shards and are keyed +by a SHA-256 of the Gmail message ID, so rerunning after an interruption can +reuse already fetched messages. Long Gmail runs report list/fetch counters to +stderr while stdout stays parseable. `--gmail-refresh-cache` forces a refetch. +The cache is plaintext local data; clear it if the machine should not retain +local mail copies outside the encrypted backup/export locations. `--include-spam-trash` defaults to true. Use `--query` and `--max` for bounded test exports; omit them for a full mailbox scan. 
diff --git a/internal/cmd/backup_gmail.go b/internal/cmd/backup_gmail.go index c62c148..aaed69b 100644 --- a/internal/cmd/backup_gmail.go +++ b/internal/cmd/backup_gmail.go @@ -16,6 +16,7 @@ import ( "google.golang.org/api/gmail/v1" "github.com/steipete/gogcli/internal/backup" + "github.com/steipete/gogcli/internal/ui" ) type gmailBackupOptions struct { @@ -50,6 +51,18 @@ type gmailBackupLabel struct { ThreadsUnread int64 `json:"threadsUnread,omitempty"` } +type gmailBackupListState struct { + Version int `json:"version"` + AccountHash string `json:"accountHash"` + Query string `json:"query,omitempty"` + Max int64 `json:"max,omitempty"` + IncludeSpamTrash bool `json:"includeSpamTrash"` + PageToken string `json:"pageToken,omitempty"` + IDs []string `json:"ids"` + Complete bool `json:"complete"` + Updated time.Time `json:"updated"` +} + func buildGmailBackupSnapshot(ctx context.Context, flags *RootFlags, opts gmailBackupOptions) (backup.Snapshot, error) { if opts.ShardMaxRows <= 0 { opts.ShardMaxRows = 1000 @@ -125,11 +138,13 @@ func fetchGmailBackupMessages(ctx context.Context, svc *gmail.Service, opts gmai if err != nil { return nil, err } + gmailBackupProgressf(ctx, "backup gmail fetch\tqueued=%d", len(ids)) const maxConcurrency = 8 sem := make(chan struct{}, maxConcurrency) type result struct { index int msg gmailBackupMessage + cache bool err error } results := make(chan result, len(ids)) @@ -152,7 +167,7 @@ func fetchGmailBackupMessages(ctx context.Context, svc *gmail.Service, opts gmai return } if ok { - results <- result{index: index, msg: msg} + results <- result{index: index, msg: msg, cache: true} return } } @@ -193,11 +208,23 @@ func fetchGmailBackupMessages(ctx context.Context, svc *gmail.Service, opts gmai }() ordered := make([]gmailBackupMessage, len(ids)) var firstErr error + done := 0 + cacheHits := 0 + fetched := 0 for res := range results { if res.err != nil && firstErr == nil { firstErr = res.err } ordered[res.index] = res.msg + done++ + if 
res.cache { + cacheHits++ + } else if res.err == nil { + fetched++ + } + if done == len(ids) || done%100 == 0 { + gmailBackupProgressf(ctx, "backup gmail fetch\t%d/%d\tfetched=%d\tcache=%d", done, len(ids), fetched, cacheHits) + } } if firstErr != nil { return nil, firstErr @@ -288,6 +315,23 @@ func gmailBackupMessageCachePath(accountHash, messageID string) (string, bool) { func listGmailBackupMessageIDs(ctx context.Context, svc *gmail.Service, opts gmailBackupOptions) ([]string, error) { var ids []string pageToken := "" + statePath, hasStatePath := gmailBackupListStatePath(opts) + if opts.CacheMessages && !opts.RefreshCache && hasStatePath { + state, ok, err := readGmailBackupListState(statePath) + if err != nil { + return nil, err + } + if ok { + if state.Complete { + gmailBackupProgressf(ctx, "backup gmail list\tresume=complete\tmessages=%d", len(state.IDs)) + return append([]string(nil), state.IDs...), nil + } + ids = append(ids, state.IDs...) + pageToken = state.PageToken + gmailBackupProgressf(ctx, "backup gmail list\tresume=partial\tmessages=%d", len(ids)) + } + } + gmailBackupProgressf(ctx, "backup gmail list\tstart\tmessages=%d", len(ids)) for { maxResults := int64(500) if opts.Max > 0 { @@ -319,14 +363,123 @@ func listGmailBackupMessageIDs(ctx context.Context, svc *gmail.Service, opts gma ids = append(ids, message.Id) } } - if resp.NextPageToken == "" { + gmailBackupProgressf(ctx, "backup gmail list\tmessages=%d", len(ids)) + complete := resp.NextPageToken == "" || (opts.Max > 0 && int64(len(ids)) >= opts.Max) + if complete { + if opts.CacheMessages && hasStatePath { + if err := writeGmailBackupListState(statePath, opts, ids, "", true); err != nil { + return nil, err + } + } break } pageToken = resp.NextPageToken + if opts.CacheMessages && hasStatePath { + if err := writeGmailBackupListState(statePath, opts, ids, pageToken, false); err != nil { + return nil, err + } + } } return ids, nil } +func readGmailBackupListState(path string) (gmailBackupListState, 
bool, error) { + data, err := os.ReadFile(path) //nolint:gosec // path is derived from the OS cache dir and query hash. + if err != nil { + if os.IsNotExist(err) { + return gmailBackupListState{}, false, nil + } + return gmailBackupListState{}, false, fmt.Errorf("read gmail backup list state %s: %w", path, err) + } + var state gmailBackupListState + if err := json.Unmarshal(data, &state); err != nil { + return gmailBackupListState{}, false, fmt.Errorf("decode gmail backup list state %s: %w", path, err) + } + if state.Version != 1 { + return gmailBackupListState{}, false, nil + } + return state, true, nil +} + +func writeGmailBackupListState(path string, opts gmailBackupOptions, ids []string, pageToken string, complete bool) error { + if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil { + return fmt.Errorf("create gmail backup list state dir: %w", err) + } + state := gmailBackupListState{ + Version: 1, + AccountHash: opts.AccountHash, + Query: strings.TrimSpace(opts.Query), + Max: opts.Max, + IncludeSpamTrash: opts.IncludeSpamTrash, + PageToken: pageToken, + IDs: append([]string(nil), ids...), + Complete: complete, + Updated: time.Now().UTC(), + } + data, err := json.Marshal(state) + if err != nil { + return fmt.Errorf("encode gmail backup list state: %w", err) + } + tmp, err := os.CreateTemp(filepath.Dir(path), ".list-*.json") + if err != nil { + return fmt.Errorf("create gmail backup list state temp: %w", err) + } + tmpPath := tmp.Name() + if _, err := tmp.Write(data); err != nil { + _ = tmp.Close() + _ = os.Remove(tmpPath) + return fmt.Errorf("write gmail backup list state temp: %w", err) + } + if err := tmp.Close(); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("close gmail backup list state temp: %w", err) + } + if err := os.Chmod(tmpPath, 0o600); err != nil { + _ = os.Remove(tmpPath) + return fmt.Errorf("chmod gmail backup list state temp: %w", err) + } + if err := os.Rename(tmpPath, path); err != nil { + _ = os.Remove(tmpPath) + return 
fmt.Errorf("replace gmail backup list state %s: %w", path, err) + } + return nil +} + +func gmailBackupListStatePath(opts gmailBackupOptions) (string, bool) { + accountHash := strings.TrimSpace(opts.AccountHash) + if accountHash == "" { + return "", false + } + dir, err := os.UserCacheDir() + if err != nil || strings.TrimSpace(dir) == "" { + return "", false + } + key := struct { + Query string `json:"query,omitempty"` + Max int64 `json:"max,omitempty"` + IncludeSpamTrash bool `json:"includeSpamTrash"` + }{ + Query: strings.TrimSpace(opts.Query), + Max: opts.Max, + IncludeSpamTrash: opts.IncludeSpamTrash, + } + data, err := json.Marshal(key) + if err != nil { + return "", false + } + sum := sha256.Sum256(data) + name := hex.EncodeToString(sum[:]) + ".json" + return filepath.Join(dir, "gogcli", "backup", "gmail", accountHash, "list-v1", name), true +} + +func gmailBackupProgressf(ctx context.Context, format string, args ...any) { + u := ui.FromContext(ctx) + if u == nil { + return + } + u.Err().Printf(format, args...) 
+} + func buildGmailMessageShards(accountHash string, messages []gmailBackupMessage, shardMaxRows int) ([]backup.PlainShard, error) { if shardMaxRows <= 0 { shardMaxRows = 1000 diff --git a/internal/cmd/backup_test.go b/internal/cmd/backup_test.go index d1c09f6..c2a6a94 100644 --- a/internal/cmd/backup_test.go +++ b/internal/cmd/backup_test.go @@ -1,9 +1,11 @@ package cmd import ( + "bytes" "context" "encoding/base64" "encoding/json" + "io" "net/http" "net/http/httptest" "os" @@ -16,6 +18,7 @@ import ( "google.golang.org/api/option" "github.com/steipete/gogcli/internal/backup" + "github.com/steipete/gogcli/internal/ui" ) func TestBackupAccountHashStableAndOpaque(t *testing.T) { @@ -174,6 +177,130 @@ func TestGmailBackupMessageCacheRejectsWrongID(t *testing.T) { } } +func TestListGmailBackupMessageIDsResumesFromCheckpoint(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + opts := gmailBackupOptions{ + AccountHash: "accthash", + IncludeSpamTrash: true, + CacheMessages: true, + } + path, ok := gmailBackupListStatePath(opts) + if !ok { + t.Fatal("expected list state path") + } + if err := writeGmailBackupListState(path, opts, []string{"m1"}, "p2", false); err != nil { + t.Fatalf("writeGmailBackupListState: %v", err) + } + + requests := 0 + svc, cleanup := newGmailServiceForTest(t, func(w http.ResponseWriter, r *http.Request) { + requests++ + if got := r.URL.Query().Get("pageToken"); got != "p2" { + t.Fatalf("pageToken = %q, want p2", got) + } + _ = json.NewEncoder(w).Encode(map[string]any{ + "messages": []map[string]string{{"id": "m2"}}, + }) + }) + defer cleanup() + + var stderr bytes.Buffer + u, err := ui.New(ui.Options{Stdout: io.Discard, Stderr: &stderr, Color: "never"}) + if err != nil { + t.Fatalf("ui.New: %v", err) + } + ids, err := listGmailBackupMessageIDs(ui.WithUI(context.Background(), u), svc, opts) + if err != nil { + t.Fatalf("listGmailBackupMessageIDs: %v", err) + } + if strings.Join(ids, ",") != "m1,m2" { + t.Fatalf("ids = %v, want [m1 m2]", ids) + } + 
if requests != 1 { + t.Fatalf("requests = %d, want 1", requests) + } + if !strings.Contains(stderr.String(), "resume=partial") || !strings.Contains(stderr.String(), "messages=2") { + t.Fatalf("stderr missing progress: %s", stderr.String()) + } + state, ok, err := readGmailBackupListState(path) + if err != nil { + t.Fatalf("readGmailBackupListState: %v", err) + } + if !ok || !state.Complete || strings.Join(state.IDs, ",") != "m1,m2" { + t.Fatalf("state = %#v ok=%t", state, ok) + } +} + +func TestListGmailBackupMessageIDsReusesCompleteCheckpoint(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + opts := gmailBackupOptions{ + AccountHash: "accthash", + IncludeSpamTrash: true, + CacheMessages: true, + } + path, ok := gmailBackupListStatePath(opts) + if !ok { + t.Fatal("expected list state path") + } + if err := writeGmailBackupListState(path, opts, []string{"m1", "m2"}, "", true); err != nil { + t.Fatalf("writeGmailBackupListState: %v", err) + } + + requests := 0 + svc, cleanup := newGmailServiceForTest(t, func(w http.ResponseWriter, r *http.Request) { + requests++ + http.NotFound(w, r) + }) + defer cleanup() + + ids, err := listGmailBackupMessageIDs(context.Background(), svc, opts) + if err != nil { + t.Fatalf("listGmailBackupMessageIDs: %v", err) + } + if strings.Join(ids, ",") != "m1,m2" { + t.Fatalf("ids = %v, want [m1 m2]", ids) + } + if requests != 0 { + t.Fatalf("requests = %d, want 0", requests) + } +} + +func TestListGmailBackupMessageIDsMarksMaxLimitedRunComplete(t *testing.T) { + t.Setenv("HOME", t.TempDir()) + opts := gmailBackupOptions{ + AccountHash: "accthash", + Max: 1, + IncludeSpamTrash: true, + CacheMessages: true, + } + svc, cleanup := newGmailServiceForTest(t, func(w http.ResponseWriter, r *http.Request) { + _ = json.NewEncoder(w).Encode(map[string]any{ + "messages": []map[string]string{{"id": "m1"}}, + "nextPageToken": "p2", + }) + }) + defer cleanup() + + ids, err := listGmailBackupMessageIDs(context.Background(), svc, opts) + if err != nil { + 
t.Fatalf("listGmailBackupMessageIDs: %v", err) + } + if strings.Join(ids, ",") != "m1" { + t.Fatalf("ids = %v, want [m1]", ids) + } + path, ok := gmailBackupListStatePath(opts) + if !ok { + t.Fatal("expected list state path") + } + state, ok, err := readGmailBackupListState(path) + if err != nil { + t.Fatalf("readGmailBackupListState: %v", err) + } + if !ok || !state.Complete || state.PageToken != "" { + t.Fatalf("state = %#v ok=%t", state, ok) + } +} + func TestFetchBackupDriveCollaborationCollectsMetadataAndErrors(t *testing.T) { srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { w.Header().Set("Content-Type", "application/json")