diff --git a/CHANGELOG.md b/CHANGELOG.md index cc87b23..3642cc5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ - Backup: add Gmail message-list checkpoints, streaming shard construction, and stderr progress counters so full-mailbox backups can resume cleanly after interruption without keeping every raw message in RAM. - Backup: push encrypted incomplete Gmail checkpoint commits during long cached fetches so day-scale mailbox backups have offsite progress before the final manifest is committed. - Backup: push Gmail checkpoint commits through a single ordered background queue so cached fetches continue while GitHub uploads run. +- Backup: add `gog backup export --gmail-format markdown` for local readable Gmail mirrors with Markdown notes and extracted attachment files. - Calendar: add `--start-timezone` / `--end-timezone` to `calendar create` and `calendar update` for preserving named IANA event timezones when RFC3339 inputs only carry numeric offsets. (#422) - Drive: add `drive search --drive` and `--parent` for scoping search to a shared drive or folder. (#525) — thanks @LeanSheng. - Docs: add experimental `docs export --tab` / `drive download --tab` to export a single Google Docs tab as PDF, DOCX, text, Markdown, or HTML. (#535) — thanks @johnbenjaminlewis. diff --git a/README.md b/README.md index 676c8d2..9de72c2 100644 --- a/README.md +++ b/README.md @@ -749,6 +749,7 @@ gog backup status gog backup verify gog backup cat data/gmail//labels.jsonl.gz.age --pretty gog backup export --out ~/Documents/gog-backup-export +gog backup export --no-pull --out ~/Library/CloudStorage/Dropbox/backup/gog --gmail-format markdown ``` For a bounded first run: @@ -789,12 +790,16 @@ Optional Workspace-only services use `--best-effort` by default, recording permission/auth errors as encrypted error shards instead of stopping the run. Use `gog backup cat` to decrypt one shard as JSONL, or `gog backup export` to -write a local plaintext copy. 
The export writes Gmail messages as `.eml` files, -plus `gmail//messages/index.jsonl` and pretty `labels.json`. -Drive contents export as normal files under `drive//files/` with -an `index.jsonl`; other services export as verified JSONL under `raw/`. -That export is intentionally unencrypted; keep it out of Git, shared folders, -and cloud sync unless that is intentional. +write a local plaintext copy. By default Gmail messages export as `.eml` files. +Use `--gmail-format markdown` for a readable mirror with `message.md` files and +extracted `attachments/` folders, or `--gmail-format both` to keep Markdown and +`.eml` side by side. `--gmail-attachments none` keeps Markdown notes without +writing attachment files. Drive contents export as normal files under +`drive//files/` with an `index.jsonl`; other services export as +verified JSONL under `raw/`. That export is intentionally unencrypted; keep it +out of Git, shared folders, and cloud sync unless that is intentional. +Use `--no-pull` when exporting from a local backup repository that another +process is already updating. `manifest.json` is intentionally cleartext for cheap status and verification. It exposes metadata: export time, service names, account hashes, shard paths, diff --git a/docs/backup.md b/docs/backup.md index 7e25530..e200986 100644 --- a/docs/backup.md +++ b/docs/backup.md @@ -64,6 +64,7 @@ Write an unencrypted local copy for easy reading on the Mac: ```bash gog backup export --out ~/Documents/gog-backup-export +gog backup export --no-pull --out ~/Library/CloudStorage/Dropbox/backup/gog --gmail-format markdown ``` Use `--no-push` on `init` or `push` to commit locally without pushing to the @@ -166,17 +167,24 @@ manifest.json gmail//labels.json gmail//messages/index.jsonl gmail//messages/YYYY/MM/-.eml +gmail//messages/YYYY/MM/--/message.md +gmail//messages/YYYY/MM/--/attachments/ drive//files/index.jsonl drive//files// raw//... 
``` `gog backup export` decrypts and verifies the manifest-backed shards before -writing files. Gmail messages become `.eml` files that open in Mail and other -mail clients. Drive content shards become normal files plus an index. Other +writing files. Gmail messages become `.eml` files by default. Use +`--gmail-format markdown` for `message.md` files with YAML metadata and +extracted `attachments/` folders, or `--gmail-format both` to write Markdown and +`.eml` side by side. `--gmail-attachments none` keeps Markdown notes but skips +attachment files. Drive content shards become normal files plus an index. Other services are written as verified JSONL under `raw/`. The export is not encrypted; do not place it inside the backup Git repository, and keep it out of synced/shared folders unless that is intentional. +Use `--no-pull` when exporting from a local backup repository that another +process is already updating. ## Encryption diff --git a/internal/backup/config.go b/internal/backup/config.go index 99ce02c..7bcfa94 100644 --- a/internal/backup/config.go +++ b/internal/backup/config.go @@ -28,6 +28,7 @@ type Options struct { Identity string Recipients []string Push bool + SkipPull bool AsyncPush bool PushQueueLimit int Progress func(format string, args ...any) diff --git a/internal/backup/read.go b/internal/backup/read.go index b1a0f3a..0253ac7 100644 --- a/internal/backup/read.go +++ b/internal/backup/read.go @@ -14,8 +14,13 @@ func Cat(ctx context.Context, opts Options, shardPath string) (PlainShard, error if err != nil { return PlainShard{}, err } - if repoErr := ensureRepo(ctx, cfg); repoErr != nil { - return PlainShard{}, repoErr + if !opts.SkipPull { + repoErr := ensureRepo(ctx, cfg) + if repoErr != nil { + return PlainShard{}, repoErr + } + } else if strings.TrimSpace(cfg.Repo) == "" { + return PlainShard{}, fmt.Errorf("backup repo path is required") } manifest, err := readManifest(cfg.Repo) if err != nil { @@ -36,8 +41,13 @@ func DecryptSnapshot(ctx 
context.Context, opts Options) (Manifest, string, []Pla if err != nil { return Manifest{}, "", nil, err } - if repoErr := ensureRepo(ctx, cfg); repoErr != nil { - return Manifest{}, "", nil, repoErr + if !opts.SkipPull { + repoErr := ensureRepo(ctx, cfg) + if repoErr != nil { + return Manifest{}, "", nil, repoErr + } + } else if strings.TrimSpace(cfg.Repo) == "" { + return Manifest{}, "", nil, fmt.Errorf("backup repo path is required") } manifest, err := readManifest(cfg.Repo) if err != nil { diff --git a/internal/cmd/backup.go b/internal/cmd/backup.go index 87efbab..613f19c 100644 --- a/internal/cmd/backup.go +++ b/internal/cmd/backup.go @@ -71,6 +71,7 @@ type backupReadFlags struct { Repo string `name:"repo" help:"Local backup repository path"` Remote string `name:"remote" help:"Backup Git remote URL"` Identity string `name:"identity" help:"Local age identity path"` + NoPull bool `name:"no-pull" help:"Use local backup repository state without pulling first"` } func (f backupReadFlags) options() backup.Options { @@ -80,6 +81,7 @@ func (f backupReadFlags) options() backup.Options { Remote: f.Remote, Identity: f.Identity, Push: false, + SkipPull: f.NoPull, } } diff --git a/internal/cmd/backup_export.go b/internal/cmd/backup_export.go index 98c0afb..dd3c4ab 100644 --- a/internal/cmd/backup_export.go +++ b/internal/cmd/backup_export.go @@ -53,7 +53,9 @@ func (c *BackupCatCmd) Run(ctx context.Context) error { type BackupExportCmd struct { backupReadFlags - Out string `name:"out" help:"Plaintext export directory" default:"~/Documents/gog-backup-export"` + Out string `name:"out" help:"Plaintext export directory" default:"~/Documents/gog-backup-export"` + GmailFormat string `name:"gmail-format" help:"Gmail message export format: eml, markdown, or both" default:"eml" enum:"eml,markdown,both"` + GmailAttachments string `name:"gmail-attachments" help:"Gmail attachment export mode for markdown/both: extract or none" default:"extract" enum:"extract,none"` } type 
backupExportResult struct { @@ -64,14 +66,9 @@ type backupExportResult struct { Counts map[string]int `json:"counts"` } -type gmailExportIndexEntry struct { - ID string `json:"id"` - ThreadID string `json:"threadId,omitempty"` - HistoryID string `json:"historyId,omitempty"` - InternalDate int64 `json:"internalDate,omitempty"` - LabelIDs []string `json:"labelIds,omitempty"` - SizeEstimate int64 `json:"sizeEstimate,omitempty"` - EML string `json:"eml"` +type backupExportOptions struct { + GmailFormat string + GmailAttachments string } func (c *BackupExportCmd) Run(ctx context.Context) error { @@ -101,11 +98,15 @@ func (c *BackupExportCmd) Run(ctx context.Context) error { if manifestErr := writeJSONFile(filepath.Join(outDir, "manifest.json"), manifest); manifestErr != nil { return manifestErr } - if resetErr := resetExportIndexes(outDir, shards); resetErr != nil { + exportOpts := backupExportOptions{ + GmailFormat: c.GmailFormat, + GmailAttachments: c.GmailAttachments, + } + if resetErr := resetExportTargets(outDir, shards); resetErr != nil { return resetErr } for _, shard := range shards { - _, count, shardErr := exportPlainShard(outDir, shard) + _, count, shardErr := exportPlainShard(outDir, shard, exportOpts) if shardErr != nil { return shardErr } @@ -205,24 +206,24 @@ func ensureExportOutsideRepo(outDir, repo string) error { return nil } -func resetExportIndexes(outDir string, shards []backup.PlainShard) error { +func resetExportTargets(outDir string, shards []backup.PlainShard) error { seen := map[string]struct{}{} for _, shard := range shards { - index := "" + target := "" switch { case shard.Service == backupServiceGmail && shard.Kind == "messages": - index = filepath.Join(outDir, backupServiceGmail, sanitizeFilePart(shard.Account), "messages", "index.jsonl") + target = filepath.Join(outDir, backupServiceGmail, sanitizeFilePart(shard.Account), "messages") case shard.Service == backupServiceDrive && shard.Kind == "contents": - index = filepath.Join(outDir, 
backupServiceDrive, sanitizeFilePart(shard.Account), "files", "index.jsonl") + target = filepath.Join(outDir, backupServiceDrive, sanitizeFilePart(shard.Account), "files", "index.jsonl") } - if index == "" { + if target == "" { continue } - if _, ok := seen[index]; ok { + if _, ok := seen[target]; ok { continue } - seen[index] = struct{}{} - if err := os.Remove(index); err != nil && !os.IsNotExist(err) { + seen[target] = struct{}{} + if err := os.RemoveAll(target); err != nil && !os.IsNotExist(err) { return err } } @@ -235,9 +236,10 @@ func writeBackupExportReadme(outDir string) error { "This directory is an unencrypted local copy created by `gog backup export`.\n" + "Keep it out of Git, shared folders, and cloud sync unless that is intentional.\n" + "\n" + - "Gmail messages are written as `.eml` files that can be opened by Mail and many\n" + - "mail clients. `gmail//messages/index.jsonl` maps backup message IDs\n" + - "to the exported `.eml` files. Labels are written as pretty JSON.\n" + "Gmail messages are written according to `--gmail-format`: `.eml` by default,\n" + + "Markdown notes with extracted attachment files when `--gmail-format markdown`,\n" + + "or both when `--gmail-format both`. `gmail//messages/index.jsonl`\n" + + "maps backup message IDs to exported files. 
Labels are written as pretty JSON.\n" return os.WriteFile(filepath.Join(outDir, "README.md"), []byte(body), 0o600) } @@ -253,14 +255,14 @@ func writeJSONFile(path string, value any) error { return os.WriteFile(path, data, 0o600) } -func exportPlainShard(outDir string, shard backup.PlainShard) (int, int, error) { +func exportPlainShard(outDir string, shard backup.PlainShard, opts backupExportOptions) (int, int, error) { switch { case shard.Service == backupServiceDrive && shard.Kind == "contents": return exportDriveContents(outDir, shard) case shard.Service == backupServiceGmail && shard.Kind == "labels": return exportGmailLabels(outDir, shard) case shard.Service == backupServiceGmail && shard.Kind == "messages": - return exportGmailMessages(outDir, shard) + return exportGmailMessages(outDir, shard, opts) default: return exportRawShard(outDir, shard) } @@ -321,65 +323,6 @@ func exportDriveContents(outDir string, shard backup.PlainShard) (int, int, erro return files + 1, len(rows), nil } -func exportGmailLabels(outDir string, shard backup.PlainShard) (int, int, error) { - var labels []gmailBackupLabel - if err := backup.DecodeJSONL(shard.Plaintext, &labels); err != nil { - return 0, 0, err - } - path := filepath.Join(outDir, backupServiceGmail, sanitizeFilePart(shard.Account), "labels.json") - if err := writeJSONFile(path, labels); err != nil { - return 0, 0, err - } - return 1, len(labels), nil -} - -func exportGmailMessages(outDir string, shard backup.PlainShard) (int, int, error) { - var messages []gmailBackupMessage - if err := backup.DecodeJSONL(shard.Plaintext, &messages); err != nil { - return 0, 0, err - } - account := sanitizeFilePart(shard.Account) - indexPath := filepath.Join(outDir, backupServiceGmail, account, "messages", "index.jsonl") - if err := os.MkdirAll(filepath.Dir(indexPath), 0o700); err != nil { - return 0, 0, err - } - indexFile, err := os.OpenFile(indexPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600) // #nosec G304 -- path is confined to 
caller-selected export dir and sanitized account. - if err != nil { - return 0, 0, err - } - defer indexFile.Close() - enc := json.NewEncoder(indexFile) - enc.SetEscapeHTML(false) - files := 0 - for _, message := range messages { - mime, err := decodeGmailRaw(message.Raw) - if err != nil { - return files, 0, fmt.Errorf("decode Gmail raw %s: %w", message.ID, err) - } - rel := backupExportMessagePath(account, message) - path := filepath.Join(outDir, filepath.FromSlash(rel)) - if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil { - return files, 0, err - } - if err := os.WriteFile(path, mime, 0o600); err != nil { - return files, 0, err - } - files++ - if err := enc.Encode(gmailExportIndexEntry{ - ID: message.ID, - ThreadID: message.ThreadID, - HistoryID: message.HistoryID, - InternalDate: message.InternalDate, - LabelIDs: message.LabelIDs, - SizeEstimate: message.SizeEstimate, - EML: rel, - }); err != nil { - return files, 0, err - } - } - return files + 1, len(messages), nil -} - func exportRawShard(outDir string, shard backup.PlainShard) (int, int, error) { rel := strings.TrimSuffix(shard.Path, ".gz.age") path := filepath.Join(outDir, "raw", filepath.FromSlash(rel)) @@ -406,29 +349,6 @@ func countExportFiles(outDir string) (int, error) { return count, err } -func decodeGmailRaw(raw string) ([]byte, error) { - raw = strings.TrimSpace(raw) - if raw == "" { - return nil, fmt.Errorf("empty raw payload") - } - if data, err := base64.RawURLEncoding.DecodeString(raw); err == nil { - return data, nil - } - return base64.URLEncoding.DecodeString(raw) -} - -func backupExportMessagePath(account string, message gmailBackupMessage) string { - timestamp := trackingUnknown - yearMonth := trackingUnknown - if message.InternalDate > 0 { - t := time.UnixMilli(message.InternalDate).UTC() - timestamp = t.Format("20060102T150405Z") - yearMonth = filepath.Join(fmt.Sprintf("%04d", t.Year()), fmt.Sprintf("%02d", int(t.Month()))) - } - name := timestamp + "-" + 
sanitizeFilePart(message.ID) + ".eml" - return filepath.ToSlash(filepath.Join(backupServiceGmail, account, "messages", yearMonth, name)) -} - func sanitizeFilePart(value string) string { value = strings.TrimSpace(value) if value == "" { diff --git a/internal/cmd/backup_export_gmail.go b/internal/cmd/backup_export_gmail.go new file mode 100644 index 0000000..12554ca --- /dev/null +++ b/internal/cmd/backup_export_gmail.go @@ -0,0 +1,500 @@ +package cmd + +import ( + "bytes" + "encoding/base64" + "encoding/json" + "fmt" + stdhtml "html" + "io" + "mime" + "mime/multipart" + "net/mail" + "os" + "path/filepath" + "strings" + "time" + + "github.com/steipete/gogcli/internal/backup" +) + +type gmailExportIndexEntry struct { + ID string `json:"id"` + ThreadID string `json:"threadId,omitempty"` + HistoryID string `json:"historyId,omitempty"` + InternalDate int64 `json:"internalDate,omitempty"` + LabelIDs []string `json:"labelIds,omitempty"` + SizeEstimate int64 `json:"sizeEstimate,omitempty"` + Subject string `json:"subject,omitempty"` + From string `json:"from,omitempty"` + To []string `json:"to,omitempty"` + Cc []string `json:"cc,omitempty"` + Date string `json:"date,omitempty"` + EML string `json:"eml,omitempty"` + Markdown string `json:"markdown,omitempty"` + Attachments []string `json:"attachments,omitempty"` +} + +type backupEmail struct { + Subject string + From string + To []string + Cc []string + Date string + TextBody string + HTMLBody string + Attachments []backupEmailAttachment +} + +type backupEmailAttachment struct { + Filename string + Data []byte +} + +func exportGmailLabels(outDir string, shard backup.PlainShard) (int, int, error) { + var labels []gmailBackupLabel + if err := backup.DecodeJSONL(shard.Plaintext, &labels); err != nil { + return 0, 0, err + } + path := filepath.Join(outDir, backupServiceGmail, sanitizeFilePart(shard.Account), "labels.json") + if err := writeJSONFile(path, labels); err != nil { + return 0, 0, err + } + return 1, len(labels), nil 
+} + +func exportGmailMessages(outDir string, shard backup.PlainShard, opts backupExportOptions) (int, int, error) { + var messages []gmailBackupMessage + if err := backup.DecodeJSONL(shard.Plaintext, &messages); err != nil { + return 0, 0, err + } + gmailFormat := strings.ToLower(strings.TrimSpace(opts.GmailFormat)) + if gmailFormat == "" { + gmailFormat = "eml" + } + attachmentsMode := strings.ToLower(strings.TrimSpace(opts.GmailAttachments)) + if attachmentsMode == "" { + attachmentsMode = "extract" + } + account := sanitizeFilePart(shard.Account) + indexPath := filepath.Join(outDir, backupServiceGmail, account, "messages", "index.jsonl") + if err := os.MkdirAll(filepath.Dir(indexPath), 0o700); err != nil { + return 0, 0, err + } + indexFile, err := os.OpenFile(indexPath, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0o600) // #nosec G304 -- path is confined to caller-selected export dir and sanitized account. + if err != nil { + return 0, 0, err + } + defer indexFile.Close() + enc := json.NewEncoder(indexFile) + enc.SetEscapeHTML(false) + files := 0 + for _, message := range messages { + rawMIME, err := decodeGmailRaw(message.Raw) + if err != nil { + return files, 0, fmt.Errorf("decode Gmail raw %s: %w", message.ID, err) + } + parsed, parseErr := parseBackupEmail(rawMIME) + if parseErr != nil && gmailFormat != "eml" { + return files, 0, fmt.Errorf("parse Gmail MIME %s: %w", message.ID, parseErr) + } + entry := gmailExportIndexEntry{ + ID: message.ID, + ThreadID: message.ThreadID, + HistoryID: message.HistoryID, + InternalDate: message.InternalDate, + LabelIDs: message.LabelIDs, + SizeEstimate: message.SizeEstimate, + Subject: parsed.Subject, + From: parsed.From, + To: parsed.To, + Cc: parsed.Cc, + Date: parsed.Date, + } + if gmailFormat == "eml" || gmailFormat == "both" { + rel := backupExportMessageEMLPath(account, message) + path := filepath.Join(outDir, filepath.FromSlash(rel)) + if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil { + return files, 0, err + 
} + if err := os.WriteFile(path, rawMIME, 0o600); err != nil { + return files, 0, err + } + files++ + entry.EML = rel + } + if gmailFormat == "markdown" || gmailFormat == "both" { + rel, attachmentRels, written, err := exportGmailMarkdownMessage(outDir, account, message, parsed, attachmentsMode == "extract") + if err != nil { + return files, 0, err + } + files += written + entry.Markdown = rel + entry.Attachments = attachmentRels + } + if err := enc.Encode(entry); err != nil { + return files, 0, err + } + } + return files + 1, len(messages), nil +} + +func decodeGmailRaw(raw string) ([]byte, error) { + raw = strings.TrimSpace(raw) + if raw == "" { + return nil, fmt.Errorf("empty raw payload") + } + if data, err := base64.RawURLEncoding.DecodeString(raw); err == nil { + return data, nil + } + return base64.URLEncoding.DecodeString(raw) +} + +func backupExportMessageEMLPath(account string, message gmailBackupMessage) string { + timestamp := trackingUnknown + yearMonth := trackingUnknown + if message.InternalDate > 0 { + t := time.UnixMilli(message.InternalDate).UTC() + timestamp = t.Format("20060102T150405Z") + yearMonth = filepath.Join(fmt.Sprintf("%04d", t.Year()), fmt.Sprintf("%02d", int(t.Month()))) + } + name := timestamp + "-" + sanitizeFilePart(message.ID) + ".eml" + return filepath.ToSlash(filepath.Join(backupServiceGmail, account, "messages", yearMonth, name)) +} + +func backupExportMessageDir(account string, message gmailBackupMessage, subject string) string { + timestamp := trackingUnknown + yearMonth := trackingUnknown + if message.InternalDate > 0 { + t := time.UnixMilli(message.InternalDate).UTC() + timestamp = t.Format("20060102T150405Z") + yearMonth = filepath.Join(fmt.Sprintf("%04d", t.Year()), fmt.Sprintf("%02d", int(t.Month()))) + } + subjectPart := truncateFilePart(sanitizeFilePart(subject), 72) + if subjectPart == trackingUnknown { + subjectPart = "no-subject" + } + name := timestamp + "-" + subjectPart + "-" + sanitizeFilePart(message.ID) + 
return filepath.ToSlash(filepath.Join(backupServiceGmail, account, "messages", yearMonth, name)) +} + +func exportGmailMarkdownMessage(outDir, account string, message gmailBackupMessage, parsed backupEmail, extractAttachments bool) (string, []string, int, error) { + messageDirRel := backupExportMessageDir(account, message, parsed.Subject) + messageDir := filepath.Join(outDir, filepath.FromSlash(messageDirRel)) + if err := os.MkdirAll(messageDir, 0o700); err != nil { + return "", nil, 0, err + } + var attachmentRels []string + files := 0 + if extractAttachments { + seen := map[string]int{} + for i, attachment := range parsed.Attachments { + filename := sanitizeBackupAttachmentFilename(attachment.Filename, i+1) + filename = uniqueExportFilename(seen, filename) + rel := filepath.ToSlash(filepath.Join(messageDirRel, "attachments", filename)) + path := filepath.Join(outDir, filepath.FromSlash(rel)) + if err := os.MkdirAll(filepath.Dir(path), 0o700); err != nil { + return "", nil, files, err + } + if err := os.WriteFile(path, attachment.Data, 0o600); err != nil { + return "", nil, files, err + } + attachmentRels = append(attachmentRels, rel) + files++ + } + } + body := backupEmailMarkdownBody(parsed) + md := renderGmailMessageMarkdown(message, parsed, body, attachmentRels) + rel := filepath.ToSlash(filepath.Join(messageDirRel, "message.md")) + path := filepath.Join(outDir, filepath.FromSlash(rel)) + if err := os.WriteFile(path, []byte(md), 0o600); err != nil { + return "", nil, files, err + } + files++ + return rel, attachmentRels, files, nil +} + +func backupEmailMarkdownBody(parsed backupEmail) string { + if strings.TrimSpace(parsed.TextBody) != "" { + return backupEmailMarkdownText(parsed.TextBody) + } + if strings.TrimSpace(parsed.HTMLBody) != "" { + return cleanBackupHTMLBody(parsed.HTMLBody) + } + return "" +} + +func backupEmailMarkdownText(value string) string { + value = strings.TrimSpace(value) + if value == "" { + return "" + } + if looksLikeHTML(value) || 
looksLikeHTMLFragment(value) { + return cleanBackupHTMLBody(value) + } + return value +} + +func cleanBackupHTMLBody(value string) string { + cleaned := stdhtml.UnescapeString(stripHTMLTags(value)) + return strings.Join(strings.Fields(cleaned), " ") +} + +func looksLikeHTMLFragment(value string) bool { + trimmed := strings.ToLower(strings.TrimSpace(value)) + if trimmed == "" { + return false + } + for _, marker := range []string{ + " 0 { + writeYAMLScalar(&b, "internal_date", time.UnixMilli(message.InternalDate).UTC().Format(time.RFC3339)) + } + writeYAMLScalar(&b, "date", parsed.Date) + writeYAMLScalar(&b, "from", parsed.From) + writeYAMLList(&b, "to", parsed.To) + writeYAMLList(&b, "cc", parsed.Cc) + writeYAMLScalar(&b, "subject", parsed.Subject) + writeYAMLList(&b, "labels", message.LabelIDs) + if message.SizeEstimate > 0 { + fmt.Fprintf(&b, "size_estimate: %d\n", message.SizeEstimate) + } + writeYAMLList(&b, "attachments", attachmentRels) + b.WriteString("---\n\n") + if strings.TrimSpace(parsed.Subject) != "" { + b.WriteString("# ") + b.WriteString(markdownHeadingText(parsed.Subject)) + b.WriteString("\n\n") + } + if strings.TrimSpace(body) != "" { + b.WriteString(strings.TrimSpace(body)) + b.WriteString("\n") + } else { + b.WriteString("_No text body found._\n") + } + if len(attachmentRels) > 0 { + b.WriteString("\n## Attachments\n\n") + for _, rel := range attachmentRels { + name := filepath.Base(rel) + b.WriteString("- [") + b.WriteString(markdownLinkText(name)) + b.WriteString("](") + b.WriteString("attachments/") + b.WriteString(markdownLinkTarget(name)) + b.WriteString(")\n") + } + } + return b.String() +} + +func parseBackupEmail(rawMIME []byte) (backupEmail, error) { + msg, err := mail.ReadMessage(bytes.NewReader(rawMIME)) + if err != nil { + return backupEmail{}, err + } + out := backupEmail{ + Subject: decodeMIMEHeader(msg.Header.Get("Subject")), + From: decodeMIMEHeader(msg.Header.Get("From")), + Date: decodeMIMEHeader(msg.Header.Get("Date")), + To: 
parseAddressHeader(msg.Header.Get("To")), + Cc: parseAddressHeader(msg.Header.Get("Cc")), + } + body, err := io.ReadAll(msg.Body) + if err != nil { + return backupEmail{}, err + } + if err := parseBackupEmailEntity(body, string(msg.Header.Get("Content-Type")), string(msg.Header.Get("Content-Transfer-Encoding")), &out); err != nil { + return backupEmail{}, err + } + return out, nil +} + +func parseBackupEmailEntity(body []byte, contentType, transferEncoding string, out *backupEmail) error { + mediaType, params, err := mime.ParseMediaType(contentType) + if err != nil || strings.TrimSpace(mediaType) == "" { + mediaType = "text/plain" + } + mediaType = strings.ToLower(mediaType) + if strings.HasPrefix(mediaType, "multipart/") { + boundary := params["boundary"] + if strings.TrimSpace(boundary) == "" { + return nil + } + reader := multipart.NewReader(bytes.NewReader(body), boundary) + for { + part, partErr := reader.NextPart() + if partErr == io.EOF { + break + } + if partErr != nil { + return partErr + } + partBody, readErr := io.ReadAll(part) + _ = part.Close() + if readErr != nil { + return readErr + } + partContentType := part.Header.Get("Content-Type") + partEncoding := part.Header.Get("Content-Transfer-Encoding") + if isBackupEmailAttachment(part.Header.Get("Content-Disposition"), partContentType) { + decoded := decodeTransferEncoding(partBody, partEncoding) + filename := backupAttachmentFilename(part.Header.Get("Content-Disposition"), partContentType) + out.Attachments = append(out.Attachments, backupEmailAttachment{ + Filename: filename, + Data: decoded, + }) + continue + } + if err := parseBackupEmailEntity(partBody, partContentType, partEncoding, out); err != nil { + return err + } + } + return nil + } + decoded := decodeTransferEncoding(body, transferEncoding) + decoded = decodeBodyCharset(decoded, contentType) + switch mediaType { + case "text/plain": + if strings.TrimSpace(out.TextBody) == "" { + out.TextBody = string(decoded) + } + case "text/html": + if 
strings.TrimSpace(out.HTMLBody) == "" { + out.HTMLBody = string(decoded) + } + } + return nil +} + +func isBackupEmailAttachment(contentDisposition, contentType string) bool { + disposition, dispParams, _ := mime.ParseMediaType(contentDisposition) + if strings.EqualFold(disposition, "attachment") { + return true + } + if strings.EqualFold(disposition, "inline") && strings.TrimSpace(dispParams["filename"]) != "" { + return true + } + _, typeParams, _ := mime.ParseMediaType(contentType) + return strings.TrimSpace(typeParams["name"]) != "" +} + +func backupAttachmentFilename(contentDisposition, contentType string) string { + _, dispParams, _ := mime.ParseMediaType(contentDisposition) + if filename := decodeMIMEHeader(dispParams["filename"]); strings.TrimSpace(filename) != "" { + return filename + } + _, typeParams, _ := mime.ParseMediaType(contentType) + if filename := decodeMIMEHeader(typeParams["name"]); strings.TrimSpace(filename) != "" { + return filename + } + return "attachment" +} + +func decodeMIMEHeader(value string) string { + value = strings.TrimSpace(value) + if value == "" { + return "" + } + decoded, err := (&mime.WordDecoder{}).DecodeHeader(value) + if err == nil { + return strings.TrimSpace(decoded) + } + return value +} + +func parseAddressHeader(value string) []string { + value = strings.TrimSpace(value) + if value == "" { + return nil + } + addrs, err := mail.ParseAddressList(value) + if err != nil { + return []string{decodeMIMEHeader(value)} + } + out := make([]string, 0, len(addrs)) + for _, addr := range addrs { + out = append(out, addr.String()) + } + return out +} + +func writeYAMLScalar(b *strings.Builder, key, value string) { + if strings.TrimSpace(value) == "" { + return + } + fmt.Fprintf(b, "%s: %q\n", key, value) +} + +func writeYAMLList(b *strings.Builder, key string, values []string) { + if len(values) == 0 { + return + } + fmt.Fprintf(b, "%s:\n", key) + for _, value := range values { + fmt.Fprintf(b, " - %q\n", value) + } +} + +func 
// uniqueExportFilename returns a file name that has not yet been handed out
// for the directory tracked by seen, and records it there.
//
// The first request for a name returns it unchanged (an empty name becomes
// "attachment"). Later requests get "-2", "-3", ... inserted before the
// extension. Every name returned, including generated ones, is reserved, so a
// later attachment that is literally called "report-2.pdf" cannot overwrite a
// generated "report-2.pdf". Names are compared case-insensitively so the result
// is also collision-free on case-insensitive filesystems.
//
// seen must be a non-nil map owned by the caller and used for one directory.
func uniqueExportFilename(seen map[string]int, filename string) string {
	if filename == "" {
		filename = "attachment"
	}
	key := strings.ToLower(filename)
	if _, taken := seen[key]; !taken {
		seen[key] = 1
		return filename
	}
	ext := filepath.Ext(filename)
	base := strings.TrimSuffix(filename, ext)
	// seen[key] holds the highest suffix tried for this name so far; continue
	// from there and skip any candidate that is already reserved.
	for n := seen[key] + 1; ; n++ {
		candidate := fmt.Sprintf("%s-%d%s", base, n, ext)
		candidateKey := strings.ToLower(candidate)
		if _, taken := seen[candidateKey]; taken {
			continue
		}
		seen[key] = n
		seen[candidateKey] = 1
		return candidate
	}
}
string(got) != string(payload) { + t.Fatalf("raw decoded = %q, want %q", got, payload) + } + + padded := base64.URLEncoding.EncodeToString(payload) + got, err = decodeGmailRaw(padded) + if err != nil { + t.Fatalf("decodeGmailRaw padded: %v", err) + } + if string(got) != string(payload) { + t.Fatalf("padded decoded = %q, want %q", got, payload) + } +} + +func TestExportGmailMessagesWritesReadableEMLAndIndex(t *testing.T) { + outDir := t.TempDir() + payload := []byte("Subject: Hello\r\nFrom: a@example.com\r\n\r\nBody") + message := gmailBackupMessage{ + ID: "msg/one", + ThreadID: "thread-1", + InternalDate: mustUnixMilli(t, "2026-04-02T10:00:00Z"), + LabelIDs: []string{"INBOX"}, + Raw: base64.RawURLEncoding.EncodeToString(payload), + } + shard, err := backup.NewJSONLShard("gmail", "messages", "acct/hash", "data/gmail/acct/messages/2026/04/part-0001.jsonl.gz.age", []gmailBackupMessage{message}) + if err != nil { + t.Fatalf("NewJSONLShard: %v", err) + } + + files, count, err := exportGmailMessages(outDir, shard, backupExportOptions{GmailFormat: "eml"}) + if err != nil { + t.Fatalf("exportGmailMessages: %v", err) + } + if files != 2 || count != 1 { + t.Fatalf("files,count = %d,%d want 2,1", files, count) + } + + emlRel := backupExportMessageEMLPath("acct_hash", message) + eml, err := os.ReadFile(filepath.Join(outDir, filepath.FromSlash(emlRel))) + if err != nil { + t.Fatalf("read eml: %v", err) + } + if string(eml) != string(payload) { + t.Fatalf("eml = %q, want %q", eml, payload) + } + index := readText(t, filepath.Join(outDir, "gmail", "acct_hash", "messages", "index.jsonl")) + if !strings.Contains(index, `"id":"msg/one"`) || !strings.Contains(index, `"eml":"`+emlRel+`"`) { + t.Fatalf("index missing expected fields: %s", index) + } +} + +func TestExportGmailMessagesWritesMarkdownAndAttachments(t *testing.T) { + outDir := t.TempDir() + payload := strings.Join([]string{ + "Subject: Report", + "From: Alice ", + "To: Peter ", + "Date: Thu, 02 Apr 2026 10:00:00 +0000", + 
"MIME-Version: 1.0", + `Content-Type: multipart/mixed; boundary="b1"`, + "", + "--b1", + "Content-Type: text/plain; charset=utf-8", + "", + "Body text.", + "--b1", + "Content-Type: application/pdf", + "Content-Transfer-Encoding: base64", + `Content-Disposition: attachment; filename="report.pdf"`, + "", + base64.StdEncoding.EncodeToString([]byte("pdf bytes")), + "--b1--", + "", + }, "\r\n") + message := gmailBackupMessage{ + ID: "msg/one", + ThreadID: "thread-1", + InternalDate: mustUnixMilli(t, "2026-04-02T10:00:00Z"), + LabelIDs: []string{"INBOX"}, + Raw: base64.RawURLEncoding.EncodeToString([]byte(payload)), + } + shard, err := backup.NewJSONLShard("gmail", "messages", "acct/hash", "data/gmail/acct/messages/2026/04/part-0001.jsonl.gz.age", []gmailBackupMessage{message}) + if err != nil { + t.Fatalf("NewJSONLShard: %v", err) + } + + files, count, err := exportGmailMessages(outDir, shard, backupExportOptions{GmailFormat: "markdown", GmailAttachments: "extract"}) + if err != nil { + t.Fatalf("exportGmailMessages: %v", err) + } + if files != 3 || count != 1 { + t.Fatalf("files,count = %d,%d want 3,1", files, count) + } + messageDir := backupExportMessageDir("acct_hash", message, "Report") + mdRel := filepath.ToSlash(filepath.Join(messageDir, "message.md")) + md := readText(t, filepath.Join(outDir, filepath.FromSlash(mdRel))) + for _, want := range []string{ + `subject: "Report"`, + "# Report", + "Body text.", + "- [report.pdf](attachments/report.pdf)", + } { + if !strings.Contains(md, want) { + t.Fatalf("markdown missing %q:\n%s", want, md) + } + } + attachment := readText(t, filepath.Join(outDir, filepath.FromSlash(filepath.Join(messageDir, "attachments", "report.pdf")))) + if attachment != "pdf bytes" { + t.Fatalf("attachment = %q", attachment) + } + index := readText(t, filepath.Join(outDir, "gmail", "acct_hash", "messages", "index.jsonl")) + if !strings.Contains(index, `"markdown":"`+mdRel+`"`) || + !strings.Contains(index, 
`"attachments":["`+filepath.ToSlash(filepath.Join(messageDir, "attachments", "report.pdf"))+`"]`) || + strings.Contains(index, `"eml"`) { + t.Fatalf("index missing expected markdown-only fields: %s", index) + } +} + +func TestBackupEmailMarkdownBodyCleansHTMLFragments(t *testing.T) { + got := backupEmailMarkdownBody(backupEmail{TextBody: "

Hello Peter

"}) + if got != "Hello Peter" { + t.Fatalf("body = %q, want %q", got, "Hello Peter") + } + + got = backupEmailMarkdownBody(backupEmail{HTMLBody: "

Hi
there

"}) + if got != "Hi there" { + t.Fatalf("html body = %q, want %q", got, "Hi there") + } +} diff --git a/internal/cmd/backup_test.go b/internal/cmd/backup_test.go index f8b3b66..81dca07 100644 --- a/internal/cmd/backup_test.go +++ b/internal/cmd/backup_test.go @@ -37,6 +37,16 @@ func TestBackupAccountHashStableAndOpaque(t *testing.T) { } } +func TestBackupReadFlagsOptionsSkipPull(t *testing.T) { + opts := backupReadFlags{NoPull: true}.options() + if !opts.SkipPull { + t.Fatal("SkipPull = false, want true") + } + if opts.Push { + t.Fatal("Push = true, want false") + } +} + func TestBuildGmailMessageShardsBucketsSortsAndChunks(t *testing.T) { accountHash := "accthash" messages := []gmailBackupMessage{ @@ -632,64 +642,6 @@ func TestDownloadDriveBackupContentHonorsTimeout(t *testing.T) { } } -func TestDecodeGmailRawAcceptsBase64URLVariants(t *testing.T) { - payload := []byte("Subject: Hello\r\n\r\nBody") - raw := base64.RawURLEncoding.EncodeToString(payload) - got, err := decodeGmailRaw(raw) - if err != nil { - t.Fatalf("decodeGmailRaw raw: %v", err) - } - if string(got) != string(payload) { - t.Fatalf("raw decoded = %q, want %q", got, payload) - } - - padded := base64.URLEncoding.EncodeToString(payload) - got, err = decodeGmailRaw(padded) - if err != nil { - t.Fatalf("decodeGmailRaw padded: %v", err) - } - if string(got) != string(payload) { - t.Fatalf("padded decoded = %q, want %q", got, payload) - } -} - -func TestExportGmailMessagesWritesReadableEMLAndIndex(t *testing.T) { - outDir := t.TempDir() - payload := []byte("Subject: Hello\r\nFrom: a@example.com\r\n\r\nBody") - message := gmailBackupMessage{ - ID: "msg/one", - ThreadID: "thread-1", - InternalDate: mustUnixMilli(t, "2026-04-02T10:00:00Z"), - LabelIDs: []string{"INBOX"}, - Raw: base64.RawURLEncoding.EncodeToString(payload), - } - shard, err := backup.NewJSONLShard("gmail", "messages", "acct/hash", "data/gmail/acct/messages/2026/04/part-0001.jsonl.gz.age", []gmailBackupMessage{message}) - if err != nil { - 
t.Fatalf("NewJSONLShard: %v", err) - } - - files, count, err := exportGmailMessages(outDir, shard) - if err != nil { - t.Fatalf("exportGmailMessages: %v", err) - } - if files != 2 || count != 1 { - t.Fatalf("files,count = %d,%d want 2,1", files, count) - } - - emlRel := backupExportMessagePath("acct_hash", message) - eml, err := os.ReadFile(filepath.Join(outDir, filepath.FromSlash(emlRel))) - if err != nil { - t.Fatalf("read eml: %v", err) - } - if string(eml) != string(payload) { - t.Fatalf("eml = %q, want %q", eml, payload) - } - index := readText(t, filepath.Join(outDir, "gmail", "acct_hash", "messages", "index.jsonl")) - if !strings.Contains(index, `"id":"msg/one"`) || !strings.Contains(index, `"eml":"`+emlRel+`"`) { - t.Fatalf("index missing expected fields: %s", index) - } -} - func TestExportDriveContentsWritesReadableFilesAndIndex(t *testing.T) { outDir := t.TempDir() row := driveBackupContent{