diff --git a/CHANGELOG.md b/CHANGELOG.md index a2821a8..45e4038 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ ### Added - Install: publish a GHCR Docker image for release tags, with a non-root runtime image and file-keyring docs for container automation. (#539, #444) — thanks @HuckOps and @rdehuyss. +- Gmail: add `--sanitize-content` (`--safe`) to `gmail get` and `gmail thread get` for agent-oriented sanitized content output without raw Gmail payloads in JSON. (#238, #220) — thanks @urasmutlu. - Agent safety: add baked safety-profile builds for fail-closed agent binaries, with `agent-safe`, `readonly`, and `full` profiles, filtered help/schema output, docs, and build tooling. (#366, #239) — thanks @drewburchfield. - Calendar: add `--with-meet` to `calendar update` for adding Google Meet conferencing to existing events. (#538) — thanks @alexisperumal. - Calendar: add `calendar move` / `calendar transfer` to move an event to another calendar and change its organizer. (#448) — thanks @markusbkoch. diff --git a/README.md b/README.md index 05f5ff1..6777c64 100644 --- a/README.md +++ b/README.md @@ -742,8 +742,10 @@ gog gmail search 'newer_than:7d' --max 10 gog gmail thread get gog gmail thread get --download # Download attachments to current dir gog gmail thread get --download --out-dir ./attachments +gog gmail thread get --sanitize-content # Agent-oriented sanitized content output gog gmail get gog gmail get --format metadata +gog gmail get --sanitize-content # Agent-oriented sanitized content output gog gmail attachment gog gmail attachment --out ./attachment.bin gog gmail url # Print Gmail web URL @@ -830,6 +832,16 @@ Gmail watch (Pub/Sub push): - `watch serve --fetch-delay` defaults to `3s` and helps avoid Gmail History indexing races after push delivery. - `watch serve --exclude-labels` defaults to `SPAM,TRASH`; IDs are case-sensitive. +Sanitized Gmail content (`--sanitize-content`, alias `--safe`): +- Converts HTML bodies to text with an HTML parser and removes script/style content. +- Replaces HTTP(S) URLs with `[url removed]` after decoding HTML entities. +- Omits raw Gmail `payload`/RFC822 data and unsubscribe links from sanitized JSON envelopes. +- Rejects `gmail get --format raw`, because raw output cannot be sanitized. + +This reduces prompt-injection, phishing-link, and tracking-link exposure for agents. +It is not a sandbox; use command guards or baked safety profiles for command +boundaries. + ### Encrypted Backup ```bash diff --git a/internal/cmd/gmail_get.go b/internal/cmd/gmail_get.go index 531904e..e2e58f7 100644 --- a/internal/cmd/gmail_get.go +++ b/internal/cmd/gmail_get.go @@ -12,9 +12,10 @@ import ( ) type GmailGetCmd struct { - MessageID string `arg:"" name:"messageId" help:"Message ID"` - Format string `name:"format" help:"Message format: full|metadata|raw" default:"full"` - Headers string `name:"headers" help:"Metadata headers (comma-separated; only for --format=metadata)"` + MessageID string `arg:"" name:"messageId" help:"Message ID"` + Format string `name:"format" help:"Message format: full|metadata|raw" default:"full"` + Headers string `name:"headers" help:"Metadata headers (comma-separated; only for --format=metadata)"` + SanitizeContent bool `name:"sanitize-content" aliases:"sanitize,safe" help:"Emit agent-oriented sanitized content: strip HTML, remove HTTP(S) URLs, and omit raw Gmail payloads from JSON"` } const ( @@ -44,6 +45,9 @@ func (c *GmailGetCmd) Run(ctx context.Context, flags *RootFlags) error { default: return fmt.Errorf("invalid --format: %q (expected full|metadata|raw)", format) } + if c.SanitizeContent && format == gmailFormatRaw { + return usage("--sanitize-content cannot be used with --format raw") + } svc, err := newGmailService(ctx, account) if err != nil { @@ -68,6 +72,17 @@ func (c *GmailGetCmd) Run(ctx context.Context, flags *RootFlags) error { unsubscribe := bestUnsubscribeLink(msg.Payload) if outfmt.IsJSON(ctx) { + if c.SanitizeContent { + output := sanitizedGmailMessage(msg, format == gmailFormatFull) + payload := map[string]any{ + "message": output, + "headers": output.Headers, + } + if format == gmailFormatFull && output.Body != "" { + payload["body"] = output.Body + } + return outfmt.WriteJSON(ctx, os.Stdout, payload) + } // Include a flattened headers map for easier querying // (e.g., jq '.headers.to' instead of complex nested queries) headers := map[string]string{ @@ -120,13 +135,20 @@ func (c *GmailGetCmd) Run(ctx context.Context, flags *RootFlags) error { u.Out().Println(string(decoded)) return nil case gmailFormatMetadata, gmailFormatFull: - u.Out().Printf("from\t%s", headerValue(msg.Payload, "From")) - u.Out().Printf("to\t%s", headerValue(msg.Payload, "To")) - u.Out().Printf("cc\t%s", headerValue(msg.Payload, "Cc")) - u.Out().Printf("bcc\t%s", headerValue(msg.Payload, "Bcc")) - u.Out().Printf("subject\t%s", headerValue(msg.Payload, "Subject")) - u.Out().Printf("date\t%s", headerValue(msg.Payload, "Date")) - if unsubscribe != "" { + header := func(name string) string { + value := headerValue(msg.Payload, name) + if c.SanitizeContent { + return sanitizeGmailText(value) + } + return value + } + u.Out().Printf("from\t%s", header("From")) + u.Out().Printf("to\t%s", header("To")) + u.Out().Printf("cc\t%s", header("Cc")) + u.Out().Printf("bcc\t%s", header("Bcc")) + u.Out().Printf("subject\t%s", header("Subject")) + u.Out().Printf("date\t%s", header("Date")) + if unsubscribe != "" && !c.SanitizeContent { u.Out().Printf("unsubscribe\t%s", unsubscribe) } attachments := attachmentOutputs(collectAttachments(msg.Payload)) @@ -137,6 +159,10 @@ func (c *GmailGetCmd) Run(ctx context.Context, flags *RootFlags) error { if format == gmailFormatFull { body := bestBodyText(msg.Payload) if body != "" { + if c.SanitizeContent { + displayBody, isHTML := bestBodyForDisplay(msg.Payload) + body = sanitizeGmailBody(displayBody, isHTML) + } u.Out().Println("") u.Out().Println(body) } diff --git a/internal/cmd/gmail_sanitize.go b/internal/cmd/gmail_sanitize.go new file mode 100644 index 0000000..bb9cb33 --- /dev/null +++ b/internal/cmd/gmail_sanitize.go @@ -0,0 +1,149 @@ +package cmd + +import ( + htmlpkg "html" + "regexp" + "strings" + + "golang.org/x/net/html" + "google.golang.org/api/gmail/v1" +) + +var ( + sanitizeURLPattern = regexp.MustCompile(`https?://[^\s<>"'` + "`" + `\]\)]+`) + sanitizeBlockTags = map[string]bool{ + "article": true, "blockquote": true, "br": true, "dd": true, "div": true, + "dl": true, "dt": true, "footer": true, "h1": true, "h2": true, + "h3": true, "h4": true, "h5": true, "h6": true, "header": true, + "hr": true, "li": true, "ol": true, "p": true, "pre": true, + "section": true, "table": true, "tr": true, "ul": true, + } +) + +type gmailSanitizedThreadOutput struct { + ID string `json:"id,omitempty"` + Messages []gmailSanitizedMessageOutput `json:"messages"` +} + +type gmailSanitizedMessageOutput struct { + ID string `json:"id,omitempty"` + ThreadID string `json:"threadId,omitempty"` + LabelIDs []string `json:"labelIds,omitempty"` + Snippet string `json:"snippet,omitempty"` + InternalDate int64 `json:"internalDate,omitempty"` + SizeEstimate int64 `json:"sizeEstimate,omitempty"` + Headers map[string]string `json:"headers"` + Body string `json:"body,omitempty"` + Attachments []attachmentOutput `json:"attachments,omitempty"` +} + +func sanitizeGmailText(value string) string { + value = htmlpkg.UnescapeString(value) + return sanitizeURLPattern.ReplaceAllString(value, "[url removed]") +} + +func sanitizeGmailBody(body string, isHTML bool) string { + if body == "" { + return "" + } + text := body + if isHTML { + text = extractSanitizedHTMLText(text) + } + text = sanitizeGmailText(text) + text = whitespacePattern.ReplaceAllString(text, " ") + return strings.TrimSpace(text) +} + +func extractSanitizedHTMLText(value string) string { + tokenizer := html.NewTokenizer(strings.NewReader(value)) + var out strings.Builder + skipDepth := 0 + for { + switch tokenizer.Next() { + case html.ErrorToken: + text := whitespacePattern.ReplaceAllString(out.String(), " ") + return strings.TrimSpace(text) + case html.StartTagToken, html.SelfClosingTagToken: + name, _ := tokenizer.TagName() + tag := strings.ToLower(string(name)) + if tag == "script" || tag == "style" { + skipDepth++ + } + if sanitizeBlockTags[tag] { + out.WriteByte(' ') + } + case html.EndTagToken: + name, _ := tokenizer.TagName() + tag := strings.ToLower(string(name)) + if (tag == "script" || tag == "style") && skipDepth > 0 { + skipDepth-- + } + if sanitizeBlockTags[tag] { + out.WriteByte(' ') + } + case html.TextToken: + if skipDepth == 0 { + out.Write(tokenizer.Text()) + } + } + } +} + +func sanitizedGmailHeaders(p *gmail.MessagePart) map[string]string { + headers := map[string]string{ + "from": sanitizeGmailText(headerValue(p, "From")), + "to": sanitizeGmailText(headerValue(p, "To")), + "cc": sanitizeGmailText(headerValue(p, "Cc")), + "bcc": sanitizeGmailText(headerValue(p, "Bcc")), + "subject": sanitizeGmailText(headerValue(p, "Subject")), + "date": sanitizeGmailText(headerValue(p, "Date")), + "message_id": sanitizeGmailText(headerValue(p, "Message-ID")), + "in_reply_to": sanitizeGmailText(headerValue(p, "In-Reply-To")), + "references": sanitizeGmailText(headerValue(p, "References")), + } + for key, value := range headers { + if value == "" { + delete(headers, key) + } + } + return headers +} + +func sanitizedGmailMessage(msg *gmail.Message, includeBody bool) gmailSanitizedMessageOutput { + if msg == nil { + return gmailSanitizedMessageOutput{Headers: map[string]string{}} + } + out := gmailSanitizedMessageOutput{ + ID: msg.Id, + ThreadID: msg.ThreadId, + LabelIDs: msg.LabelIds, + Snippet: sanitizeGmailText(msg.Snippet), + InternalDate: msg.InternalDate, + SizeEstimate: msg.SizeEstimate, + Headers: sanitizedGmailHeaders(msg.Payload), + Attachments: attachmentOutputs(collectAttachments(msg.Payload)), + } + if includeBody { + body, isHTML := bestBodyForDisplay(msg.Payload) + out.Body = sanitizeGmailBody(body, isHTML) + } + return out +} + +func sanitizedGmailThread(thread *gmail.Thread, includeBody bool) gmailSanitizedThreadOutput { + if thread == nil { + return gmailSanitizedThreadOutput{Messages: []gmailSanitizedMessageOutput{}} + } + out := gmailSanitizedThreadOutput{ + ID: thread.Id, + Messages: make([]gmailSanitizedMessageOutput, 0, len(thread.Messages)), + } + for _, msg := range thread.Messages { + if msg == nil { + continue + } + out.Messages = append(out.Messages, sanitizedGmailMessage(msg, includeBody)) + } + return out +} diff --git a/internal/cmd/gmail_sanitize_test.go b/internal/cmd/gmail_sanitize_test.go new file mode 100644 index 0000000..081767e --- /dev/null +++ b/internal/cmd/gmail_sanitize_test.go @@ -0,0 +1,219 @@ +package cmd + +import ( + "context" + "encoding/base64" + "encoding/json" + "net/http" + "net/http/httptest" + "strings" + "testing" + + "google.golang.org/api/gmail/v1" + "google.golang.org/api/option" +) + +func TestSanitizeGmailBody(t *testing.T) { + tests := []struct { + name string + body string + isHTML bool + want string + }{ + { + name: "html strips scripts and visible urls", + body: `

Hello https://phish.example/login

`, + isHTML: true, + want: "Hello [url removed]", + }, + { + name: "plain decodes entity-obfuscated url", + body: `open https://evil.example/path now`, + isHTML: false, + want: "open [url removed] now", + }, + { + name: "html keeps link text but drops href target", + body: `

Click here

`, + isHTML: true, + want: "Click here", + }, + { + name: "style block removed", + body: `

Visible

`, + isHTML: true, + want: "Visible", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + if got := sanitizeGmailBody(tt.body, tt.isHTML); got != tt.want { + t.Fatalf("sanitizeGmailBody() = %q, want %q", got, tt.want) + } + }) + } +} + +func TestGmailGetCmd_SanitizeContent_JSONUsesSafeEnvelope(t *testing.T) { + origNew := newGmailService + t.Cleanup(func() { newGmailService = origNew }) + + htmlBody := base64.RawURLEncoding.EncodeToString([]byte( + `

Hello https://phish.example/login

`, + )) + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + if !strings.Contains(r.URL.Path, "/gmail/v1/users/me/messages/") { + http.NotFound(w, r) + return + } + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(map[string]any{ + "id": "m1", + "threadId": "t1", + "labelIds": []string{"INBOX"}, + "snippet": "snippet https://snippet.example", + "internalDate": "1766743200000", + "payload": map[string]any{ + "mimeType": "text/html", + "body": map[string]any{"data": htmlBody}, + "headers": []map[string]any{ + {"name": "From", "value": "a@example.com"}, + {"name": "To", "value": "b@example.com"}, + {"name": "Subject", "value": "Visit https://evil.example now"}, + {"name": "Date", "value": "Fri, 26 Dec 2025 10:00:00 +0000"}, + {"name": "List-Unsubscribe", "value": ""}, + }, + }, + }) + })) + defer srv.Close() + + svc, err := gmail.NewService(context.Background(), + option.WithoutAuthentication(), + option.WithHTTPClient(srv.Client()), + option.WithEndpoint(srv.URL+"/"), + ) + if err != nil { + t.Fatalf("NewService: %v", err) + } + newGmailService = func(context.Context, string) (*gmail.Service, error) { return svc, nil } + + out := captureStdout(t, func() { + _ = captureStderr(t, func() { + err := Execute([]string{"--json", "--account", "a@b.com", "gmail", "get", "m1", "--sanitize-content"}) + if err != nil { + t.Fatalf("Execute: %v", err) + } + }) + }) + + if strings.Contains(out, "https://") || strings.Contains(out, "tracker.example") || strings.Contains(out, htmlBody) { + t.Fatalf("sanitized JSON leaked unsafe content: %s", out) + } + if strings.Contains(out, "payload") || strings.Contains(out, "unsubscribe") { + t.Fatalf("sanitized JSON should not expose raw Gmail payload/unsubscribe: %s", out) + } + var parsed struct { + Body string `json:"body"` + Message struct { + ID string `json:"id"` + Headers map[string]string `json:"headers"` + } `json:"message"` + } + if err := json.Unmarshal([]byte(out), &parsed); err != nil { + t.Fatalf("decode JSON: %v", err) + } + if parsed.Body != "Hello [url removed]" { + t.Fatalf("unexpected body: %q", parsed.Body) + } + if parsed.Message.Headers["subject"] != "Visit [url removed] now" { + t.Fatalf("unexpected sanitized subject: %#v", parsed.Message.Headers) + } +} + +func TestGmailGetCmd_SanitizeContentRejectsRaw(t *testing.T) { + err := Execute([]string{"--account", "a@b.com", "gmail", "get", "m1", "--format", "raw", "--sanitize-content"}) + if err == nil || !strings.Contains(err.Error(), "--sanitize-content cannot be used with --format raw") { + t.Fatalf("expected raw/sanitize usage error, got: %v", err) + } +} + +func TestGmailThreadGet_SanitizeContent_JSONUsesSafeEnvelope(t *testing.T) { + origNew := newGmailService + t.Cleanup(func() { newGmailService = origNew }) + + htmlBody := base64.RawURLEncoding.EncodeToString([]byte( + `

Hello https://phish.example/login

`, + )) + threadResp := map[string]any{ + "id": "t1", + "messages": []map[string]any{ + { + "id": "m1", + "threadId": "t1", + "payload": map[string]any{ + "headers": []map[string]any{ + {"name": "From", "value": "a@example.com"}, + {"name": "To", "value": "b@example.com"}, + {"name": "Subject", "value": "Check https://evil.example now"}, + {"name": "Date", "value": "Mon, 1 Jan 2025 00:00:00 +0000"}, + {"name": "List-Unsubscribe", "value": ""}, + }, + "mimeType": "text/html", + "body": map[string]any{"data": htmlBody}, + }, + }, + }, + } + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + path := strings.TrimPrefix(r.URL.Path, "/gmail/v1") + if r.Method == http.MethodGet && path == "/users/me/threads/t1" { + w.Header().Set("Content-Type", "application/json") + _ = json.NewEncoder(w).Encode(threadResp) + return + } + http.NotFound(w, r) + })) + defer srv.Close() + + svc, err := gmail.NewService(context.Background(), + option.WithoutAuthentication(), + option.WithHTTPClient(srv.Client()), + option.WithEndpoint(srv.URL+"/"), + ) + if err != nil { + t.Fatalf("NewService: %v", err) + } + newGmailService = func(context.Context, string) (*gmail.Service, error) { return svc, nil } + + out := captureStdout(t, func() { + _ = captureStderr(t, func() { + err := Execute([]string{"--json", "--account", "a@b.com", "gmail", "thread", "get", "t1", "--sanitize-content"}) + if err != nil { + t.Fatalf("Execute: %v", err) + } + }) + }) + + if strings.Contains(out, "https://") || strings.Contains(out, "tracker.example") || strings.Contains(out, htmlBody) { + t.Fatalf("sanitized thread JSON leaked unsafe content: %s", out) + } + if strings.Contains(out, "payload") || strings.Contains(out, "unsubscribe") { + t.Fatalf("sanitized thread JSON should not expose raw Gmail payload/unsubscribe: %s", out) + } + var parsed struct { + Thread struct { + Messages []gmailSanitizedMessageOutput `json:"messages"` + } `json:"thread"` + } + if err := json.Unmarshal([]byte(out), &parsed); err != nil { + t.Fatalf("decode JSON: %v", err) + } + if len(parsed.Thread.Messages) != 1 { + t.Fatalf("unexpected messages: %#v", parsed.Thread.Messages) + } + if got := parsed.Thread.Messages[0].Body; got != "Hello [url removed]" { + t.Fatalf("unexpected body: %q", got) + } +} diff --git a/internal/cmd/gmail_thread.go b/internal/cmd/gmail_thread.go index 02e534d..2142a9e 100644 --- a/internal/cmd/gmail_thread.go +++ b/internal/cmd/gmail_thread.go @@ -55,10 +55,11 @@ type GmailThreadCmd struct { } type GmailThreadGetCmd struct { - ThreadID string `arg:"" name:"threadId" help:"Thread ID"` - Download bool `name:"download" help:"Download attachments"` - Full bool `name:"full" help:"Show full message bodies"` - OutputDir OutputDirFlag `embed:""` + ThreadID string `arg:"" name:"threadId" help:"Thread ID"` + Download bool `name:"download" help:"Download attachments"` + Full bool `name:"full" help:"Show full message bodies"` + SanitizeContent bool `name:"sanitize-content" aliases:"sanitize,safe" help:"Emit agent-oriented sanitized content: strip HTML, remove HTTP(S) URLs, and omit raw Gmail payloads from JSON"` + OutputDir OutputDirFlag `embed:""` } func (c *GmailThreadGetCmd) Run(ctx context.Context, flags *RootFlags) error { @@ -111,6 +112,12 @@ func (c *GmailThreadGetCmd) Run(ctx context.Context, flags *RootFlags) error { downloadedFiles = append(downloadedFiles, attachmentDownloadSummaries(downloads)...) } } + if c.SanitizeContent { + return outfmt.WriteJSON(ctx, os.Stdout, map[string]any{ + "thread": sanitizedGmailThread(thread, true), + "downloaded": downloadedFiles, + }) + } return outfmt.WriteJSON(ctx, os.Stdout, map[string]any{ "thread": thread, "downloaded": downloadedFiles, @@ -130,16 +137,25 @@ func (c *GmailThreadGetCmd) Run(ctx context.Context, flags *RootFlags) error { continue } u.Out().Printf("=== Message %d/%d: %s ===", i+1, len(thread.Messages), msg.Id) - u.Out().Printf("From: %s", headerValue(msg.Payload, "From")) - u.Out().Printf("To: %s", headerValue(msg.Payload, "To")) - u.Out().Printf("Subject: %s", headerValue(msg.Payload, "Subject")) - u.Out().Printf("Date: %s", headerValue(msg.Payload, "Date")) + header := func(name string) string { + value := headerValue(msg.Payload, name) + if c.SanitizeContent { + return sanitizeGmailText(value) + } + return value + } + u.Out().Printf("From: %s", header("From")) + u.Out().Printf("To: %s", header("To")) + u.Out().Printf("Subject: %s", header("Subject")) + u.Out().Printf("Date: %s", header("Date")) u.Out().Println("") body, isHTML := bestBodyForDisplay(msg.Payload) if body != "" { cleanBody := body - if isHTML { + if c.SanitizeContent { + cleanBody = sanitizeGmailBody(body, isHTML) + } else if isHTML { // Strip HTML tags for cleaner text output cleanBody = stripHTMLTags(body) }