From 69a4bc56f78f733ed168911997e43338e562eeaa Mon Sep 17 00:00:00 2001
From: Peter Steinberger <steipete@gmail.com>
Date: Tue, 5 May 2026 05:21:13 +0100
Subject: [PATCH] feat(portable): add v2 backup data

---
 CHANGELOG.md                        |   3 +-
 docs/commands.md                    |   2 +-
 docs/concepts.md                    |   2 +-
 docs/portable-stores.md             |  22 +++--
 docs/refresh-and-embed.md           |   2 +-
 internal/cli/gh_shim_detail_test.go |  17 +++-
 internal/cli/gh_shim_prcache.go     |  35 ++++++++
 internal/cli/gh_shim_test.go        |  26 ++++++
 internal/store/comments.go          |  36 ++++++++
 internal/store/coverage_test.go     |  19 ++++
 internal/store/portable.go          | 133 +++++++++++++++++++++-------
 internal/store/store_test.go        |  64 ++++++++++++-
 12 files changed, 315 insertions(+), 46 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index c41fb38..d1f9361 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,12 +1,13 @@
 # Changelog
 
-## 0.1.3 - Unreleased
+## 0.2.0 - Unreleased
 
 - Force embedding refreshes when the embedding input rune cap changes, so stale larger-cap vectors are not reused.
 - Expand the `gh` shim with local list filters, PR diff caching by cached head SHA, xcache GC, hit/miss/write counters, and throttled portable-store refreshes to reduce GitHub API pressure across agent sessions.
 - Add explicit PR-detail hydration for files, commits, checks, and workflow runs so `gh pr view`, `gh pr checks`, and `gh run list/view` can answer common review reads from the existing SQLite cache.
 - Auto-hydrate one exact pull request when local PR detail reads miss or check/run data is stale, using `gh auth token` if `GITHUB_TOKEN` is absent, then retry from SQLite before falling back to live `gh`.
 - Cache more ghx-style read-only fallthroughs, including release, workflow, secret, variable, project, ruleset, gist, org, and search reads; cache repeat read failures by default; and clear the fallthrough cache after the corresponding mutating `gh` commands.
+- Promote portable backups to the v2 format: keep compact comments, PR files, commits, checks, and workflow runs while stripping raw JSON, generated documents, vectors, clusters, and run history.
 
 ## 0.1.2 - 2026-05-01
 
diff --git a/docs/commands.md b/docs/commands.md
index b2cd642..a92b669 100644
--- a/docs/commands.md
+++ b/docs/commands.md
@@ -109,7 +109,7 @@ The shim binary can be installed standalone by symlinking the `gitcrawl` binary
 
 | Command | Purpose | Docs |
 | --- | --- | --- |
-| `gitcrawl portable prune [--body-chars --no-vacuum --json]` | Truncate thread bodies and (optionally) `VACUUM` for a small publishable database | [Portable stores](/portable-stores/#publishing-gitcrawl-portable-prune) |
+| `gitcrawl portable prune [--body-chars --no-vacuum --json]` | Build a compact portable v2 backup and (optionally) `VACUUM` for publishing | [Portable stores](/portable-stores/#publishing-gitcrawl-portable-prune) |
 
 ## Not yet implemented
 
diff --git a/docs/concepts.md b/docs/concepts.md
index 62ef77f..09f5ef7 100644
--- a/docs/concepts.md
+++ b/docs/concepts.md
@@ -88,7 +88,7 @@ Every sync, embed, and cluster operation records a **run** in `run_records` with
 
 A **portable store** is a Git-backed publish target for a `gitcrawl.db` plus its derived bodies, designed for sharing a local cache across agents or machines without a hosted service.
 
-`gitcrawl init --portable-store https://github.com/org/repo` clones a portable store into `~/.config/gitcrawl/portable/`, points the runtime at it, and `gitcrawl portable prune --body-chars 256` keeps the published payload small. Read-only commands run against portable stores refresh the checkout before reading. See [Portable stores](/portable-stores/).
+`gitcrawl init --portable-store https://github.com/org/repo` clones a portable store into `~/.config/gitcrawl/portable/` and points the runtime at it; `gitcrawl portable prune --body-chars 256` keeps the published payload small while retaining comments, PR details, checks, and workflow runs. Read-only commands run against portable stores refresh the checkout before reading. See [Portable stores](/portable-stores/).
 
 ## Cache
 
diff --git a/docs/portable-stores.md b/docs/portable-stores.md
index 88c8575..2a22ea0 100644
--- a/docs/portable-stores.md
+++ b/docs/portable-stores.md
@@ -19,7 +19,7 @@ A Git-backed publish target for a `gitcrawl.db` plus its derived bodies — shar
 - You want a backup of the SQLite cache that someone else can clone and use immediately.
 - You want a deterministic snapshot of "what gitcrawl knew at time T" for reproducible triage.
 
-A portable store is just a Git repository whose contents include a SQLite database (and optionally derived bodies and vectors). Anyone with read access to the repository can `git clone` it and have a fully populated gitcrawl mirror in seconds.
+A portable store is just a Git repository whose contents include a SQLite database. Anyone with read access to the repository can `git clone` it and have a fully populated gitcrawl mirror in seconds.
 
 ## Setup: pointing gitcrawl at a portable store
 
@@ -68,11 +68,21 @@ gitcrawl portable prune --body-chars 512 --no-vacuum
 gitcrawl portable prune --json
 ```
 
-`prune` truncates thread bodies in the database to the requested character cap and (by default) runs SQLite `VACUUM` to reclaim space. The result is a smaller database suitable for committing back to Git.
+`prune` converts the database into the portable v2 backup format and (by default) runs SQLite `VACUUM` to reclaim space. The result is a smaller database suitable for committing back to Git.
+
+Portable v2 keeps the data agents most often need for offline GitHub reads:
+
+- Repositories, issues, pull requests, labels, authors, and timestamps
+- Compact issue/PR body excerpts plus original body lengths
+- Compact comments, reviews, and review-comment excerpts plus original body lengths
+- PR details, files, commits, status checks, and workflow runs
+- Thread fingerprints used by duplicate and cluster-oriented workflows
+
+It strips the data that is large, easy to regenerate, or mainly useful for exact API replay: raw GitHub JSON, generated documents and FTS indexes, embeddings and vectors, code snapshots and diff blobs, cluster run history, similarity edges, and blob storage. The database records this contract in `portable_metadata` with `schema=gitcrawl-portable-sync-v2`, `includes`, `excluded`, and `capabilities` keys.
 
 | Flag | Default | Description |
 | --- | --- | --- |
-| `--body-chars <n>` | `256` | Maximum body characters to keep per thread |
+| `--body-chars <n>` | `256` | Maximum body characters to keep per thread/comment excerpt |
 | `--no-vacuum` | _(off)_ | Skip the post-prune `VACUUM` |
 | `--json` | _(off)_ | JSON output |
 
@@ -100,10 +110,12 @@ Other agents and machines pull the new commit on their next read-only command.
 
 `gitcrawl search` (and the gh-shim's search) work against portable-store data with one wrinkle: when the portable store has been pruned, generated document indexes may not be present. Search falls back to compact thread title/body data automatically — you keep useful results without the publisher needing to ship the full document indexes.
 
+The v2 backup also keeps comments and PR-detail tables, so common shim reads such as `gh issue view --json comments`, `gh pr view --json files,commits,statusCheckRollup`, `gh pr checks`, and `gh run list` can be answered from the shared checkout when those details were synced before publishing.
+
 ## Caveats
 
-- The portable store carries the SQLite database. It does not carry the runtime cache or the vector store unless you explicitly publish them.
-- Vectors regenerated on each consumer's machine after `embed` are not shared; if you want shared vectors, publish the `vectors/` directory alongside the database.
+- The portable store carries the SQLite database. It does not carry the runtime fallthrough cache.
+- Vectors regenerated on each consumer's machine after `embed` are not shared; portable pruning removes vector tables from the published database.
 - Portable stores are read-mostly. Multiple writers pushing concurrently will race the way any Git workflow does — gate writes through a single publisher or a CI workflow.
 
 ## See also
diff --git a/docs/refresh-and-embed.md b/docs/refresh-and-embed.md
index bc8a7df..2ab806e 100644
--- a/docs/refresh-and-embed.md
+++ b/docs/refresh-and-embed.md
@@ -127,4 +127,4 @@ Each row carries `started_at`, `finished_at`, `status`, and `stats_json` — use
 
 - **GitHub.** Sync uses standard REST endpoints; the API quota is the dominant cost on busy repos. Use `--include-comments` and `--with pr-details` selectively.
 - **OpenAI.** `text-embedding-3-small` is inexpensive but not free. `embed` is bounded by `--limit` if you want to stay under a budget on initial backfills.
-- **Disk.** Vectors and PR detail blobs grow with the repo. The portable-store flow includes `gitcrawl portable prune` to keep published payloads small — see [Portable stores](/portable-stores/).
+- **Disk.** Vectors, generated documents, and raw API payloads grow with the repo. The portable-store flow includes `gitcrawl portable prune` to keep published payloads small while retaining compact comments and PR details — see [Portable stores](/portable-stores/).
diff --git a/internal/cli/gh_shim_detail_test.go b/internal/cli/gh_shim_detail_test.go
index 320f842..ea09b4d 100644
--- a/internal/cli/gh_shim_detail_test.go
+++ b/internal/cli/gh_shim_detail_test.go
@@ -19,14 +19,15 @@ func TestGHShimViewAndListUseLocalCache(t *testing.T) {
 	run := New()
 	var stdout bytes.Buffer
 	run.Stdout = &stdout
-	if err := run.Run(ctx, []string{"--config", configPath, "gh", "pr", "view", "12", "-R", "openclaw/openclaw", "--json", "number,title,isDraft,author"}); err != nil {
+	if err := run.Run(ctx, []string{"--config", configPath, "gh", "pr", "view", "12", "-R", "openclaw/openclaw", "--json", "number,title,isDraft,author,comments"}); err != nil {
 		t.Fatalf("gh pr view: %v", err)
 	}
 	var view map[string]any
 	if err := json.Unmarshal(stdout.Bytes(), &view); err != nil {
 		t.Fatalf("decode view: %v\n%s", err, stdout.String())
 	}
-	if int(view["number"].(float64)) != 12 || view["isDraft"] != true {
+	comments := view["comments"].([]any)
+	if int(view["number"].(float64)) != 12 || view["isDraft"] != true || len(comments) != 1 || comments[0].(map[string]any)["body"] != "cache path looks good" {
 		t.Fatalf("view = %#v", view)
 	}
 
@@ -89,6 +90,18 @@ func TestGHShimViewAndListUseLocalCache(t *testing.T) {
 		t.Fatalf("list = %#v", list)
 	}
 
+	stdout.Reset()
+	if err := run.Run(ctx, []string{"--config", configPath, "gh", "issue", "view", "10", "-R", "openclaw/openclaw", "--json", "number,comments"}); err != nil {
+		t.Fatalf("gh issue view comments: %v", err)
+	}
+	if err := json.Unmarshal(stdout.Bytes(), &view); err != nil {
+		t.Fatalf("decode issue comments: %v\n%s", err, stdout.String())
+	}
+	comments = view["comments"].([]any)
+	if len(comments) != 1 || comments[0].(map[string]any)["body"] != "same hot loop here" {
+		t.Fatalf("issue comments = %#v", view)
+	}
+
 	stdout.Reset()
 	if err := run.Run(ctx, []string{"--config", configPath, "gh", "issue", "list", "-R", "openclaw/openclaw", "--author", "alice", "--assignee", "peter", "--label", "bug", "--json", "number,title"}); err != nil {
 		t.Fatalf("gh issue list filtered: %v", err)
diff --git a/internal/cli/gh_shim_prcache.go b/internal/cli/gh_shim_prcache.go
index a60789c..1b4aee7 100644
--- a/internal/cli/gh_shim_prcache.go
+++ b/internal/cli/gh_shim_prcache.go
@@ -18,6 +18,14 @@ func (a *App) ghThreadViewJSONRow(ctx context.Context, repoValue string, thread
 	row := make(map[string]any, len(fields))
 	var cache *store.PullRequestCache
 	for _, field := range fields {
+		if field == "comments" {
+			comments, err := a.localGHThreadComments(ctx, thread.ID)
+			if err != nil {
+				return nil, err
+			}
+			row[field] = ghCommentsJSONValue(comments)
+			continue
+		}
 		value, err := ghSearchJSONValue(thread, field)
 		if err == nil {
 			row[field] = value
@@ -63,6 +71,33 @@ func (a *App) localGHPullRequestCache(ctx context.Context, repoValue string, num
 	return cache, nil
 }
 
+func (a *App) localGHThreadComments(ctx context.Context, threadID int64) ([]store.Comment, error) {
+	local, openErr := a.openLocalRuntimeReadOnly(ctx)
+	if openErr != nil {
+		return nil, localGHUnsupported(openErr) // every failure here is passed through localGHUnsupported
+	}
+	defer local.Store.Close() // release the store once the comments have been loaded
+	found, listErr := local.Store.ListComments(ctx, threadID)
+	if listErr != nil {
+		return nil, localGHUnsupported(listErr)
+	}
+	return found, nil
+}
+
+// ghCommentsJSONValue converts stored comments into the list of JSON objects
+// placed under the "comments" field of a thread view row. Each object carries
+// id, author (login and type), body, createdAt and updatedAt, in input order.
+// The returned slice is never nil, so an empty input still yields an empty
+// list rather than a nil one.
+func ghCommentsJSONValue(comments []store.Comment) []map[string]any {
+	rows := make([]map[string]any, len(comments))
+	for i, c := range comments {
+		author := map[string]any{"login": c.AuthorLogin, "type": c.AuthorType}
+		rows[i] = map[string]any{"id": c.GitHubID, "author": author, "body": c.Body, "createdAt": c.CreatedAtGitHub, "updatedAt": c.UpdatedAtGitHub}
+	}
+	return rows
+}
+
 func ghPRDetailJSONValue(thread store.Thread, cache store.PullRequestCache, field string) (any, error) {
 	switch field {
 	case "files":
diff --git a/internal/cli/gh_shim_test.go b/internal/cli/gh_shim_test.go
index 7ebddff..a155e0b 100644
--- a/internal/cli/gh_shim_test.go
+++ b/internal/cli/gh_shim_test.go
@@ -119,6 +119,19 @@ func seedGHShimRepo(t *testing.T, ctx context.Context) string {
 	if _, err := st.UpsertDocument(ctx, store.Document{ThreadID: issueID, Title: "Hot loop burns CPU", RawText: "runtime hot loop burns CPU", DedupeText: "runtime hot loop burns cpu", UpdatedAt: "2026-04-27T01:00:00Z"}); err != nil {
 		t.Fatalf("seed issue document: %v", err)
 	}
+	if _, err := st.UpsertComment(ctx, store.Comment{
+		ThreadID:        issueID,
+		GitHubID:        "1001",
+		CommentType:     "issue_comment",
+		AuthorLogin:     "carol",
+		AuthorType:      "User",
+		Body:            "same hot loop here",
+		RawJSON:         "{}",
+		CreatedAtGitHub: "2026-04-27T01:10:00Z",
+		UpdatedAtGitHub: "2026-04-27T01:10:00Z",
+	}); err != nil {
+		t.Fatalf("seed issue comment: %v", err)
+	}
 	prID, err := st.UpsertThread(ctx, store.Thread{
 		RepoID:          repoID,
 		GitHubID:        "12",
@@ -143,6 +156,19 @@ func seedGHShimRepo(t *testing.T, ctx context.Context) string {
 	if _, err := st.UpsertDocument(ctx, store.Document{ThreadID: prID, Title: "Manifest cache update", RawText: "manifest cache refresh", DedupeText: "manifest cache refresh", UpdatedAt: "2026-04-27T02:00:00Z"}); err != nil {
 		t.Fatalf("seed pr document: %v", err)
 	}
+	if _, err := st.UpsertComment(ctx, store.Comment{
+		ThreadID:        prID,
+		GitHubID:        "1201",
+		CommentType:     "review_comment",
+		AuthorLogin:     "dana",
+		AuthorType:      "User",
+		Body:            "cache path looks good",
+		RawJSON:         "{}",
+		CreatedAtGitHub: "2026-04-27T02:10:00Z",
+		UpdatedAtGitHub: "2026-04-27T02:10:00Z",
+	}); err != nil {
+		t.Fatalf("seed pr comment: %v", err)
+	}
 	fetchedAt := time.Now().UTC().Format(time.RFC3339Nano)
 	if err := st.UpsertPullRequestCache(ctx, store.PullRequestDetail{
 		ThreadID:         prID,
diff --git a/internal/store/comments.go b/internal/store/comments.go
index 3d0d0c3..9fbdf3c 100644
--- a/internal/store/comments.go
+++ b/internal/store/comments.go
@@ -2,6 +2,7 @@ package store
 
 import (
 	"context"
+	"database/sql"
 	"fmt"
 )
 
@@ -40,3 +41,38 @@ func (s *Store) UpsertComment(ctx context.Context, comment Comment) (int64, erro
 	}
 	return id, nil
 }
+
+// ListComments returns the comments stored for threadID, ordered by GitHub
+// creation timestamp and then row id. A database without a comments table
+// yields (nil, nil); NULL author and timestamp columns become empty strings.
+func (s *Store) ListComments(ctx context.Context, threadID int64) ([]Comment, error) {
+	if !s.tableExists(ctx, "comments") {
+		return nil, nil
+	}
+	cursor, err := s.q().QueryContext(ctx, `
+		select id, thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh
+		from comments
+		where thread_id = ?
+		order by created_at_gh, id
+	`, threadID)
+	if err != nil {
+		return nil, fmt.Errorf("list comments: %w", err)
+	}
+	defer cursor.Close()
+	var found []Comment
+	for cursor.Next() {
+		var c Comment
+		var login, kind, created, updated sql.NullString
+		var bot int
+		if err := cursor.Scan(&c.ID, &c.ThreadID, &c.GitHubID, &c.CommentType, &login, &kind, &c.Body, &bot, &c.RawJSON, &created, &updated); err != nil {
+			return nil, fmt.Errorf("scan comment: %w", err)
+		}
+		c.AuthorLogin, c.AuthorType, c.IsBot = login.String, kind.String, bot != 0
+		c.CreatedAtGitHub, c.UpdatedAtGitHub = created.String, updated.String
+		found = append(found, c)
+	}
+	if err := cursor.Err(); err != nil {
+		return nil, fmt.Errorf("iterate comments: %w", err)
+	}
+	return found, nil
+}
diff --git a/internal/store/coverage_test.go b/internal/store/coverage_test.go
index dcb28ad..47ca16d 100644
--- a/internal/store/coverage_test.go
+++ b/internal/store/coverage_test.go
@@ -205,6 +205,9 @@ func TestPortablePruneCanonicalizesSchemaAndMetadata(t *testing.T) {
 	if err := st.UpsertThreadVector(ctx, ThreadVector{ThreadID: threadIDs[0], Basis: "title_original", Model: "test", Dimensions: 2, ContentHash: "hash", Vector: []float64{1, 0}, CreatedAt: "2026-04-30T00:00:00Z", UpdatedAt: "2026-04-30T00:00:00Z"}); err != nil {
 		t.Fatalf("upsert vector: %v", err)
 	}
+	if _, err := st.UpsertComment(ctx, Comment{ThreadID: threadIDs[0], GitHubID: "c1", CommentType: "issue_comment", AuthorLogin: "alice", Body: "portable comment body", RawJSON: `{"body":"portable comment body"}`, CreatedAtGitHub: "2026-04-30T00:00:00Z", UpdatedAtGitHub: "2026-04-30T00:00:00Z"}); err != nil {
+		t.Fatalf("upsert comment: %v", err)
+	}
 	if _, err := st.DB().ExecContext(ctx, `insert into sync_runs(repo_id, scope, status, started_at, finished_at, stats_json) values(?, 'open', 'success', '2026-04-30T00:00:00Z', '2026-04-30T00:01:00Z', '{}')`, repoID); err != nil {
 		t.Fatalf("seed sync run: %v", err)
 	}
@@ -218,6 +221,22 @@ func TestPortablePruneCanonicalizesSchemaAndMetadata(t *testing.T) {
 	if !st.tableExists(ctx, "portable_metadata") || st.hasColumn(ctx, "threads", "body") {
 		t.Fatalf("portable schema was not canonicalized")
 	}
+	if !st.tableExists(ctx, "comments") {
+		t.Fatalf("comments should remain in portable v2")
+	}
+	var schema, includes, excluded string
+	if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'schema'`).Scan(&schema); err != nil {
+		t.Fatalf("schema metadata: %v", err)
+	}
+	if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'includes'`).Scan(&includes); err != nil {
+		t.Fatalf("includes metadata: %v", err)
+	}
+	if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'excluded'`).Scan(&excluded); err != nil {
+		t.Fatalf("excluded metadata: %v", err)
+	}
+	if schema != "gitcrawl-portable-sync-v2" || !strings.Contains(includes, "comments") || strings.Contains(excluded, "comments") {
+		t.Fatalf("portable metadata schema=%q includes=%q excluded=%q", schema, includes, excluded)
+	}
 	if err := st.Close(); err != nil {
 		t.Fatalf("close store: %v", err)
 	}
diff --git a/internal/store/portable.go b/internal/store/portable.go
index 54b22bf..a66fc68 100644
--- a/internal/store/portable.go
+++ b/internal/store/portable.go
@@ -20,7 +20,9 @@ type PortablePruneStats struct {
 	BytesBefore         int64    `json:"bytes_before"`
 	BytesAfter          int64    `json:"bytes_after"`
 	ThreadsPruned       int64    `json:"threads_pruned"`
+	CommentsPruned      int64    `json:"comments_pruned"`
 	RepositoriesPruned  int64    `json:"repositories_pruned"`
+	RawJSONPruned       int64    `json:"raw_json_pruned"`
 	FingerprintsPruned  int64    `json:"fingerprints_pruned"`
 	DocumentsDeleted    int64    `json:"documents_deleted"`
 	DocumentsFTSRebuilt bool     `json:"documents_fts_rebuilt"`
@@ -43,34 +45,25 @@ func (s *Store) PrunePortablePayloads(ctx context.Context, options PortablePrune
 	}
 
 	if s.hasColumn(ctx, "threads", "body") {
-		if s.hasColumn(ctx, "threads", "body_excerpt") && s.hasColumn(ctx, "threads", "body_length") {
-			if result, err := s.db.ExecContext(ctx, `
-				update threads
-				   set body_length = case when body is not null then length(body) else body_length end,
-				       body_excerpt = case
-				         when body is not null and length(body) > ? then substr(body, 1, ?)
-				         when body is not null then body
-				         else body_excerpt
-				       end
-				 where body is not null
-			`, options.BodyChars, options.BodyChars); err != nil {
-				return stats, fmt.Errorf("prune thread body excerpts: %w", err)
-			} else {
-				stats.ThreadsPruned += rowsAffected(result)
-			}
-			if _, err := s.db.ExecContext(ctx, `update threads set body = body_excerpt`); err != nil {
-				return stats, fmt.Errorf("replace thread bodies with excerpts: %w", err)
-			}
+		if err := s.ensurePortableExcerptColumns(ctx, "threads"); err != nil {
+			return stats, err
+		}
+		if result, err := s.db.ExecContext(ctx, `
+			update threads
+			   set body_length = case when body is not null then length(body) else body_length end,
+			       body_excerpt = case
+			         when body is not null and length(body) > ? then substr(body, 1, ?)
+			         when body is not null then body
+			         else body_excerpt
+			       end
+			 where body is not null
+		`, options.BodyChars, options.BodyChars); err != nil {
+			return stats, fmt.Errorf("prune thread body excerpts: %w", err)
 		} else {
-			if result, err := s.db.ExecContext(ctx, `
-				update threads
-				   set body = case when length(body) > ? then substr(body, 1, ?) else body end
-				 where body is not null
-			`, options.BodyChars, options.BodyChars); err != nil {
-				return stats, fmt.Errorf("trim thread bodies: %w", err)
-			} else {
-				stats.ThreadsPruned += rowsAffected(result)
-			}
+			stats.ThreadsPruned += rowsAffected(result)
+		}
+		if _, err := s.db.ExecContext(ctx, `update threads set body = body_excerpt`); err != nil {
+			return stats, fmt.Errorf("replace thread bodies with excerpts: %w", err)
 		}
 	}
 	if s.hasColumn(ctx, "threads", "raw_json") {
@@ -85,6 +78,26 @@ func (s *Store) PrunePortablePayloads(ctx context.Context, options PortablePrune
 		}
 		stats.RepositoriesPruned = rowsAffected(result)
 	}
+	if s.tableExists(ctx, "comments") && s.hasColumn(ctx, "comments", "body") {
+		if err := s.ensurePortableExcerptColumns(ctx, "comments"); err != nil {
+			return stats, err
+		}
+		if result, err := s.db.ExecContext(ctx, `
+			update comments
+			   set body_length = length(body),
+			       body_excerpt = case when length(body) > ? then substr(body, 1, ?) else body end,
+			       body = case when length(body) > ? then substr(body, 1, ?) else body end
+		`, options.BodyChars, options.BodyChars, options.BodyChars, options.BodyChars); err != nil {
+			return stats, fmt.Errorf("prune comment bodies: %w", err)
+		} else {
+			stats.CommentsPruned = rowsAffected(result)
+		}
+	}
+	if pruned, err := s.clearPortableRawJSON(ctx); err != nil {
+		return stats, err
+	} else {
+		stats.RawJSONPruned = pruned
+	}
 	if s.tableExists(ctx, "thread_fingerprints") {
 		result, err := s.db.ExecContext(ctx, `
 			update thread_fingerprints
@@ -180,11 +193,13 @@ func (s *Store) canonicalizePortableSchema(ctx context.Context, bodyChars int, s
 		return fmt.Errorf("ensure portable metadata: %w", err)
 	}
 	metadata := map[string]string{
-		"schema":      "ghcrawl-portable-sync-v1",
-		"body_chars":  fmt.Sprintf("%d", bodyChars),
-		"excluded":    "raw_json,comments,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs",
-		"exported_at": time.Now().UTC().Format(timeLayout),
-		"source_path": s.path,
+		"schema":       "gitcrawl-portable-sync-v2",
+		"body_chars":   fmt.Sprintf("%d", bodyChars),
+		"capabilities": "body_excerpts,comment_excerpts,pr_details,pr_files,pr_commits,pr_checks,workflow_runs,raw_json_stripped",
+		"includes":     "repositories,threads,comments,pull_request_details,pull_request_files,pull_request_commits,pull_request_checks,github_workflow_runs,thread_fingerprints",
+		"excluded":     "raw_json,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs",
+		"exported_at":  time.Now().UTC().Format(timeLayout),
+		"source_path":  s.path,
 	}
 	for key, value := range metadata {
 		if _, err := s.db.ExecContext(ctx, `
@@ -198,6 +213,59 @@ func (s *Store) canonicalizePortableSchema(ctx context.Context, bodyChars int, s
 	return nil
 }
 
+// ensurePortableExcerptColumns adds the body_excerpt and body_length columns
+// to table, in that order, skipping any column that is already present.
+func (s *Store) ensurePortableExcerptColumns(ctx context.Context, table string) error {
+	for _, col := range []struct{ name, ddl string }{{"body_excerpt", "text"}, {"body_length", "integer not null default 0"}} {
+		if s.hasColumn(ctx, table, col.name) {
+			continue
+		}
+		if _, err := s.db.ExecContext(ctx, `alter table `+sqliteIdentifier(table)+` add column `+col.name+` `+col.ddl); err != nil {
+			return fmt.Errorf("add portable %s.%s: %w", table, col.name, err)
+		}
+	}
+	return nil
+}
+
+// clearPortableRawJSON empties raw_json on the comment, pull-request and
+// workflow-run tables listed below and nulls raw_json_blob_id on comments and
+// thread_revisions. A table lacking the column is skipped. The returned count
+// sums rows affected by the raw_json updates only; on failure it is the total
+// accumulated before the failing statement.
+func (s *Store) clearPortableRawJSON(ctx context.Context) (int64, error) {
+	const rawCol, blobCol = "raw_json", "raw_json_blob_id"
+	var cleared int64
+	payloadTables := []string{
+		"comments",
+		"pull_request_details",
+		"pull_request_files",
+		"pull_request_commits",
+		"pull_request_checks",
+		"github_workflow_runs",
+	}
+	for _, table := range payloadTables {
+		if !s.hasColumn(ctx, table, rawCol) {
+			continue
+		}
+		tbl, col := sqliteIdentifier(table), sqliteIdentifier(rawCol)
+		res, err := s.db.ExecContext(ctx, `update `+tbl+` set `+col+` = '' where `+col+` is not null and `+col+` != ''`)
+		if err != nil {
+			return cleared, fmt.Errorf("clear portable raw json %s.%s: %w", table, rawCol, err)
+		}
+		cleared += rowsAffected(res)
+	}
+	for _, table := range []string{"comments", "thread_revisions"} {
+		if !s.hasColumn(ctx, table, blobCol) {
+			continue
+		}
+		tbl, col := sqliteIdentifier(table), sqliteIdentifier(blobCol)
+		if _, err := s.db.ExecContext(ctx, `update `+tbl+` set `+col+` = null where `+col+` is not null`); err != nil {
+			return cleared, fmt.Errorf("clear portable raw blob pointer %s.%s: %w", table, blobCol, err)
+		}
+	}
+	return cleared, nil
+}
+
 func canonicalPortableDroppedTables() []string {
 	return []string{
 		"documents_fts",
@@ -205,7 +273,6 @@ func canonicalPortableDroppedTables() []string {
 		"documents_fts_data",
 		"documents_fts_docsize",
 		"documents_fts_idx",
-		"comments",
 		"documents",
 		"document_embeddings",
 		"document_summaries",
diff --git a/internal/store/store_test.go b/internal/store/store_test.go
index 88f997c..9ef3518 100644
--- a/internal/store/store_test.go
+++ b/internal/store/store_test.go
@@ -6,6 +6,7 @@ import (
 	"database/sql"
 	"os"
 	"path/filepath"
+	"strings"
 	"testing"
 	"time"
 )
@@ -456,7 +457,19 @@ func TestPrunePortablePayloads(t *testing.T) {
 		insert into repositories(id, owner, name, full_name, raw_json, updated_at)
 		values(1, 'openclaw', 'gitcrawl', 'openclaw/gitcrawl', '{"id":1}', '2026-04-26T00:00:00Z');
 		insert into threads(id, repo_id, github_id, number, kind, state, title, body, html_url, labels_json, assignees_json, raw_json, content_hash, updated_at)
-		values(1, 1, '1', 1, 'issue', 'open', 'download stalls', 'abcdefghijklmnopqrstuvwxyz', 'https://github.com/openclaw/gitcrawl/issues/1', '[]', '[]', '{"body":"abcdefghijklmnopqrstuvwxyz"}', 'hash', '2026-04-26T00:00:00Z');
+		values(1, 1, '1', 1, 'pull_request', 'open', 'download stalls', 'abcdefghijklmnopqrstuvwxyz', 'https://github.com/openclaw/gitcrawl/pull/1', '[]', '[]', '{"body":"abcdefghijklmnopqrstuvwxyz"}', 'hash', '2026-04-26T00:00:00Z');
+		insert into comments(id, thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh)
+		values(1, 1, 'c1', 'issue_comment', 'alice', 'User', 'comment abcdefghijklmnopqrstuvwxyz', 0, '{"body":"comment abcdefghijklmnopqrstuvwxyz"}', '2026-04-26T00:00:00Z', '2026-04-26T00:00:00Z');
+		insert into pull_request_details(thread_id, repo_id, number, base_sha, head_sha, additions, deletions, changed_files, raw_json, fetched_at, updated_at)
+		values(1, 1, 1, 'base', 'head', 10, 2, 1, '{"mergeable":true}', '2026-04-26T00:00:00Z', '2026-04-26T00:00:00Z');
+		insert into pull_request_files(thread_id, path, status, additions, deletions, changes, patch, raw_json, fetched_at)
+		values(1, 'README.md', 'modified', 10, 2, 12, '@@ patch', '{"filename":"README.md"}', '2026-04-26T00:00:00Z');
+		insert into pull_request_commits(thread_id, sha, message, raw_json, fetched_at)
+		values(1, 'abc123', 'fix download stall', '{"sha":"abc123"}', '2026-04-26T00:00:00Z');
+		insert into pull_request_checks(thread_id, name, status, conclusion, details_url, raw_json, fetched_at)
+		values(1, 'CI', 'completed', 'success', 'https://example.test/check', '{"name":"CI"}', '2026-04-26T00:00:00Z');
+		insert into github_workflow_runs(repo_id, run_id, run_number, head_branch, head_sha, status, conclusion, workflow_name, html_url, raw_json, fetched_at)
+		values(1, '99', 99, 'main', 'head', 'completed', 'success', 'CI', 'https://example.test/run', '{"id":99}', '2026-04-26T00:00:00Z');
 		insert into documents(thread_id, title, body, raw_text, dedupe_text, updated_at)
 		values(1, 'download stalls', 'abcdefghijklmnopqrstuvwxyz', 'download stalls abcdefghijklmnopqrstuvwxyz', 'download stalls', '2026-04-26T00:00:00Z');
 		insert into thread_revisions(thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at)
@@ -472,7 +485,7 @@ func TestPrunePortablePayloads(t *testing.T) {
 	if err != nil {
 		t.Fatalf("prune: %v", err)
 	}
-	if stats.DocumentsDeleted != 1 || stats.FingerprintsPruned != 1 {
+	if stats.DocumentsDeleted != 1 || stats.FingerprintsPruned != 1 || stats.CommentsPruned != 1 || stats.RawJSONPruned == 0 {
 		t.Fatalf("unexpected stats: %#v", stats)
 	}
 
@@ -489,12 +502,59 @@ func TestPrunePortablePayloads(t *testing.T) {
 	if err := st.DB().QueryRowContext(ctx, `select body_excerpt from threads where id = 1`).Scan(&bodyExcerpt); err != nil {
 		t.Fatalf("thread body excerpt: %v", err)
 	}
+	var bodyLength int
+	if err := st.DB().QueryRowContext(ctx, `select body_length from threads where id = 1`).Scan(&bodyLength); err != nil {
+		t.Fatalf("thread body length: %v", err)
+	}
+	if bodyLength != 26 {
+		t.Fatalf("thread body_length = %d, want 26", bodyLength)
+	}
 	if err := st.DB().QueryRowContext(ctx, `select title_tokens_json, linked_refs_json, module_buckets_json, feature_json from thread_fingerprints where id = 1`).Scan(&titleTokens, &linkedRefs, &buckets, &features); err != nil {
 		t.Fatalf("fingerprint payload: %v", err)
 	}
 	if st.tableExists(ctx, "documents") {
 		t.Fatal("documents table was not dropped")
 	}
+	if !st.tableExists(ctx, "comments") {
+		t.Fatal("comments table was dropped")
+	}
+	var commentBody, commentExcerpt, commentRawJSON string
+	var commentBodyLength int
+	if err := st.DB().QueryRowContext(ctx, `select body, body_excerpt, body_length, raw_json from comments where id = 1`).Scan(&commentBody, &commentExcerpt, &commentBodyLength, &commentRawJSON); err != nil {
+		t.Fatalf("comment portable payload: %v", err)
+	}
+	if commentBody != "comment " || commentExcerpt != "comment " || commentBodyLength != 34 || commentRawJSON != "" {
+		t.Fatalf("comment not pruned: body=%q excerpt=%q length=%d raw=%q", commentBody, commentExcerpt, commentBodyLength, commentRawJSON)
+	}
+	var prDetailCount, prFileCount, prCommitCount, prCheckCount, runCount int
+	if err := st.DB().QueryRowContext(ctx, `select count(*) from pull_request_details where raw_json = ''`).Scan(&prDetailCount); err != nil {
+		t.Fatalf("pr detail count: %v", err)
+	}
+	if err := st.DB().QueryRowContext(ctx, `select count(*) from pull_request_files where raw_json = ''`).Scan(&prFileCount); err != nil {
+		t.Fatalf("pr file count: %v", err)
+	}
+	if err := st.DB().QueryRowContext(ctx, `select count(*) from pull_request_commits where raw_json = ''`).Scan(&prCommitCount); err != nil {
+		t.Fatalf("pr commit count: %v", err)
+	}
+	if err := st.DB().QueryRowContext(ctx, `select count(*) from pull_request_checks where raw_json = ''`).Scan(&prCheckCount); err != nil {
+		t.Fatalf("pr check count: %v", err)
+	}
+	if err := st.DB().QueryRowContext(ctx, `select count(*) from github_workflow_runs where raw_json = ''`).Scan(&runCount); err != nil {
+		t.Fatalf("workflow run count: %v", err)
+	}
+	if prDetailCount != 1 || prFileCount != 1 || prCommitCount != 1 || prCheckCount != 1 || runCount != 1 {
+		t.Fatalf("pr/run rows not retained: detail=%d files=%d commits=%d checks=%d runs=%d", prDetailCount, prFileCount, prCommitCount, prCheckCount, runCount)
+	}
+	var portableSchema, capabilities string
+	if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'schema'`).Scan(&portableSchema); err != nil {
+		t.Fatalf("portable schema metadata: %v", err)
+	}
+	if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'capabilities'`).Scan(&capabilities); err != nil {
+		t.Fatalf("portable capabilities metadata: %v", err)
+	}
+	if portableSchema != "gitcrawl-portable-sync-v2" || !strings.Contains(capabilities, "comment_excerpts") || !strings.Contains(capabilities, "workflow_runs") {
+		t.Fatalf("portable metadata schema=%q capabilities=%q", portableSchema, capabilities)
+	}
 	if bodyExcerpt != "abcdefgh" || titleTokens != "[]" || linkedRefs != "[]" || buckets != "[]" || features != "{}" {
 		t.Fatalf("payloads not pruned: bodyExcerpt=%q titleTokens=%q linkedRefs=%q buckets=%q features=%q", bodyExcerpt, titleTokens, linkedRefs, buckets, features)
 	}