From 69a4bc56f78f733ed168911997e43338e562eeaa Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Tue, 5 May 2026 05:21:13 +0100 Subject: [PATCH] feat(portable): add v2 backup data --- CHANGELOG.md | 3 +- docs/commands.md | 2 +- docs/concepts.md | 2 +- docs/portable-stores.md | 22 +++-- docs/refresh-and-embed.md | 2 +- internal/cli/gh_shim_detail_test.go | 17 +++- internal/cli/gh_shim_prcache.go | 35 ++++++++ internal/cli/gh_shim_test.go | 26 ++++++ internal/store/comments.go | 36 ++++++++ internal/store/coverage_test.go | 19 ++++ internal/store/portable.go | 133 +++++++++++++++++++++------- internal/store/store_test.go | 64 ++++++++++++- 12 files changed, 315 insertions(+), 46 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c41fb38..d1f9361 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,12 +1,13 @@ # Changelog -## 0.1.3 - Unreleased +## 0.2.0 - Unreleased - Force embedding refreshes when the embedding input rune cap changes, so stale larger-cap vectors are not reused. - Expand the `gh` shim with local list filters, PR diff caching by cached head SHA, xcache GC, hit/miss/write counters, and throttled portable-store refreshes to reduce GitHub API pressure across agent sessions. - Add explicit PR-detail hydration for files, commits, checks, and workflow runs so `gh pr view`, `gh pr checks`, and `gh run list/view` can answer common review reads from the existing SQLite cache. - Auto-hydrate one exact pull request when local PR detail reads miss or check/run data is stale, using `gh auth token` if `GITHUB_TOKEN` is absent, then retry from SQLite before falling back to live `gh`. - Cache more ghx-style read-only fallthroughs, including release, workflow, secret, variable, project, ruleset, gist, org, and search reads; cache repeat read failures by default; and clear the fallthrough cache after the corresponding mutating `gh` commands. 
+- Promote portable backups to the v2 format: keep compact comments, PR files, commits, checks, and workflow runs while stripping raw JSON, generated documents, vectors, clusters, and run history. ## 0.1.2 - 2026-05-01 diff --git a/docs/commands.md b/docs/commands.md index b2cd642..a92b669 100644 --- a/docs/commands.md +++ b/docs/commands.md @@ -109,7 +109,7 @@ The shim binary can be installed standalone by symlinking the `gitcrawl` binary | Command | Purpose | Docs | | --- | --- | --- | -| `gitcrawl portable prune [--body-chars <n> --no-vacuum --json]` | Truncate thread bodies and (optionally) `VACUUM` for a small publishable database | [Portable stores](/portable-stores/#publishing-gitcrawl-portable-prune) | +| `gitcrawl portable prune [--body-chars <n> --no-vacuum --json]` | Build a compact portable v2 backup and (optionally) `VACUUM` for publishing | [Portable stores](/portable-stores/#publishing-gitcrawl-portable-prune) | ## Not yet implemented diff --git a/docs/concepts.md b/docs/concepts.md index 62ef77f..09f5ef7 100644 --- a/docs/concepts.md +++ b/docs/concepts.md @@ -88,7 +88,7 @@ Every sync, embed, and cluster operation records a **run** in `run_records` with A **portable store** is a Git-backed publish target for a `gitcrawl.db` plus its derived bodies, designed for sharing a local cache across agents or machines without a hosted service. -`gitcrawl init --portable-store https://github.com/org/repo` clones a portable store into `~/.config/gitcrawl/portable/`, points the runtime at it, and `gitcrawl portable prune --body-chars 256` keeps the published payload small. Read-only commands run against portable stores refresh the checkout before reading. See [Portable stores](/portable-stores/). 
+`gitcrawl init --portable-store https://github.com/org/repo` clones a portable store into `~/.config/gitcrawl/portable/`, points the runtime at it, and `gitcrawl portable prune --body-chars 256` keeps the published payload small while retaining comments, PR details, checks, and workflow runs. Read-only commands run against portable stores refresh the checkout before reading. See [Portable stores](/portable-stores/). ## Cache diff --git a/docs/portable-stores.md b/docs/portable-stores.md index 88c8575..2a22ea0 100644 --- a/docs/portable-stores.md +++ b/docs/portable-stores.md @@ -19,7 +19,7 @@ A Git-backed publish target for a `gitcrawl.db` plus its derived bodies — shar - You want a backup of the SQLite cache that someone else can clone and use immediately. - You want a deterministic snapshot of "what gitcrawl knew at time T" for reproducible triage. -A portable store is just a Git repository whose contents include a SQLite database (and optionally derived bodies and vectors). Anyone with read access to the repository can `git clone` it and have a fully populated gitcrawl mirror in seconds. +A portable store is just a Git repository whose contents include a SQLite database. Anyone with read access to the repository can `git clone` it and have a fully populated gitcrawl mirror in seconds. ## Setup: pointing gitcrawl at a portable store @@ -68,11 +68,21 @@ gitcrawl portable prune --body-chars 512 --no-vacuum gitcrawl portable prune --json ``` -`prune` truncates thread bodies in the database to the requested character cap and (by default) runs SQLite `VACUUM` to reclaim space. The result is a smaller database suitable for committing back to Git. +`prune` converts the database into the portable v2 backup format and (by default) runs SQLite `VACUUM` to reclaim space. The result is a smaller database suitable for committing back to Git. 
+ +Portable v2 keeps the data agents most often need for offline GitHub reads: + +- Repositories, issues, pull requests, labels, authors, and timestamps +- Compact issue/PR body excerpts plus original body lengths +- Compact comments, reviews, and review-comment excerpts plus original body lengths +- PR details, files, commits, status checks, and workflow runs +- Thread fingerprints used by duplicate and cluster-oriented workflows + +It strips the data that is large, easy to regenerate, or mainly useful for exact API replay: raw GitHub JSON, generated documents and FTS indexes, embeddings and vectors, code snapshots and diff blobs, cluster run history, similarity edges, and blob storage. The database records this contract in `portable_metadata` with `schema=gitcrawl-portable-sync-v2`, `includes`, `excluded`, and `capabilities` keys. | Flag | Default | Description | | --- | --- | --- | -| `--body-chars <n>` | `256` | Maximum body characters to keep per thread | +| `--body-chars <n>` | `256` | Maximum body characters to keep per thread/comment excerpt | | `--no-vacuum` | _(off)_ | Skip the post-prune `VACUUM` | | `--json` | _(off)_ | JSON output | @@ -100,10 +110,12 @@ Other agents and machines pull the new commit on their next read-only command. `gitcrawl search` (and the gh-shim's search) work against portable-store data with one wrinkle: when the portable store has been pruned, generated document indexes may not be present. Search falls back to compact thread title/body data automatically — you keep useful results without the publisher needing to ship the full document indexes. +The v2 backup also keeps comments and PR-detail tables, so common shim reads such as `gh issue view --json comments`, `gh pr view --json files,commits,statusCheckRollup`, `gh pr checks`, and `gh run list` can be answered from the shared checkout when those details were synced before publishing. + ## Caveats -- The portable store carries the SQLite database. 
It does not carry the runtime cache or the vector store unless you explicitly publish them. -- Vectors regenerated on each consumer's machine after `embed` are not shared; if you want shared vectors, publish the `vectors/` directory alongside the database. +- The portable store carries the SQLite database. It does not carry the runtime fallthrough cache. +- Vectors regenerated on each consumer's machine after `embed` are not shared; portable pruning removes vector tables from the published database. - Portable stores are read-mostly. Multiple writers pushing concurrently will race the way any Git workflow does — gate writes through a single publisher or a CI workflow. ## See also diff --git a/docs/refresh-and-embed.md b/docs/refresh-and-embed.md index bc8a7df..2ab806e 100644 --- a/docs/refresh-and-embed.md +++ b/docs/refresh-and-embed.md @@ -127,4 +127,4 @@ Each row carries `started_at`, `finished_at`, `status`, and `stats_json` — use - **GitHub.** Sync uses standard REST endpoints; the API quota is the dominant cost on busy repos. Use `--include-comments` and `--with pr-details` selectively. - **OpenAI.** `text-embedding-3-small` is inexpensive but not free. `embed` is bounded by `--limit` if you want to stay under a budget on initial backfills. -- **Disk.** Vectors and PR detail blobs grow with the repo. The portable-store flow includes `gitcrawl portable prune` to keep published payloads small — see [Portable stores](/portable-stores/). +- **Disk.** Vectors, generated documents, and raw API payloads grow with the repo. The portable-store flow includes `gitcrawl portable prune` to keep published payloads small while retaining compact comments and PR details — see [Portable stores](/portable-stores/). 
diff --git a/internal/cli/gh_shim_detail_test.go b/internal/cli/gh_shim_detail_test.go index 320f842..ea09b4d 100644 --- a/internal/cli/gh_shim_detail_test.go +++ b/internal/cli/gh_shim_detail_test.go @@ -19,14 +19,15 @@ func TestGHShimViewAndListUseLocalCache(t *testing.T) { run := New() var stdout bytes.Buffer run.Stdout = &stdout - if err := run.Run(ctx, []string{"--config", configPath, "gh", "pr", "view", "12", "-R", "openclaw/openclaw", "--json", "number,title,isDraft,author"}); err != nil { + if err := run.Run(ctx, []string{"--config", configPath, "gh", "pr", "view", "12", "-R", "openclaw/openclaw", "--json", "number,title,isDraft,author,comments"}); err != nil { t.Fatalf("gh pr view: %v", err) } var view map[string]any if err := json.Unmarshal(stdout.Bytes(), &view); err != nil { t.Fatalf("decode view: %v\n%s", err, stdout.String()) } - if int(view["number"].(float64)) != 12 || view["isDraft"] != true { + comments := view["comments"].([]any) + if int(view["number"].(float64)) != 12 || view["isDraft"] != true || len(comments) != 1 || comments[0].(map[string]any)["body"] != "cache path looks good" { t.Fatalf("view = %#v", view) } @@ -89,6 +90,18 @@ func TestGHShimViewAndListUseLocalCache(t *testing.T) { t.Fatalf("list = %#v", list) } + stdout.Reset() + if err := run.Run(ctx, []string{"--config", configPath, "gh", "issue", "view", "10", "-R", "openclaw/openclaw", "--json", "number,comments"}); err != nil { + t.Fatalf("gh issue view comments: %v", err) + } + if err := json.Unmarshal(stdout.Bytes(), &view); err != nil { + t.Fatalf("decode issue comments: %v\n%s", err, stdout.String()) + } + comments = view["comments"].([]any) + if len(comments) != 1 || comments[0].(map[string]any)["body"] != "same hot loop here" { + t.Fatalf("issue comments = %#v", view) + } + stdout.Reset() if err := run.Run(ctx, []string{"--config", configPath, "gh", "issue", "list", "-R", "openclaw/openclaw", "--author", "alice", "--assignee", "peter", "--label", "bug", "--json", 
"number,title"}); err != nil { t.Fatalf("gh issue list filtered: %v", err) diff --git a/internal/cli/gh_shim_prcache.go b/internal/cli/gh_shim_prcache.go index a60789c..1b4aee7 100644 --- a/internal/cli/gh_shim_prcache.go +++ b/internal/cli/gh_shim_prcache.go @@ -18,6 +18,14 @@ func (a *App) ghThreadViewJSONRow(ctx context.Context, repoValue string, thread row := make(map[string]any, len(fields)) var cache *store.PullRequestCache for _, field := range fields { + if field == "comments" { + comments, err := a.localGHThreadComments(ctx, thread.ID) + if err != nil { + return nil, err + } + row[field] = ghCommentsJSONValue(comments) + continue + } value, err := ghSearchJSONValue(thread, field) if err == nil { row[field] = value @@ -63,6 +71,33 @@ func (a *App) localGHPullRequestCache(ctx context.Context, repoValue string, num return cache, nil } +func (a *App) localGHThreadComments(ctx context.Context, threadID int64) ([]store.Comment, error) { + rt, err := a.openLocalRuntimeReadOnly(ctx) + if err != nil { + return nil, localGHUnsupported(err) + } + defer rt.Store.Close() + comments, err := rt.Store.ListComments(ctx, threadID) + if err != nil { + return nil, localGHUnsupported(err) + } + return comments, nil +} + +func ghCommentsJSONValue(comments []store.Comment) []map[string]any { + out := make([]map[string]any, 0, len(comments)) + for _, comment := range comments { + out = append(out, map[string]any{ + "id": comment.GitHubID, + "author": map[string]any{"login": comment.AuthorLogin, "type": comment.AuthorType}, + "body": comment.Body, + "createdAt": comment.CreatedAtGitHub, + "updatedAt": comment.UpdatedAtGitHub, + }) + } + return out +} + func ghPRDetailJSONValue(thread store.Thread, cache store.PullRequestCache, field string) (any, error) { switch field { case "files": diff --git a/internal/cli/gh_shim_test.go b/internal/cli/gh_shim_test.go index 7ebddff..a155e0b 100644 --- a/internal/cli/gh_shim_test.go +++ b/internal/cli/gh_shim_test.go @@ -119,6 +119,19 @@ func 
seedGHShimRepo(t *testing.T, ctx context.Context) string { if _, err := st.UpsertDocument(ctx, store.Document{ThreadID: issueID, Title: "Hot loop burns CPU", RawText: "runtime hot loop burns CPU", DedupeText: "runtime hot loop burns cpu", UpdatedAt: "2026-04-27T01:00:00Z"}); err != nil { t.Fatalf("seed issue document: %v", err) } + if _, err := st.UpsertComment(ctx, store.Comment{ + ThreadID: issueID, + GitHubID: "1001", + CommentType: "issue_comment", + AuthorLogin: "carol", + AuthorType: "User", + Body: "same hot loop here", + RawJSON: "{}", + CreatedAtGitHub: "2026-04-27T01:10:00Z", + UpdatedAtGitHub: "2026-04-27T01:10:00Z", + }); err != nil { + t.Fatalf("seed issue comment: %v", err) + } prID, err := st.UpsertThread(ctx, store.Thread{ RepoID: repoID, GitHubID: "12", @@ -143,6 +156,19 @@ func seedGHShimRepo(t *testing.T, ctx context.Context) string { if _, err := st.UpsertDocument(ctx, store.Document{ThreadID: prID, Title: "Manifest cache update", RawText: "manifest cache refresh", DedupeText: "manifest cache refresh", UpdatedAt: "2026-04-27T02:00:00Z"}); err != nil { t.Fatalf("seed pr document: %v", err) } + if _, err := st.UpsertComment(ctx, store.Comment{ + ThreadID: prID, + GitHubID: "1201", + CommentType: "review_comment", + AuthorLogin: "dana", + AuthorType: "User", + Body: "cache path looks good", + RawJSON: "{}", + CreatedAtGitHub: "2026-04-27T02:10:00Z", + UpdatedAtGitHub: "2026-04-27T02:10:00Z", + }); err != nil { + t.Fatalf("seed pr comment: %v", err) + } fetchedAt := time.Now().UTC().Format(time.RFC3339Nano) if err := st.UpsertPullRequestCache(ctx, store.PullRequestDetail{ ThreadID: prID, diff --git a/internal/store/comments.go b/internal/store/comments.go index 3d0d0c3..9fbdf3c 100644 --- a/internal/store/comments.go +++ b/internal/store/comments.go @@ -2,6 +2,7 @@ package store import ( "context" + "database/sql" "fmt" ) @@ -40,3 +41,38 @@ func (s *Store) UpsertComment(ctx context.Context, comment Comment) (int64, erro } return id, nil } + +func (s 
*Store) ListComments(ctx context.Context, threadID int64) ([]Comment, error) { + if !s.tableExists(ctx, "comments") { + return nil, nil + } + rows, err := s.q().QueryContext(ctx, ` + select id, thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh + from comments + where thread_id = ? + order by created_at_gh, id + `, threadID) + if err != nil { + return nil, fmt.Errorf("list comments: %w", err) + } + defer rows.Close() + var comments []Comment + for rows.Next() { + var comment Comment + var authorLogin, authorType, createdAt, updatedAt sql.NullString + var isBot int + if err := rows.Scan(&comment.ID, &comment.ThreadID, &comment.GitHubID, &comment.CommentType, &authorLogin, &authorType, &comment.Body, &isBot, &comment.RawJSON, &createdAt, &updatedAt); err != nil { + return nil, fmt.Errorf("scan comment: %w", err) + } + comment.AuthorLogin = authorLogin.String + comment.AuthorType = authorType.String + comment.IsBot = isBot != 0 + comment.CreatedAtGitHub = createdAt.String + comment.UpdatedAtGitHub = updatedAt.String + comments = append(comments, comment) + } + if err := rows.Err(); err != nil { + return nil, fmt.Errorf("iterate comments: %w", err) + } + return comments, nil +} diff --git a/internal/store/coverage_test.go b/internal/store/coverage_test.go index dcb28ad..47ca16d 100644 --- a/internal/store/coverage_test.go +++ b/internal/store/coverage_test.go @@ -205,6 +205,9 @@ func TestPortablePruneCanonicalizesSchemaAndMetadata(t *testing.T) { if err := st.UpsertThreadVector(ctx, ThreadVector{ThreadID: threadIDs[0], Basis: "title_original", Model: "test", Dimensions: 2, ContentHash: "hash", Vector: []float64{1, 0}, CreatedAt: "2026-04-30T00:00:00Z", UpdatedAt: "2026-04-30T00:00:00Z"}); err != nil { t.Fatalf("upsert vector: %v", err) } + if _, err := st.UpsertComment(ctx, Comment{ThreadID: threadIDs[0], GitHubID: "c1", CommentType: "issue_comment", AuthorLogin: "alice", Body: "portable comment body", 
RawJSON: `{"body":"portable comment body"}`, CreatedAtGitHub: "2026-04-30T00:00:00Z", UpdatedAtGitHub: "2026-04-30T00:00:00Z"}); err != nil { + t.Fatalf("upsert comment: %v", err) + } if _, err := st.DB().ExecContext(ctx, `insert into sync_runs(repo_id, scope, status, started_at, finished_at, stats_json) values(?, 'open', 'success', '2026-04-30T00:00:00Z', '2026-04-30T00:01:00Z', '{}')`, repoID); err != nil { t.Fatalf("seed sync run: %v", err) } @@ -218,6 +221,22 @@ func TestPortablePruneCanonicalizesSchemaAndMetadata(t *testing.T) { if !st.tableExists(ctx, "portable_metadata") || st.hasColumn(ctx, "threads", "body") { t.Fatalf("portable schema was not canonicalized") } + if !st.tableExists(ctx, "comments") { + t.Fatalf("comments should remain in portable v2") + } + var schema, includes, excluded string + if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'schema'`).Scan(&schema); err != nil { + t.Fatalf("schema metadata: %v", err) + } + if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'includes'`).Scan(&includes); err != nil { + t.Fatalf("includes metadata: %v", err) + } + if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'excluded'`).Scan(&excluded); err != nil { + t.Fatalf("excluded metadata: %v", err) + } + if schema != "gitcrawl-portable-sync-v2" || !strings.Contains(includes, "comments") || strings.Contains(excluded, "comments") { + t.Fatalf("portable metadata schema=%q includes=%q excluded=%q", schema, includes, excluded) + } if err := st.Close(); err != nil { t.Fatalf("close store: %v", err) } diff --git a/internal/store/portable.go b/internal/store/portable.go index 54b22bf..a66fc68 100644 --- a/internal/store/portable.go +++ b/internal/store/portable.go @@ -20,7 +20,9 @@ type PortablePruneStats struct { BytesBefore int64 `json:"bytes_before"` BytesAfter int64 `json:"bytes_after"` ThreadsPruned int64 `json:"threads_pruned"` + CommentsPruned 
int64 `json:"comments_pruned"` RepositoriesPruned int64 `json:"repositories_pruned"` + RawJSONPruned int64 `json:"raw_json_pruned"` FingerprintsPruned int64 `json:"fingerprints_pruned"` DocumentsDeleted int64 `json:"documents_deleted"` DocumentsFTSRebuilt bool `json:"documents_fts_rebuilt"` @@ -43,34 +45,25 @@ func (s *Store) PrunePortablePayloads(ctx context.Context, options PortablePrune } if s.hasColumn(ctx, "threads", "body") { - if s.hasColumn(ctx, "threads", "body_excerpt") && s.hasColumn(ctx, "threads", "body_length") { - if result, err := s.db.ExecContext(ctx, ` - update threads - set body_length = case when body is not null then length(body) else body_length end, - body_excerpt = case - when body is not null and length(body) > ? then substr(body, 1, ?) - when body is not null then body - else body_excerpt - end - where body is not null - `, options.BodyChars, options.BodyChars); err != nil { - return stats, fmt.Errorf("prune thread body excerpts: %w", err) - } else { - stats.ThreadsPruned += rowsAffected(result) - } - if _, err := s.db.ExecContext(ctx, `update threads set body = body_excerpt`); err != nil { - return stats, fmt.Errorf("replace thread bodies with excerpts: %w", err) - } + if err := s.ensurePortableExcerptColumns(ctx, "threads"); err != nil { + return stats, err + } + if result, err := s.db.ExecContext(ctx, ` + update threads + set body_length = case when body is not null then length(body) else body_length end, + body_excerpt = case + when body is not null and length(body) > ? then substr(body, 1, ?) + when body is not null then body + else body_excerpt + end + where body is not null + `, options.BodyChars, options.BodyChars); err != nil { + return stats, fmt.Errorf("prune thread body excerpts: %w", err) } else { - if result, err := s.db.ExecContext(ctx, ` - update threads - set body = case when length(body) > ? then substr(body, 1, ?) 
else body end - where body is not null - `, options.BodyChars, options.BodyChars); err != nil { - return stats, fmt.Errorf("trim thread bodies: %w", err) - } else { - stats.ThreadsPruned += rowsAffected(result) - } + stats.ThreadsPruned += rowsAffected(result) + } + if _, err := s.db.ExecContext(ctx, `update threads set body = body_excerpt`); err != nil { + return stats, fmt.Errorf("replace thread bodies with excerpts: %w", err) } } if s.hasColumn(ctx, "threads", "raw_json") { @@ -85,6 +78,26 @@ func (s *Store) PrunePortablePayloads(ctx context.Context, options PortablePrune } stats.RepositoriesPruned = rowsAffected(result) } + if s.tableExists(ctx, "comments") && s.hasColumn(ctx, "comments", "body") { + if err := s.ensurePortableExcerptColumns(ctx, "comments"); err != nil { + return stats, err + } + if result, err := s.db.ExecContext(ctx, ` + update comments + set body_length = length(body), + body_excerpt = case when length(body) > ? then substr(body, 1, ?) else body end, + body = case when length(body) > ? then substr(body, 1, ?) 
else body end + `, options.BodyChars, options.BodyChars, options.BodyChars, options.BodyChars); err != nil { + return stats, fmt.Errorf("prune comment bodies: %w", err) + } else { + stats.CommentsPruned = rowsAffected(result) + } + } + if pruned, err := s.clearPortableRawJSON(ctx); err != nil { + return stats, err + } else { + stats.RawJSONPruned = pruned + } if s.tableExists(ctx, "thread_fingerprints") { result, err := s.db.ExecContext(ctx, ` update thread_fingerprints @@ -180,11 +193,13 @@ func (s *Store) canonicalizePortableSchema(ctx context.Context, bodyChars int, s return fmt.Errorf("ensure portable metadata: %w", err) } metadata := map[string]string{ - "schema": "ghcrawl-portable-sync-v1", - "body_chars": fmt.Sprintf("%d", bodyChars), - "excluded": "raw_json,comments,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs", - "exported_at": time.Now().UTC().Format(timeLayout), - "source_path": s.path, + "schema": "gitcrawl-portable-sync-v2", + "body_chars": fmt.Sprintf("%d", bodyChars), + "capabilities": "body_excerpts,comment_excerpts,pr_details,pr_files,pr_commits,pr_checks,workflow_runs,raw_json_stripped", + "includes": "repositories,threads,comments,pull_request_details,pull_request_files,pull_request_commits,pull_request_checks,github_workflow_runs,thread_fingerprints", + "excluded": "raw_json,documents,fts,vectors,code_snapshots,cluster_events,run_history,similarity_edges,blobs", + "exported_at": time.Now().UTC().Format(timeLayout), + "source_path": s.path, } for key, value := range metadata { if _, err := s.db.ExecContext(ctx, ` @@ -198,6 +213,59 @@ func (s *Store) canonicalizePortableSchema(ctx context.Context, bodyChars int, s return nil } +func (s *Store) ensurePortableExcerptColumns(ctx context.Context, table string) error { + if !s.hasColumn(ctx, table, "body_excerpt") { + if _, err := s.db.ExecContext(ctx, `alter table `+sqliteIdentifier(table)+` add column body_excerpt text`); err != nil { + return fmt.Errorf("add 
portable %s.body_excerpt: %w", table, err) + } + } + if !s.hasColumn(ctx, table, "body_length") { + if _, err := s.db.ExecContext(ctx, `alter table `+sqliteIdentifier(table)+` add column body_length integer not null default 0`); err != nil { + return fmt.Errorf("add portable %s.body_length: %w", table, err) + } + } + return nil +} + +func (s *Store) clearPortableRawJSON(ctx context.Context) (int64, error) { + var total int64 + for _, column := range []struct { + table string + name string + }{ + {table: "comments", name: "raw_json"}, + {table: "pull_request_details", name: "raw_json"}, + {table: "pull_request_files", name: "raw_json"}, + {table: "pull_request_commits", name: "raw_json"}, + {table: "pull_request_checks", name: "raw_json"}, + {table: "github_workflow_runs", name: "raw_json"}, + } { + if !s.hasColumn(ctx, column.table, column.name) { + continue + } + result, err := s.db.ExecContext(ctx, `update `+sqliteIdentifier(column.table)+` set `+sqliteIdentifier(column.name)+` = '' where `+sqliteIdentifier(column.name)+` is not null and `+sqliteIdentifier(column.name)+` != ''`) + if err != nil { + return total, fmt.Errorf("clear portable raw json %s.%s: %w", column.table, column.name, err) + } + total += rowsAffected(result) + } + for _, column := range []struct { + table string + name string + }{ + {table: "comments", name: "raw_json_blob_id"}, + {table: "thread_revisions", name: "raw_json_blob_id"}, + } { + if !s.hasColumn(ctx, column.table, column.name) { + continue + } + if _, err := s.db.ExecContext(ctx, `update `+sqliteIdentifier(column.table)+` set `+sqliteIdentifier(column.name)+` = null where `+sqliteIdentifier(column.name)+` is not null`); err != nil { + return total, fmt.Errorf("clear portable raw blob pointer %s.%s: %w", column.table, column.name, err) + } + } + return total, nil +} + func canonicalPortableDroppedTables() []string { return []string{ "documents_fts", @@ -205,7 +273,6 @@ func canonicalPortableDroppedTables() []string { 
"documents_fts_data", "documents_fts_docsize", "documents_fts_idx", - "comments", "documents", "document_embeddings", "document_summaries", diff --git a/internal/store/store_test.go b/internal/store/store_test.go index 88f997c..9ef3518 100644 --- a/internal/store/store_test.go +++ b/internal/store/store_test.go @@ -6,6 +6,7 @@ import ( "database/sql" "os" "path/filepath" + "strings" "testing" "time" ) @@ -456,7 +457,19 @@ func TestPrunePortablePayloads(t *testing.T) { insert into repositories(id, owner, name, full_name, raw_json, updated_at) values(1, 'openclaw', 'gitcrawl', 'openclaw/gitcrawl', '{"id":1}', '2026-04-26T00:00:00Z'); insert into threads(id, repo_id, github_id, number, kind, state, title, body, html_url, labels_json, assignees_json, raw_json, content_hash, updated_at) - values(1, 1, '1', 1, 'issue', 'open', 'download stalls', 'abcdefghijklmnopqrstuvwxyz', 'https://github.com/openclaw/gitcrawl/issues/1', '[]', '[]', '{"body":"abcdefghijklmnopqrstuvwxyz"}', 'hash', '2026-04-26T00:00:00Z'); + values(1, 1, '1', 1, 'pull_request', 'open', 'download stalls', 'abcdefghijklmnopqrstuvwxyz', 'https://github.com/openclaw/gitcrawl/pull/1', '[]', '[]', '{"body":"abcdefghijklmnopqrstuvwxyz"}', 'hash', '2026-04-26T00:00:00Z'); + insert into comments(id, thread_id, github_id, comment_type, author_login, author_type, body, is_bot, raw_json, created_at_gh, updated_at_gh) + values(1, 1, 'c1', 'issue_comment', 'alice', 'User', 'comment abcdefghijklmnopqrstuvwxyz', 0, '{"body":"comment abcdefghijklmnopqrstuvwxyz"}', '2026-04-26T00:00:00Z', '2026-04-26T00:00:00Z'); + insert into pull_request_details(thread_id, repo_id, number, base_sha, head_sha, additions, deletions, changed_files, raw_json, fetched_at, updated_at) + values(1, 1, 1, 'base', 'head', 10, 2, 1, '{"mergeable":true}', '2026-04-26T00:00:00Z', '2026-04-26T00:00:00Z'); + insert into pull_request_files(thread_id, path, status, additions, deletions, changes, patch, raw_json, fetched_at) + values(1, 'README.md', 
'modified', 10, 2, 12, '@@ patch', '{"filename":"README.md"}', '2026-04-26T00:00:00Z'); + insert into pull_request_commits(thread_id, sha, message, raw_json, fetched_at) + values(1, 'abc123', 'fix download stall', '{"sha":"abc123"}', '2026-04-26T00:00:00Z'); + insert into pull_request_checks(thread_id, name, status, conclusion, details_url, raw_json, fetched_at) + values(1, 'CI', 'completed', 'success', 'https://example.test/check', '{"name":"CI"}', '2026-04-26T00:00:00Z'); + insert into github_workflow_runs(repo_id, run_id, run_number, head_branch, head_sha, status, conclusion, workflow_name, html_url, raw_json, fetched_at) + values(1, '99', 99, 'main', 'head', 'completed', 'success', 'CI', 'https://example.test/run', '{"id":99}', '2026-04-26T00:00:00Z'); insert into documents(thread_id, title, body, raw_text, dedupe_text, updated_at) values(1, 'download stalls', 'abcdefghijklmnopqrstuvwxyz', 'download stalls abcdefghijklmnopqrstuvwxyz', 'download stalls', '2026-04-26T00:00:00Z'); insert into thread_revisions(thread_id, source_updated_at, content_hash, title_hash, body_hash, labels_hash, created_at) @@ -472,7 +485,7 @@ func TestPrunePortablePayloads(t *testing.T) { if err != nil { t.Fatalf("prune: %v", err) } - if stats.DocumentsDeleted != 1 || stats.FingerprintsPruned != 1 { + if stats.DocumentsDeleted != 1 || stats.FingerprintsPruned != 1 || stats.CommentsPruned != 1 || stats.RawJSONPruned == 0 { t.Fatalf("unexpected stats: %#v", stats) } @@ -489,12 +502,59 @@ func TestPrunePortablePayloads(t *testing.T) { if err := st.DB().QueryRowContext(ctx, `select body_excerpt from threads where id = 1`).Scan(&bodyExcerpt); err != nil { t.Fatalf("thread body excerpt: %v", err) } + var bodyLength int + if err := st.DB().QueryRowContext(ctx, `select body_length from threads where id = 1`).Scan(&bodyLength); err != nil { + t.Fatalf("thread body length: %v", err) + } + if bodyLength != 26 { + t.Fatalf("thread body_length = %d, want 26", bodyLength) + } if err := 
st.DB().QueryRowContext(ctx, `select title_tokens_json, linked_refs_json, module_buckets_json, feature_json from thread_fingerprints where id = 1`).Scan(&titleTokens, &linkedRefs, &buckets, &features); err != nil { t.Fatalf("fingerprint payload: %v", err) } if st.tableExists(ctx, "documents") { t.Fatal("documents table was not dropped") } + if !st.tableExists(ctx, "comments") { + t.Fatal("comments table was dropped") + } + var commentBody, commentExcerpt, commentRawJSON string + var commentBodyLength int + if err := st.DB().QueryRowContext(ctx, `select body, body_excerpt, body_length, raw_json from comments where id = 1`).Scan(&commentBody, &commentExcerpt, &commentBodyLength, &commentRawJSON); err != nil { + t.Fatalf("comment portable payload: %v", err) + } + if commentBody != "comment " || commentExcerpt != "comment " || commentBodyLength != 34 || commentRawJSON != "" { + t.Fatalf("comment not pruned: body=%q excerpt=%q length=%d raw=%q", commentBody, commentExcerpt, commentBodyLength, commentRawJSON) + } + var prDetailCount, prFileCount, prCommitCount, prCheckCount, runCount int + if err := st.DB().QueryRowContext(ctx, `select count(*) from pull_request_details where raw_json = ''`).Scan(&prDetailCount); err != nil { + t.Fatalf("pr detail count: %v", err) + } + if err := st.DB().QueryRowContext(ctx, `select count(*) from pull_request_files where raw_json = ''`).Scan(&prFileCount); err != nil { + t.Fatalf("pr file count: %v", err) + } + if err := st.DB().QueryRowContext(ctx, `select count(*) from pull_request_commits where raw_json = ''`).Scan(&prCommitCount); err != nil { + t.Fatalf("pr commit count: %v", err) + } + if err := st.DB().QueryRowContext(ctx, `select count(*) from pull_request_checks where raw_json = ''`).Scan(&prCheckCount); err != nil { + t.Fatalf("pr check count: %v", err) + } + if err := st.DB().QueryRowContext(ctx, `select count(*) from github_workflow_runs where raw_json = ''`).Scan(&runCount); err != nil { + t.Fatalf("workflow run count: %v", 
err) + } + if prDetailCount != 1 || prFileCount != 1 || prCommitCount != 1 || prCheckCount != 1 || runCount != 1 { + t.Fatalf("pr/run rows not retained: detail=%d files=%d commits=%d checks=%d runs=%d", prDetailCount, prFileCount, prCommitCount, prCheckCount, runCount) + } + var portableSchema, capabilities string + if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'schema'`).Scan(&portableSchema); err != nil { + t.Fatalf("portable schema metadata: %v", err) + } + if err := st.DB().QueryRowContext(ctx, `select value from portable_metadata where key = 'capabilities'`).Scan(&capabilities); err != nil { + t.Fatalf("portable capabilities metadata: %v", err) + } + if portableSchema != "gitcrawl-portable-sync-v2" || !strings.Contains(capabilities, "comment_excerpts") || !strings.Contains(capabilities, "workflow_runs") { + t.Fatalf("portable metadata schema=%q capabilities=%q", portableSchema, capabilities) + } if bodyExcerpt != "abcdefgh" || titleTokens != "[]" || linkedRefs != "[]" || buckets != "[]" || features != "{}" { t.Fatalf("payloads not pruned: bodyExcerpt=%q titleTokens=%q linkedRefs=%q buckets=%q features=%q", bodyExcerpt, titleTokens, linkedRefs, buckets, features) }