diff --git a/CHANGELOG.md b/CHANGELOG.md index d950b4c..d78f566 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## Unreleased +- Add `gitcrawl sync --numbers` for exact issue and pull request hydration, including comment documents when `--include-comments` is passed, without relying on list ordering or updated-time windows. - Implement `gitcrawl refresh` and `gitcrawl embed` so synced repositories can generate OpenAI embeddings and rebuild durable clusters end to end. - Add `gitcrawl sync --state open|closed|all` so incremental backups can refresh recently closed issues and pull requests. - Default `gitcrawl sync` to `--state all`, keeping closed issue and pull request state fresh unless a narrower state is requested. diff --git a/README.md b/README.md index e2dc103..7b94a06 100644 --- a/README.md +++ b/README.md @@ -15,6 +15,7 @@ gitcrawl init gitcrawl doctor gitcrawl sync owner/repo gitcrawl sync owner/repo --state open +gitcrawl sync owner/repo --numbers 123,456 --include-comments gitcrawl refresh owner/repo gitcrawl cluster owner/repo --threshold 0.80 gitcrawl clusters owner/repo @@ -41,6 +42,7 @@ gitcrawl tui owner/repo `gitcrawl cluster` and `gitcrawl refresh` build ghcrawl-shaped durable clusters by default (`--threshold 0.80`, `--min-size 1`, `--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`): every active vector-backed thread is represented, singleton rows use `singleton_orphan`, multi-member rows use `duplicate_candidate`, and stable IDs are derived from the representative thread. They also add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters. `gitcrawl tui` infers the most recently updated local repository when `owner/repo` is omitted. `serve` is intentionally not part of `gitcrawl`. 
`gitcrawl sync` fetches open issues and pull requests by default. Pass `--state all` or `--state closed` for explicit backfill workflows; incremental open syncs with `--since` also sweep recently closed items so local open state does not rot. +Pass `--numbers` with a comma-separated list (for example `--numbers 123,456`) to refresh exactly those issues or pull requests without relying on list ordering or updated-time windows. `gitcrawl search issues|prs` accepts the common `gh search` shape (` -R owner/repo --state open --json fields --limit N`) and answers from the local SQLite cache. It is intended for discovery without spending GitHub REST search quota; use `gh` for final live verification and GitHub write actions. Pass `--sync-if-stale 5m` to perform one metadata sync before the cached search when the local repository mirror is older than that duration. The TUI starts at `--min-size 5` and `--sort size`, like ghcrawl's saved default, so the first screen is the useful cluster workload instead of singleton noise. Pass `--min-size 1` when you intentionally want singleton clusters. Mouse support is built in: click rows, wheel panes, and right-click for copy, sort, filter, jump, link, neighbor, local close/reopen, and member triage actions. Press `a` to open the same action menu from the keyboard, `#` to jump directly to an issue or PR number, `p` to switch between repositories already present in the local store, or `n` to load neighbors for the selected issue or PR. Enter from the members pane also loads neighbors before opening detail. The TUI quietly refreshes from the local store every 15 seconds. 
diff --git a/internal/cli/app.go b/internal/cli/app.go index dd2192e..e2f1f39 100644 --- a/internal/cli/app.go +++ b/internal/cli/app.go @@ -1563,11 +1563,12 @@ func (a *App) runSync(ctx context.Context, args []string) error { fs.SetOutput(io.Discard) since := fs.String("since", "", "GitHub since timestamp") state := fs.String("state", "", "GitHub issue state: open|closed|all; default open") + numbersRaw := fs.String("numbers", "", "comma-separated issue or pull request numbers") limitRaw := fs.String("limit", "", "maximum issue/PR rows") jsonOut := fs.Bool("json", false, "write JSON output") includeComments := fs.Bool("include-comments", false, "hydrate issue comments, PR reviews, and PR review comments") fs.Bool("include-code", false, "accepted for compatibility; code hydration is not implemented yet") - if err := fs.Parse(normalizeCommandArgs(args, map[string]bool{"since": true, "state": true, "limit": true})); err != nil { + if err := fs.Parse(normalizeCommandArgs(args, map[string]bool{"numbers": true, "since": true, "state": true, "limit": true})); err != nil { return usageErr(err) } a.applyCommandJSON(*jsonOut) @@ -1582,11 +1583,16 @@ func (a *App) runSync(ctx context.Context, args []string) error { if err != nil { return usageErr(err) } + numbers, err := parseOptionalPositiveIntList(*numbersRaw) + if err != nil { + return usageErr(err) + } stats, err := a.syncRepository(ctx, owner, repo, syncOptions{ Since: strings.TrimSpace(*since), State: strings.TrimSpace(*state), Limit: limit, + Numbers: numbers, IncludeComments: *includeComments, }) if err != nil { @@ -1599,6 +1605,7 @@ type syncOptions struct { Since string State string Limit int + Numbers []int IncludeComments bool } @@ -1628,6 +1635,7 @@ func (a *App) syncRepository(ctx context.Context, owner, repo string, options sy State: strings.TrimSpace(options.State), Since: strings.TrimSpace(options.Since), Limit: options.Limit, + Numbers: options.Numbers, IncludeComments: options.IncludeComments, Reporter: 
func(message string) { fmt.Fprintln(a.Stderr, message) diff --git a/internal/syncer/syncer.go b/internal/syncer/syncer.go index d141819..aa7d056 100644 --- a/internal/syncer/syncer.go +++ b/internal/syncer/syncer.go @@ -17,6 +17,7 @@ import ( type GitHubClient interface { GetRepo(ctx context.Context, owner, repo string, reporter gh.Reporter) (map[string]any, error) + GetIssue(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) (map[string]any, error) ListRepositoryIssues(ctx context.Context, owner, repo string, options gh.ListIssuesOptions, reporter gh.Reporter) ([]map[string]any, error) ListIssueComments(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) ([]map[string]any, error) ListPullReviews(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) ([]map[string]any, error) @@ -35,6 +36,7 @@ type Options struct { State string Since string Limit int + Numbers []int IncludeComments bool Reporter gh.Reporter } @@ -48,6 +50,7 @@ type Stats struct { ThreadsClosed int `json:"threads_closed"` RequestedSince string `json:"requested_since,omitempty"` Limit int `json:"limit,omitempty"` + Numbers []int `json:"numbers,omitempty"` MetadataOnly bool `json:"metadata_only"` StartedAt string `json:"started_at"` FinishedAt string `json:"finished_at"` @@ -87,19 +90,33 @@ func (s *Syncer) Sync(ctx context.Context, options Options) (Stats, error) { return Stats{}, err } - rows, err := s.client.ListRepositoryIssues(ctx, options.Owner, options.Repo, gh.ListIssuesOptions{ - State: state, - Since: since, - Limit: options.Limit, - }, options.Reporter) - if err != nil { - return Stats{}, err + numbers := uniquePositiveNumbers(options.Numbers) + rows := make([]map[string]any, 0, len(numbers)) + if len(numbers) > 0 { + for _, number := range numbers { + row, err := s.client.GetIssue(ctx, options.Owner, options.Repo, number, options.Reporter) + if err != nil { + return Stats{}, err + } + rows = append(rows, row) + } + } else { 
+ var err error + rows, err = s.client.ListRepositoryIssues(ctx, options.Owner, options.Repo, gh.ListIssuesOptions{ + State: state, + Since: since, + Limit: options.Limit, + }, options.Reporter) + if err != nil { + return Stats{}, err + } } stats := Stats{ Repository: options.Owner + "/" + options.Repo, RequestedSince: since, Limit: options.Limit, + Numbers: numbers, MetadataOnly: !options.IncludeComments, StartedAt: started, } @@ -130,7 +147,7 @@ func (s *Syncer) Sync(ctx context.Context, options Options) (Stats, error) { stats.IssuesSynced++ } } - if state == "open" && since != "" && options.Limit <= 0 { + if len(numbers) == 0 && state == "open" && since != "" && options.Limit <= 0 { closed, err := s.applyClosedOverlapSweep(ctx, st, repoID, options, since) if err != nil { return err @@ -141,7 +158,7 @@ func (s *Syncer) Sync(ctx context.Context, options Options) (Stats, error) { if _, err := st.RecordRun(ctx, store.RunRecord{ RepoID: repoID, Kind: "sync", - Scope: state, + Scope: syncRunScope(state, numbers), Status: "success", StartedAt: stats.StartedAt, FinishedAt: stats.FinishedAt, @@ -163,6 +180,36 @@ func (s *Syncer) Sync(ctx context.Context, options Options) (Stats, error) { return stats, nil } +func uniquePositiveNumbers(numbers []int) []int { + if len(numbers) == 0 { + return nil + } + seen := make(map[int]struct{}, len(numbers)) + out := make([]int, 0, len(numbers)) + for _, number := range numbers { + if number <= 0 { + continue + } + if _, ok := seen[number]; ok { + continue + } + seen[number] = struct{}{} + out = append(out, number) + } + return out +} + +func syncRunScope(state string, numbers []int) string { + if len(numbers) == 0 { + return state + } + parts := make([]string, 0, len(numbers)) + for _, number := range numbers { + parts = append(parts, strconv.Itoa(number)) + } + return "numbers:" + strings.Join(parts, ",") +} + func normalizeState(value string) (string, error) { value = strings.TrimSpace(strings.ToLower(value)) if value == "" { diff 
--git a/internal/syncer/syncer_test.go b/internal/syncer/syncer_test.go index 48fc651..3b1eb9a 100644 --- a/internal/syncer/syncer_test.go +++ b/internal/syncer/syncer_test.go @@ -16,6 +16,38 @@ func (fakeGitHub) GetRepo(ctx context.Context, owner, repo string, reporter gh.R return map[string]any{"id": 123}, nil } +func (fakeGitHub) GetIssue(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) (map[string]any, error) { + if number == 8 { + return map[string]any{ + "id": 2, + "number": 8, + "state": "open", + "title": "fix sync", + "body": "", + "html_url": "https://github.com/openclaw/gitcrawl/pull/8", + "created_at": "2026-04-26T00:00:00Z", + "updated_at": "2026-04-26T00:00:00Z", + "labels": []map[string]any{}, + "assignees": []map[string]any{}, + "user": map[string]any{"login": "vincentkoc", "type": "User"}, + "pull_request": map[string]any{"url": "https://api.github.com/repos/openclaw/gitcrawl/pulls/8"}, + }, nil + } + return map[string]any{ + "id": 1, + "number": 7, + "state": "open", + "title": "download stalls", + "body": "large file download stalls", + "html_url": "https://github.com/openclaw/gitcrawl/issues/7", + "created_at": "2026-04-26T00:00:00Z", + "updated_at": "2026-04-26T00:00:00Z", + "labels": []map[string]any{{"name": "bug"}}, + "assignees": []map[string]any{}, + "user": map[string]any{"login": "vincentkoc", "type": "User"}, + }, nil +} + func (fakeGitHub) ListRepositoryIssues(ctx context.Context, owner, repo string, options gh.ListIssuesOptions, reporter gh.Reporter) ([]map[string]any, error) { if options.State == "closed" { return nil, nil @@ -116,6 +148,22 @@ func (closedSweepGitHub) ListRepositoryIssues(ctx context.Context, owner, repo s return nil, nil } +type targetedGitHub struct { + fakeGitHub + listCalled bool + numbers []int +} + +func (f *targetedGitHub) GetIssue(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) (map[string]any, error) { + f.numbers = append(f.numbers, number) + return 
f.fakeGitHub.GetIssue(ctx, owner, repo, number, reporter) +} + +func (f *targetedGitHub) ListRepositoryIssues(ctx context.Context, owner, repo string, options gh.ListIssuesOptions, reporter gh.Reporter) ([]map[string]any, error) { + f.listCalled = true + return nil, nil +} + func TestSyncPersistsIssuesAndPullRequests(t *testing.T) { ctx := context.Background() st, err := store.Open(ctx, filepath.Join(t.TempDir(), "gitcrawl.db")) @@ -163,6 +211,47 @@ func TestSyncPersistsIssuesAndPullRequests(t *testing.T) { } } +func TestSyncCanTargetIssueNumbers(t *testing.T) { + ctx := context.Background() + st, err := store.Open(ctx, filepath.Join(t.TempDir(), "gitcrawl.db")) + if err != nil { + t.Fatalf("open store: %v", err) + } + defer st.Close() + + client := &targetedGitHub{} + s := New(client, st) + s.now = func() time.Time { return time.Date(2026, 4, 26, 0, 0, 0, 0, time.UTC) } + stats, err := s.Sync(ctx, Options{Owner: "openclaw", Repo: "gitcrawl", Numbers: []int{7, 7, 8}, IncludeComments: true}) + if err != nil { + t.Fatalf("sync: %v", err) + } + if client.listCalled { + t.Fatal("targeted sync should not call repository issue listing") + } + if got, want := client.numbers, []int{7, 8}; len(got) != len(want) || got[0] != want[0] || got[1] != want[1] { + t.Fatalf("targeted numbers: got %#v want %#v", got, want) + } + if stats.ThreadsSynced != 2 || stats.IssuesSynced != 1 || stats.PullRequestsSynced != 1 { + t.Fatalf("unexpected stats: %#v", stats) + } + if stats.CommentsSynced != 1 { + t.Fatalf("comments synced: got %d want 1", stats.CommentsSynced) + } + + repo, err := st.RepositoryByFullName(ctx, "openclaw/gitcrawl") + if err != nil { + t.Fatalf("repo: %v", err) + } + threads, err := st.ListThreads(ctx, repo.ID, false) + if err != nil { + t.Fatalf("threads: %v", err) + } + if len(threads) != 2 { + t.Fatalf("threads: got %d want 2", len(threads)) + } +} + func TestSyncNormalizesRelativeSince(t *testing.T) { ctx := context.Background() st, err := store.Open(ctx, 
filepath.Join(t.TempDir(), "gitcrawl.db"))