feat: add targeted issue sync
Some checks failed
CI / Go / ${{ matrix.os }} (macos-latest) (push) Has been cancelled
CI / Go / ${{ matrix.os }} (ubuntu-latest) (push) Has been cancelled

This commit is contained in:
Peter Steinberger 2026-04-29 12:28:05 +01:00
parent bae22a73a2
commit fdef74b910
No known key found for this signature in database
5 changed files with 157 additions and 10 deletions

View File

@ -2,6 +2,7 @@
## Unreleased
- Add `gitcrawl sync --numbers` for exact issue and pull request hydration, including comment documents, without relying on list ordering or updated-time windows.
- Implement `gitcrawl refresh` and `gitcrawl embed` so synced repositories can generate OpenAI embeddings and rebuild durable clusters end to end.
- Add `gitcrawl sync --state open|closed|all` so incremental backups can refresh recently closed issues and pull requests.
- Default `gitcrawl sync` to `--state all`, keeping closed issue and pull request state fresh unless a narrower state is requested.

View File

@ -15,6 +15,7 @@ gitcrawl init
gitcrawl doctor
gitcrawl sync owner/repo
gitcrawl sync owner/repo --state open
gitcrawl sync owner/repo --numbers 123,456 --include-comments
gitcrawl refresh owner/repo
gitcrawl cluster owner/repo --threshold 0.80
gitcrawl clusters owner/repo
@ -41,6 +42,7 @@ gitcrawl tui owner/repo
`gitcrawl cluster` and `gitcrawl refresh` build ghcrawl-shaped durable clusters by default (`--threshold 0.80`, `--min-size 1`, `--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`): every active vector-backed thread is represented, singleton rows use `singleton_orphan`, multi-member rows use `duplicate_candidate`, and stable IDs are derived from the representative thread. They also add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters.
`gitcrawl tui` infers the most recently updated local repository when `owner/repo` is omitted. `serve` is intentionally not part of `gitcrawl`.
`gitcrawl sync` fetches open issues and pull requests by default. Pass `--state all` or `--state closed` for explicit backfill workflows; incremental open syncs with `--since` also sweep recently closed items so local open state does not rot.
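Pass `--numbers` with a comma-separated list (for example `--numbers 123,456`) to refresh those exact issues or pull requests by number, without relying on list ordering or updated-time windows. Add `--include-comments` to hydrate their comments as well.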
`gitcrawl search issues|prs` accepts the common `gh search` shape (`<query> -R owner/repo --state open --json fields --limit N`) and answers from the local SQLite cache. It is intended for discovery without spending GitHub REST search quota; use `gh` for final live verification and GitHub write actions. Pass `--sync-if-stale 5m` to perform one metadata sync before the cached search when the local repository mirror is older than that duration.
The TUI starts at `--min-size 5` and `--sort size`, like ghcrawl's saved default, so the first screen is the useful cluster workload instead of singleton noise. Pass `--min-size 1` when you intentionally want singleton clusters. Mouse support is built in: click rows, wheel panes, and right-click for copy, sort, filter, jump, link, neighbor, local close/reopen, and member triage actions. Press `a` to open the same action menu from the keyboard, `#` to jump directly to an issue or PR number, `p` to switch between repositories already present in the local store, or `n` to load neighbors for the selected issue or PR. Enter from the members pane also loads neighbors before opening detail. The TUI quietly refreshes from the local store every 15 seconds.

View File

@ -1563,11 +1563,12 @@ func (a *App) runSync(ctx context.Context, args []string) error {
fs.SetOutput(io.Discard)
since := fs.String("since", "", "GitHub since timestamp")
state := fs.String("state", "", "GitHub issue state: open|closed|all; default open")
numbersRaw := fs.String("numbers", "", "comma-separated issue or pull request numbers")
limitRaw := fs.String("limit", "", "maximum issue/PR rows")
jsonOut := fs.Bool("json", false, "write JSON output")
includeComments := fs.Bool("include-comments", false, "hydrate issue comments, PR reviews, and PR review comments")
fs.Bool("include-code", false, "accepted for compatibility; code hydration is not implemented yet")
if err := fs.Parse(normalizeCommandArgs(args, map[string]bool{"since": true, "state": true, "limit": true})); err != nil {
if err := fs.Parse(normalizeCommandArgs(args, map[string]bool{"numbers": true, "since": true, "state": true, "limit": true})); err != nil {
return usageErr(err)
}
a.applyCommandJSON(*jsonOut)
@ -1582,11 +1583,16 @@ func (a *App) runSync(ctx context.Context, args []string) error {
if err != nil {
return usageErr(err)
}
numbers, err := parseOptionalPositiveIntList(*numbersRaw)
if err != nil {
return usageErr(err)
}
stats, err := a.syncRepository(ctx, owner, repo, syncOptions{
Since: strings.TrimSpace(*since),
State: strings.TrimSpace(*state),
Limit: limit,
Numbers: numbers,
IncludeComments: *includeComments,
})
if err != nil {
@ -1599,6 +1605,7 @@ type syncOptions struct {
Since string
State string
Limit int
Numbers []int
IncludeComments bool
}
@ -1628,6 +1635,7 @@ func (a *App) syncRepository(ctx context.Context, owner, repo string, options sy
State: strings.TrimSpace(options.State),
Since: strings.TrimSpace(options.Since),
Limit: options.Limit,
Numbers: options.Numbers,
IncludeComments: options.IncludeComments,
Reporter: func(message string) {
fmt.Fprintln(a.Stderr, message)

View File

@ -17,6 +17,7 @@ import (
type GitHubClient interface {
GetRepo(ctx context.Context, owner, repo string, reporter gh.Reporter) (map[string]any, error)
GetIssue(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) (map[string]any, error)
ListRepositoryIssues(ctx context.Context, owner, repo string, options gh.ListIssuesOptions, reporter gh.Reporter) ([]map[string]any, error)
ListIssueComments(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) ([]map[string]any, error)
ListPullReviews(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) ([]map[string]any, error)
@ -35,6 +36,7 @@ type Options struct {
State string
Since string
Limit int
Numbers []int
IncludeComments bool
Reporter gh.Reporter
}
@ -48,6 +50,7 @@ type Stats struct {
ThreadsClosed int `json:"threads_closed"`
RequestedSince string `json:"requested_since,omitempty"`
Limit int `json:"limit,omitempty"`
Numbers []int `json:"numbers,omitempty"`
MetadataOnly bool `json:"metadata_only"`
StartedAt string `json:"started_at"`
FinishedAt string `json:"finished_at"`
@ -87,19 +90,33 @@ func (s *Syncer) Sync(ctx context.Context, options Options) (Stats, error) {
return Stats{}, err
}
rows, err := s.client.ListRepositoryIssues(ctx, options.Owner, options.Repo, gh.ListIssuesOptions{
State: state,
Since: since,
Limit: options.Limit,
}, options.Reporter)
if err != nil {
return Stats{}, err
numbers := uniquePositiveNumbers(options.Numbers)
rows := make([]map[string]any, 0, len(numbers))
if len(numbers) > 0 {
for _, number := range numbers {
row, err := s.client.GetIssue(ctx, options.Owner, options.Repo, number, options.Reporter)
if err != nil {
return Stats{}, err
}
rows = append(rows, row)
}
} else {
var err error
rows, err = s.client.ListRepositoryIssues(ctx, options.Owner, options.Repo, gh.ListIssuesOptions{
State: state,
Since: since,
Limit: options.Limit,
}, options.Reporter)
if err != nil {
return Stats{}, err
}
}
stats := Stats{
Repository: options.Owner + "/" + options.Repo,
RequestedSince: since,
Limit: options.Limit,
Numbers: numbers,
MetadataOnly: !options.IncludeComments,
StartedAt: started,
}
@ -130,7 +147,7 @@ func (s *Syncer) Sync(ctx context.Context, options Options) (Stats, error) {
stats.IssuesSynced++
}
}
if state == "open" && since != "" && options.Limit <= 0 {
if len(numbers) == 0 && state == "open" && since != "" && options.Limit <= 0 {
closed, err := s.applyClosedOverlapSweep(ctx, st, repoID, options, since)
if err != nil {
return err
@ -141,7 +158,7 @@ func (s *Syncer) Sync(ctx context.Context, options Options) (Stats, error) {
if _, err := st.RecordRun(ctx, store.RunRecord{
RepoID: repoID,
Kind: "sync",
Scope: state,
Scope: syncRunScope(state, numbers),
Status: "success",
StartedAt: stats.StartedAt,
FinishedAt: stats.FinishedAt,
@ -163,6 +180,36 @@ func (s *Syncer) Sync(ctx context.Context, options Options) (Stats, error) {
return stats, nil
}
// uniquePositiveNumbers filters numbers down to its positive entries and drops
// repeats, keeping each value at the position of its first occurrence.
//
// An empty or nil input yields nil. A non-empty input always yields a non-nil
// slice, which is empty when every entry was zero or negative.
func uniquePositiveNumbers(numbers []int) []int {
	if len(numbers) == 0 {
		return nil
	}
	unique := make([]int, 0, len(numbers))
	visited := make(map[int]bool, len(numbers))
	for _, candidate := range numbers {
		// Skip anything that cannot be a valid issue number or was already kept.
		if candidate <= 0 || visited[candidate] {
			continue
		}
		visited[candidate] = true
		unique = append(unique, candidate)
	}
	return unique
}
// syncRunScope builds the scope label stored with a sync run record.
//
// When no numbers were targeted the label is the issue state unchanged.
// Otherwise it is "numbers:" followed by the targeted numbers joined with
// commas, in the order given.
func syncRunScope(state string, numbers []int) string {
	if len(numbers) == 0 {
		return state
	}
	var scope strings.Builder
	scope.WriteString("numbers:")
	for i, number := range numbers {
		if i > 0 {
			scope.WriteByte(',')
		}
		scope.WriteString(strconv.Itoa(number))
	}
	return scope.String()
}
func normalizeState(value string) (string, error) {
value = strings.TrimSpace(strings.ToLower(value))
if value == "" {

View File

@ -16,6 +16,38 @@ func (fakeGitHub) GetRepo(ctx context.Context, owner, repo string, reporter gh.R
return map[string]any{"id": 123}, nil
}
// GetIssue returns a canned GitHub payload for tests. Number 8 yields an open
// pull request (#8, carrying a "pull_request" key); every other number yields
// open issue #7 with a "bug" label. The error result is always nil.
func (fakeGitHub) GetIssue(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) (map[string]any, error) {
	const timestamp = "2026-04-26T00:00:00Z"
	author := map[string]any{"login": "vincentkoc", "type": "User"}

	switch number {
	case 8:
		return map[string]any{
			"id":           2,
			"number":       8,
			"state":        "open",
			"title":        "fix sync",
			"body":         "",
			"html_url":     "https://github.com/openclaw/gitcrawl/pull/8",
			"created_at":   timestamp,
			"updated_at":   timestamp,
			"labels":       []map[string]any{},
			"assignees":    []map[string]any{},
			"user":         author,
			"pull_request": map[string]any{"url": "https://api.github.com/repos/openclaw/gitcrawl/pulls/8"},
		}, nil
	default:
		return map[string]any{
			"id":         1,
			"number":     7,
			"state":      "open",
			"title":      "download stalls",
			"body":       "large file download stalls",
			"html_url":   "https://github.com/openclaw/gitcrawl/issues/7",
			"created_at": timestamp,
			"updated_at": timestamp,
			"labels":     []map[string]any{{"name": "bug"}},
			"assignees":  []map[string]any{},
			"user":       author,
		}, nil
	}
}
func (fakeGitHub) ListRepositoryIssues(ctx context.Context, owner, repo string, options gh.ListIssuesOptions, reporter gh.Reporter) ([]map[string]any, error) {
if options.State == "closed" {
return nil, nil
@ -116,6 +148,22 @@ func (closedSweepGitHub) ListRepositoryIssues(ctx context.Context, owner, repo s
return nil, nil
}
// targetedGitHub is a test double that embeds fakeGitHub and records which
// client calls a sync made, so tests can assert on the fetch path taken.
type targetedGitHub struct {
fakeGitHub
// listCalled is set to true when ListRepositoryIssues is invoked.
listCalled bool
// numbers collects the issue numbers passed to GetIssue, in call order.
numbers []int
}
// GetIssue records the requested number, then hands the call to the embedded
// fakeGitHub and returns its canned payload and error unchanged.
func (f *targetedGitHub) GetIssue(ctx context.Context, owner, repo string, number int, reporter gh.Reporter) (map[string]any, error) {
	f.numbers = append(f.numbers, number)
	issue, err := f.fakeGitHub.GetIssue(ctx, owner, repo, number, reporter)
	return issue, err
}
// ListRepositoryIssues marks that the listing call was made and returns no
// rows and no error. Tests inspect listCalled to verify that a targeted sync
// did not list the repository.
func (f *targetedGitHub) ListRepositoryIssues(ctx context.Context, owner, repo string, options gh.ListIssuesOptions, reporter gh.Reporter) ([]map[string]any, error) {
f.listCalled = true
return nil, nil
}
func TestSyncPersistsIssuesAndPullRequests(t *testing.T) {
ctx := context.Background()
st, err := store.Open(ctx, filepath.Join(t.TempDir(), "gitcrawl.db"))
@ -163,6 +211,47 @@ func TestSyncPersistsIssuesAndPullRequests(t *testing.T) {
}
}
// TestSyncCanTargetIssueNumbers syncs an explicit number list that contains a
// duplicate and checks that each unique number is fetched once via GetIssue,
// that repository listing is never called, and that both the issue and the
// pull request end up as stored threads.
func TestSyncCanTargetIssueNumbers(t *testing.T) {
	ctx := context.Background()
	dbPath := filepath.Join(t.TempDir(), "gitcrawl.db")
	st, err := store.Open(ctx, dbPath)
	if err != nil {
		t.Fatalf("open store: %v", err)
	}
	defer st.Close()

	// Pin the clock so the run's timestamps are deterministic.
	fixedNow := time.Date(2026, 4, 26, 0, 0, 0, 0, time.UTC)
	client := &targetedGitHub{}
	s := New(client, st)
	s.now = func() time.Time { return fixedNow }

	opts := Options{
		Owner:           "openclaw",
		Repo:            "gitcrawl",
		Numbers:         []int{7, 7, 8},
		IncludeComments: true,
	}
	stats, err := s.Sync(ctx, opts)
	if err != nil {
		t.Fatalf("sync: %v", err)
	}
	if client.listCalled {
		t.Fatal("targeted sync should not call repository issue listing")
	}

	// The duplicate 7 must be collapsed before any request is made.
	got, want := client.numbers, []int{7, 8}
	mismatch := len(got) != len(want)
	for i := 0; !mismatch && i < len(want); i++ {
		mismatch = got[i] != want[i]
	}
	if mismatch {
		t.Fatalf("targeted numbers: got %#v want %#v", got, want)
	}

	countsOK := stats.ThreadsSynced == 2 && stats.IssuesSynced == 1 && stats.PullRequestsSynced == 1
	if !countsOK {
		t.Fatalf("unexpected stats: %#v", stats)
	}
	if stats.CommentsSynced != 1 {
		t.Fatalf("comments synced: got %d want 1", stats.CommentsSynced)
	}

	repo, err := st.RepositoryByFullName(ctx, "openclaw/gitcrawl")
	if err != nil {
		t.Fatalf("repo: %v", err)
	}
	threads, err := st.ListThreads(ctx, repo.ID, false)
	if err != nil {
		t.Fatalf("threads: %v", err)
	}
	if count := len(threads); count != 2 {
		t.Fatalf("threads: got %d want 2", count)
	}
}
func TestSyncNormalizesRelativeSince(t *testing.T) {
ctx := context.Background()
st, err := store.Open(ctx, filepath.Join(t.TempDir(), "gitcrawl.db"))