diff --git a/CHANGELOG.md b/CHANGELOG.md index 426e1ef..a2563e6 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,3 +9,4 @@ - Refresh clean portable-store checkouts before read-only commands so `search`, `threads`, clusters, and the TUI see freshly published GitHub backup data automatically. - Show active primary cluster memberships by default in `clusters`, `durable-clusters`, and the TUI, with `--include-closed` reserved for historical audit views. - Split generated clusters with bounded nearest-neighbor graph safeguards, GitHub reference evidence, and cross-kind score pruning so weak similarity bridges stop merging unrelated reports into one mega-cluster. +- Tighten clustering precision by ignoring ambiguous one-digit prose references and requiring weak embedding edges to share concrete title tokens unless they have high similarity or direct GitHub reference evidence. diff --git a/README.md b/README.md index ab6e112..f482202 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ gitcrawl tui owner/repo ``` `gitcrawl clusters`, `gitcrawl durable-clusters`, and `gitcrawl tui` show active primary cluster memberships by default. Pass `--include-closed` to inspect closed rows and historical secondary memberships. -`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. +`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges also need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters. `gitcrawl tui` infers the most recently updated local repository when `owner/repo` is omitted. `serve` is intentionally not part of `gitcrawl`. `gitcrawl sync` fetches issues and pull requests in every GitHub state by default. Pass `--state open` or `--state closed` to limit a sync to one state. The TUI starts at `--min-size 5` so maintainer-significant active clusters are visible first; pass `--min-size 1` to include singletons. Mouse support is built in: click rows, wheel panes, and right-click for copy, sort, filter, jump, link, neighbor, local close/reopen, and member triage actions. Press `a` to open the same action menu from the keyboard, `#` to jump directly to an issue or PR number, `p` to switch between repositories already present in the local store, or `n` to load neighbors for the selected issue or PR. Enter from the members pane also loads neighbors before opening detail. The TUI quietly refreshes from the local store every 15 seconds. diff --git a/internal/cli/app.go b/internal/cli/app.go index 7726a9d..62a3f21 100644 --- a/internal/cli/app.go +++ b/internal/cli/app.go @@ -33,11 +33,13 @@ const ( defaultClusterMaxSize = 40 defaultClusterFanout = 16 defaultCrossKindMinScore = 0.93 + highConfidenceEdgeScore = 0.90 + weakEdgeMinTitleOverlap = 0.18 deterministicRefScore = 0.94 - sharedRefMaxBucketSize = 8 ) -var threadReferencePattern = regexp.MustCompile(`(?i)(?:#|issues/|pull/)(\d+)`) +var threadReferencePattern = regexp.MustCompile(`(?i)(?:\b[\w.-]+/[\w.-]+#(\d+)|(?:issues|pull)/(\d+)|#(\d{2,}))`) +var titleTokenPattern = regexp.MustCompile(`[A-Za-z0-9]{4,}`) type App struct { Stdout io.Writer @@ -1966,6 +1968,9 @@ func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int6 if score < options.Threshold { continue } + if score < highConfidenceEdgeScore && titleTokenOverlap(threads[leftID].Title, threads[rightID].Title) < weakEdgeMinTitleOverlap { + continue + } if threads[leftID].Kind != threads[rightID].Kind && score < options.CrossKindThreshold { continue } @@ -2038,7 +2043,6 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu threadIDByNumber[thread.Number] = node.ThreadID } refIDsByThreadID := make(map[int64]map[int64]bool, len(nodes)) - threadsByReferencedNumber := map[int][]int64{} for _, node := range nodes { thread := threads[node.ThreadID] refNumbers := referencedThreadNumbers(thread) @@ -2047,7 +2051,6 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu if referencedID, ok := threadIDByNumber[number]; ok && referencedID != node.ThreadID { refIDs[referencedID] = true } - threadsByReferencedNumber[number] = append(threadsByReferencedNumber[number], node.ThreadID) } refIDsByThreadID[node.ThreadID] = refIDs } @@ -2056,27 +2059,14 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu upsertClusterEdge(edges, threadID, referencedID, deterministicRefScore) } } - for _, threadIDs := range threadsByReferencedNumber { - if len(threadIDs) < 2 || len(threadIDs) > sharedRefMaxBucketSize { - continue - } - sort.Slice(threadIDs, func(i, j int) bool { return threadIDs[i] < threadIDs[j] }) - for left := 0; left < len(threadIDs); left++ { - for right := left + 1; right < len(threadIDs); right++ { - upsertClusterEdge(edges, threadIDs[left], threadIDs[right], deterministicRefScore) - } - } - } } func referencedThreadNumbers(thread store.Thread) map[int]bool { value := thread.Title + "\n" + thread.Body refs := map[int]bool{} for _, match := range threadReferencePattern.FindAllStringSubmatch(value, -1) { - if len(match) < 2 { - continue - } - number, err := strconv.Atoi(match[1]) + numberText := firstNonEmpty(match[1:]...) + number, err := strconv.Atoi(numberText) if err != nil || number <= 0 || number == thread.Number { continue } @@ -2085,6 +2075,33 @@ func referencedThreadNumbers(thread store.Thread) map[int]bool { return refs } +func titleTokenOverlap(left, right string) float64 { + leftTokens := titleTokenSet(left) + rightTokens := titleTokenSet(right) + if len(leftTokens) == 0 || len(rightTokens) == 0 { + return 0 + } + overlap := 0 + for token := range leftTokens { + if rightTokens[token] { + overlap++ + } + } + base := len(leftTokens) + if len(rightTokens) < base { + base = len(rightTokens) + } + return float64(overlap) / float64(base) +} + +func titleTokenSet(value string) map[string]bool { + out := map[string]bool{} + for _, token := range titleTokenPattern.FindAllString(strings.ToLower(value), -1) { + out[token] = true + } + return out +} + func keepTopEdges(edges []clusterer.Edge, fanout int) []clusterer.Edge { if fanout <= 0 || len(edges) == 0 { return edges diff --git a/internal/cli/app_test.go b/internal/cli/app_test.go index 8ac29e5..a2b0e04 100644 --- a/internal/cli/app_test.go +++ b/internal/cli/app_test.go @@ -1023,6 +1023,147 @@ func TestBuildDurableClusterInputsKeepsDeterministicReferenceEdges(t *testing.T) } } +func TestBuildDurableClusterInputsIgnoresBareOneDigitProseRefs(t *testing.T) { + ctx := context.Background() + st, err := store.Open(ctx, filepath.Join(t.TempDir(), "gitcrawl.db")) + if err != nil { + t.Fatalf("open store: %v", err) + } + defer st.Close() + repoID, err := st.UpsertRepository(ctx, store.Repository{ + Owner: "openclaw", + Name: "openclaw", + FullName: "openclaw/openclaw", + RawJSON: "{}", + UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano), + }) + if err != nil { + t.Fatalf("seed repository: %v", err) + } + firstID, err := st.UpsertThread(ctx, store.Thread{ + RepoID: repoID, + GitHubID: "401", + Number: 401, + Kind: "pull_request", + State: "open", + Title: "Background task notification", + Body: "This is the #1 UX gap for orchestration.", + HTMLURL: "https://github.com/openclaw/openclaw/pull/401", + LabelsJSON: "[]", + AssigneesJSON: "[]", + RawJSON: "{}", + ContentHash: "hash-401", + UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano), + }) + if err != nil { + t.Fatalf("seed first thread: %v", err) + } + secondID, err := st.UpsertThread(ctx, store.Thread{ + RepoID: repoID, + GitHubID: "402", + Number: 402, + Kind: "pull_request", + State: "open", + Title: "Plugin config overlay", + Body: "This is #1 for locked-down deployments.", + HTMLURL: "https://github.com/openclaw/openclaw/pull/402", + LabelsJSON: "[]", + AssigneesJSON: "[]", + RawJSON: "{}", + ContentHash: "hash-402", + UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano), + }) + if err != nil { + t.Fatalf("seed second thread: %v", err) + } + inputs, edgeCount, err := buildDurableClusterInputs(ctx, st, repoID, []store.ThreadVector{ + {ThreadID: firstID, Vector: []float64{1, 0}}, + {ThreadID: secondID, Vector: []float64{0, 1}}, + }, clusterBuildOptions{ + Threshold: 0.99, + MinSize: 2, + MaxClusterSize: defaultClusterMaxSize, + Fanout: 16, + CrossKindThreshold: 0.99, + }) + if err != nil { + t.Fatalf("build inputs: %v", err) + } + if edgeCount != 0 || len(inputs) != 0 { + t.Fatalf("bare one-digit prose refs should not form evidence edges, edges=%d inputs=%#v", edgeCount, inputs) + } +} + +func TestBuildDurableClusterInputsPrunesWeakGenericTitleEdges(t *testing.T) { + ctx := context.Background() + st, err := store.Open(ctx, filepath.Join(t.TempDir(), "gitcrawl.db")) + if err != nil { + t.Fatalf("open store: %v", err) + } + defer st.Close() + repoID, err := st.UpsertRepository(ctx, store.Repository{ + Owner: "openclaw", + Name: "openclaw", + FullName: "openclaw/openclaw", + RawJSON: "{}", + UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano), + }) + if err != nil { + t.Fatalf("seed repository: %v", err) + } + firstID, err := st.UpsertThread(ctx, store.Thread{ + RepoID: repoID, + GitHubID: "501", + Number: 501, + Kind: "pull_request", + State: "open", + Title: "fix: improve error handling and logging for security-critical operations", + HTMLURL: "https://github.com/openclaw/openclaw/pull/501", + LabelsJSON: "[]", + AssigneesJSON: "[]", + RawJSON: "{}", + ContentHash: "hash-501", + UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano), + }) + if err != nil { + t.Fatalf("seed first thread: %v", err) + } + secondID, err := st.UpsertThread(ctx, store.Thread{ + RepoID: repoID, + GitHubID: "502", + Number: 502, + Kind: "pull_request", + State: "open", + Title: "fix(gateway): isolate control-plane write rate limits by connection", + HTMLURL: "https://github.com/openclaw/openclaw/pull/502", + LabelsJSON: "[]", + AssigneesJSON: "[]", + RawJSON: "{}", + ContentHash: "hash-502", + UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano), + }) + if err != nil { + t.Fatalf("seed second thread: %v", err) + } + vectors := []store.ThreadVector{ + {ThreadID: firstID, Vector: []float64{1, 0}}, + {ThreadID: secondID, Vector: []float64{0.84, 0.5425863986500217}}, + } + inputs, edgeCount, err := buildDurableClusterInputs(ctx, st, repoID, vectors, clusterBuildOptions{ + Threshold: 0.82, + MinSize: 2, + MaxClusterSize: defaultClusterMaxSize, + Fanout: 16, + CrossKindThreshold: defaultCrossKindMinScore, + }) + if err != nil { + t.Fatalf("build inputs: %v", err) + } + if edgeCount != 0 || len(inputs) != 0 { + t.Fatalf("weak generic title edge should be pruned, edges=%d inputs=%#v", edgeCount, inputs) + } +} + func TestKeepTopEdgesKeepsOneSidedNearestNeighbors(t *testing.T) { edges := keepTopEdges([]clusterer.Edge{ {LeftThreadID: 1, RightThreadID: 2, Score: 0.95},