fix: tighten cluster edge precision

This commit is contained in:
Peter Steinberger 2026-04-28 11:01:05 +01:00
parent 746eda48c6
commit d60c18c2a9
No known key found for this signature in database
4 changed files with 179 additions and 20 deletions

View File

@ -9,3 +9,4 @@
- Refresh clean portable-store checkouts before read-only commands so `search`, `threads`, clusters, and the TUI see freshly published GitHub backup data automatically.
- Show active primary cluster memberships by default in `clusters`, `durable-clusters`, and the TUI, with `--include-closed` reserved for historical audit views.
- Split generated clusters with bounded nearest-neighbor graph safeguards, GitHub reference evidence, and cross-kind score pruning so weak similarity bridges stop merging unrelated reports into one mega-cluster.
- Tighten clustering precision by ignoring ambiguous one-digit prose references and requiring weak embedding edges to share concrete title tokens unless they have high similarity or direct GitHub reference evidence.

View File

@ -35,7 +35,7 @@ gitcrawl tui owner/repo
```
`gitcrawl clusters`, `gitcrawl durable-clusters`, and `gitcrawl tui` show active primary cluster memberships by default. Pass `--include-closed` to inspect closed rows and historical secondary memberships.
`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`.
`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges (similarity below 0.90) also need concrete title-token overlap, meaning shared alphanumeric title words of four or more characters, unless they are backed by direct GitHub reference evidence; this keeps generic low-confidence bridges from forming unrelated clusters.
`gitcrawl tui` infers the most recently updated local repository when `owner/repo` is omitted. `serve` is intentionally not part of `gitcrawl`.
`gitcrawl sync` fetches issues and pull requests in every GitHub state by default. Pass `--state open` or `--state closed` to limit a sync to one state.
The TUI starts at `--min-size 5` so maintainer-significant active clusters are visible first; pass `--min-size 1` to include singletons. Mouse support is built in: click rows, wheel panes, and right-click for copy, sort, filter, jump, link, neighbor, local close/reopen, and member triage actions. Press `a` to open the same action menu from the keyboard, `#` to jump directly to an issue or PR number, `p` to switch between repositories already present in the local store, or `n` to load neighbors for the selected issue or PR. Enter from the members pane also loads neighbors before opening detail. The TUI quietly refreshes from the local store every 15 seconds.

View File

@ -33,11 +33,13 @@ const (
defaultClusterMaxSize = 40
defaultClusterFanout = 16
defaultCrossKindMinScore = 0.93
highConfidenceEdgeScore = 0.90
weakEdgeMinTitleOverlap = 0.18
deterministicRefScore = 0.94
sharedRefMaxBucketSize = 8
)
var threadReferencePattern = regexp.MustCompile(`(?i)(?:#|issues/|pull/)(\d+)`)
// threadReferencePattern finds issue/PR references in free text. It is
// case-insensitive and has three alternatives, each with its own capture group
// for the number: "owner/repo#N" (group 1), "issues/N" or "pull/N" (group 2),
// and a bare "#N" with at least two digits (group 3). Only one group is
// non-empty per match.
// NOTE(review): the owner/repo prefix is matched but not captured, so a
// reference to another repository yields only its number — confirm callers
// should treat such numbers as threads of the current repository.
var threadReferencePattern = regexp.MustCompile(`(?i)(?:\b[\w.-]+/[\w.-]+#(\d+)|(?:issues|pull)/(\d+)|#(\d{2,}))`)
// titleTokenPattern matches runs of four or more ASCII letters or digits.
var titleTokenPattern = regexp.MustCompile(`[A-Za-z0-9]{4,}`)
type App struct {
Stdout io.Writer
@ -1966,6 +1968,9 @@ func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int6
if score < options.Threshold {
continue
}
if score < highConfidenceEdgeScore && titleTokenOverlap(threads[leftID].Title, threads[rightID].Title) < weakEdgeMinTitleOverlap {
continue
}
if threads[leftID].Kind != threads[rightID].Kind && score < options.CrossKindThreshold {
continue
}
@ -2038,7 +2043,6 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu
threadIDByNumber[thread.Number] = node.ThreadID
}
refIDsByThreadID := make(map[int64]map[int64]bool, len(nodes))
threadsByReferencedNumber := map[int][]int64{}
for _, node := range nodes {
thread := threads[node.ThreadID]
refNumbers := referencedThreadNumbers(thread)
@ -2047,7 +2051,6 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu
if referencedID, ok := threadIDByNumber[number]; ok && referencedID != node.ThreadID {
refIDs[referencedID] = true
}
threadsByReferencedNumber[number] = append(threadsByReferencedNumber[number], node.ThreadID)
}
refIDsByThreadID[node.ThreadID] = refIDs
}
@ -2056,27 +2059,14 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu
upsertClusterEdge(edges, threadID, referencedID, deterministicRefScore)
}
}
for _, threadIDs := range threadsByReferencedNumber {
if len(threadIDs) < 2 || len(threadIDs) > sharedRefMaxBucketSize {
continue
}
sort.Slice(threadIDs, func(i, j int) bool { return threadIDs[i] < threadIDs[j] })
for left := 0; left < len(threadIDs); left++ {
for right := left + 1; right < len(threadIDs); right++ {
upsertClusterEdge(edges, threadIDs[left], threadIDs[right], deterministicRefScore)
}
}
}
}
func referencedThreadNumbers(thread store.Thread) map[int]bool {
value := thread.Title + "\n" + thread.Body
refs := map[int]bool{}
for _, match := range threadReferencePattern.FindAllStringSubmatch(value, -1) {
if len(match) < 2 {
continue
}
number, err := strconv.Atoi(match[1])
numberText := firstNonEmpty(match[1:]...)
number, err := strconv.Atoi(numberText)
if err != nil || number <= 0 || number == thread.Number {
continue
}
@ -2085,6 +2075,33 @@ func referencedThreadNumbers(thread store.Thread) map[int]bool {
return refs
}
// titleTokenOverlap measures how much two titles have in common: the number
// of tokens found in both titles divided by the size of the smaller token
// set. The result is in [0, 1], and is 0 when either title yields no tokens.
func titleTokenOverlap(left, right string) float64 {
	smaller, larger := titleTokenSet(left), titleTokenSet(right)
	if len(larger) < len(smaller) {
		smaller, larger = larger, smaller
	}
	// After the swap, an empty smaller set covers "either side is empty".
	if len(smaller) == 0 {
		return 0
	}
	shared := 0
	for token := range smaller {
		if larger[token] {
			shared++
		}
	}
	return float64(shared) / float64(len(smaller))
}
// titleTokenSet lowercases value and returns the set of distinct tokens that
// titleTokenPattern finds in it. The returned map is never nil; it is empty
// when nothing matches.
func titleTokenSet(value string) map[string]bool {
	matches := titleTokenPattern.FindAllString(strings.ToLower(value), -1)
	tokens := make(map[string]bool, len(matches))
	for i := range matches {
		tokens[matches[i]] = true
	}
	return tokens
}
func keepTopEdges(edges []clusterer.Edge, fanout int) []clusterer.Edge {
if fanout <= 0 || len(edges) == 0 {
return edges

View File

@ -1023,6 +1023,147 @@ func TestBuildDurableClusterInputsKeepsDeterministicReferenceEdges(t *testing.T)
}
}
// TestBuildDurableClusterInputsIgnoresBareOneDigitProseRefs seeds two pull
// requests whose bodies both mention a bare "#1" in ordinary prose, gives them
// orthogonal vectors, and expects no edges and no cluster inputs: the shared
// one-digit mention must not count as reference evidence.
func TestBuildDurableClusterInputsIgnoresBareOneDigitProseRefs(t *testing.T) {
	ctx := context.Background()
	dbPath := filepath.Join(t.TempDir(), "gitcrawl.db")
	st, err := store.Open(ctx, dbPath)
	if err != nil {
		t.Fatalf("open store: %v", err)
	}
	defer st.Close()

	repoID, err := st.UpsertRepository(ctx, store.Repository{
		Owner:     "openclaw",
		Name:      "openclaw",
		FullName:  "openclaw/openclaw",
		RawJSON:   "{}",
		UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano),
	})
	if err != nil {
		t.Fatalf("seed repository: %v", err)
	}

	notificationPR := store.Thread{
		RepoID:        repoID,
		GitHubID:      "401",
		Number:        401,
		Kind:          "pull_request",
		State:         "open",
		Title:         "Background task notification",
		Body:          "This is the #1 UX gap for orchestration.",
		HTMLURL:       "https://github.com/openclaw/openclaw/pull/401",
		LabelsJSON:    "[]",
		AssigneesJSON: "[]",
		RawJSON:       "{}",
		ContentHash:   "hash-401",
		UpdatedAt:     time.Now().UTC().Format(time.RFC3339Nano),
	}
	firstID, err := st.UpsertThread(ctx, notificationPR)
	if err != nil {
		t.Fatalf("seed first thread: %v", err)
	}

	overlayPR := store.Thread{
		RepoID:        repoID,
		GitHubID:      "402",
		Number:        402,
		Kind:          "pull_request",
		State:         "open",
		Title:         "Plugin config overlay",
		Body:          "This is #1 for locked-down deployments.",
		HTMLURL:       "https://github.com/openclaw/openclaw/pull/402",
		LabelsJSON:    "[]",
		AssigneesJSON: "[]",
		RawJSON:       "{}",
		ContentHash:   "hash-402",
		UpdatedAt:     time.Now().UTC().Format(time.RFC3339Nano),
	}
	secondID, err := st.UpsertThread(ctx, overlayPR)
	if err != nil {
		t.Fatalf("seed second thread: %v", err)
	}

	// Orthogonal vectors plus a 0.99 threshold: presumably no embedding edge
	// can qualify, so any edge would have to come from reference evidence.
	vectors := []store.ThreadVector{
		{ThreadID: firstID, Vector: []float64{1, 0}},
		{ThreadID: secondID, Vector: []float64{0, 1}},
	}
	opts := clusterBuildOptions{
		Threshold:          0.99,
		MinSize:            2,
		MaxClusterSize:     defaultClusterMaxSize,
		Fanout:             16,
		CrossKindThreshold: 0.99,
	}
	inputs, edgeCount, err := buildDurableClusterInputs(ctx, st, repoID, vectors, opts)
	if err != nil {
		t.Fatalf("build inputs: %v", err)
	}
	if edgeCount != 0 || len(inputs) != 0 {
		t.Fatalf("bare one-digit prose refs should not form evidence edges, edges=%d inputs=%#v", edgeCount, inputs)
	}
}
// TestBuildDurableClusterInputsPrunesWeakGenericTitleEdges seeds two pull
// requests whose titles share no token of four or more characters and whose
// vectors are moderately similar, then expects the edge between them to be
// dropped so that no edges and no cluster inputs remain.
func TestBuildDurableClusterInputsPrunesWeakGenericTitleEdges(t *testing.T) {
	ctx := context.Background()
	dbPath := filepath.Join(t.TempDir(), "gitcrawl.db")
	st, err := store.Open(ctx, dbPath)
	if err != nil {
		t.Fatalf("open store: %v", err)
	}
	defer st.Close()

	repoID, err := st.UpsertRepository(ctx, store.Repository{
		Owner:     "openclaw",
		Name:      "openclaw",
		FullName:  "openclaw/openclaw",
		RawJSON:   "{}",
		UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano),
	})
	if err != nil {
		t.Fatalf("seed repository: %v", err)
	}

	errorHandlingPR := store.Thread{
		RepoID:        repoID,
		GitHubID:      "501",
		Number:        501,
		Kind:          "pull_request",
		State:         "open",
		Title:         "fix: improve error handling and logging for security-critical operations",
		HTMLURL:       "https://github.com/openclaw/openclaw/pull/501",
		LabelsJSON:    "[]",
		AssigneesJSON: "[]",
		RawJSON:       "{}",
		ContentHash:   "hash-501",
		UpdatedAt:     time.Now().UTC().Format(time.RFC3339Nano),
	}
	firstID, err := st.UpsertThread(ctx, errorHandlingPR)
	if err != nil {
		t.Fatalf("seed first thread: %v", err)
	}

	rateLimitPR := store.Thread{
		RepoID:        repoID,
		GitHubID:      "502",
		Number:        502,
		Kind:          "pull_request",
		State:         "open",
		Title:         "fix(gateway): isolate control-plane write rate limits by connection",
		HTMLURL:       "https://github.com/openclaw/openclaw/pull/502",
		LabelsJSON:    "[]",
		AssigneesJSON: "[]",
		RawJSON:       "{}",
		ContentHash:   "hash-502",
		UpdatedAt:     time.Now().UTC().Format(time.RFC3339Nano),
	}
	secondID, err := st.UpsertThread(ctx, rateLimitPR)
	if err != nil {
		t.Fatalf("seed second thread: %v", err)
	}

	// Both vectors have unit length and a dot product of 0.84: above the 0.82
	// threshold used here, below highConfidenceEdgeScore.
	// NOTE(review): assumes the edge score is cosine similarity — confirm.
	vectors := []store.ThreadVector{
		{ThreadID: firstID, Vector: []float64{1, 0}},
		{ThreadID: secondID, Vector: []float64{0.84, 0.5425863986500217}},
	}
	opts := clusterBuildOptions{
		Threshold:          0.82,
		MinSize:            2,
		MaxClusterSize:     defaultClusterMaxSize,
		Fanout:             16,
		CrossKindThreshold: defaultCrossKindMinScore,
	}
	inputs, edgeCount, err := buildDurableClusterInputs(ctx, st, repoID, vectors, opts)
	if err != nil {
		t.Fatalf("build inputs: %v", err)
	}
	if edgeCount != 0 || len(inputs) != 0 {
		t.Fatalf("weak generic title edge should be pruned, edges=%d inputs=%#v", edgeCount, inputs)
	}
}
func TestKeepTopEdgesKeepsOneSidedNearestNeighbors(t *testing.T) {
edges := keepTopEdges([]clusterer.Edge{
{LeftThreadID: 1, RightThreadID: 2, Score: 0.95},