fix: tighten cluster edge precision
This commit is contained in:
parent
746eda48c6
commit
d60c18c2a9
@ -9,3 +9,4 @@
|
||||
- Refresh clean portable-store checkouts before read-only commands so `search`, `threads`, clusters, and the TUI see freshly published GitHub backup data automatically.
|
||||
- Show active primary cluster memberships by default in `clusters`, `durable-clusters`, and the TUI, with `--include-closed` reserved for historical audit views.
|
||||
- Split generated clusters with bounded nearest-neighbor graph safeguards, GitHub reference evidence, and cross-kind score pruning so weak similarity bridges stop merging unrelated reports into one mega-cluster.
|
||||
- Tighten clustering precision by ignoring ambiguous one-digit prose references and requiring weak embedding edges to share concrete title tokens unless they have high similarity or direct GitHub reference evidence.
|
||||
|
||||
@ -35,7 +35,7 @@ gitcrawl tui owner/repo
|
||||
```
|
||||
|
||||
`gitcrawl clusters`, `gitcrawl durable-clusters`, and `gitcrawl tui` show active primary cluster memberships by default. Pass `--include-closed` to inspect closed rows and historical secondary memberships.
|
||||
`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`.
|
||||
`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges also need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters.
|
||||
`gitcrawl tui` infers the most recently updated local repository when `owner/repo` is omitted. `serve` is intentionally not part of `gitcrawl`.
|
||||
`gitcrawl sync` fetches issues and pull requests in every GitHub state by default. Pass `--state open` or `--state closed` to limit a sync to one state.
|
||||
The TUI starts at `--min-size 5` so maintainer-significant active clusters are visible first; pass `--min-size 1` to include singletons. Mouse support is built in: click rows, wheel panes, and right-click for copy, sort, filter, jump, link, neighbor, local close/reopen, and member triage actions. Press `a` to open the same action menu from the keyboard, `#` to jump directly to an issue or PR number, `p` to switch between repositories already present in the local store, or `n` to load neighbors for the selected issue or PR. Enter from the members pane also loads neighbors before opening detail. The TUI quietly refreshes from the local store every 15 seconds.
|
||||
|
||||
@ -33,11 +33,13 @@ const (
|
||||
defaultClusterMaxSize = 40
|
||||
defaultClusterFanout = 16
|
||||
defaultCrossKindMinScore = 0.93
|
||||
highConfidenceEdgeScore = 0.90
|
||||
weakEdgeMinTitleOverlap = 0.18
|
||||
deterministicRefScore = 0.94
|
||||
sharedRefMaxBucketSize = 8
|
||||
)
|
||||
|
||||
var threadReferencePattern = regexp.MustCompile(`(?i)(?:#|issues/|pull/)(\d+)`)
|
||||
// threadReferencePattern finds issue/PR number references in free text,
// case-insensitively. It has three alternatives, each with its own capture
// group for the number, so exactly one group is non-empty per match:
//
//  1. a qualified `owner/repo#N` reference,
//  2. an `issues/N` or `pull/N` path segment,
//  3. a bare `#N`, which must have at least two digits so that prose such as
//     "the #1 gap" is not mistaken for a thread reference.
//
// NOTE(review): alternative 1 captures the number without checking which
// repository is named — confirm that cross-repository references are meant to
// be treated like local ones.
var threadReferencePattern = regexp.MustCompile(`(?i)(?:\b[\w.-]+/[\w.-]+#(\d+)|(?:issues|pull)/(\d+)|#(\d{2,}))`)
|
||||
// titleTokenPattern matches runs of four or more ASCII letters or digits.
// Shorter words and any punctuation are therefore never treated as title
// tokens.
var titleTokenPattern = regexp.MustCompile(`[A-Za-z0-9]{4,}`)
|
||||
|
||||
type App struct {
|
||||
Stdout io.Writer
|
||||
@ -1966,6 +1968,9 @@ func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int6
|
||||
if score < options.Threshold {
|
||||
continue
|
||||
}
|
||||
if score < highConfidenceEdgeScore && titleTokenOverlap(threads[leftID].Title, threads[rightID].Title) < weakEdgeMinTitleOverlap {
|
||||
continue
|
||||
}
|
||||
if threads[leftID].Kind != threads[rightID].Kind && score < options.CrossKindThreshold {
|
||||
continue
|
||||
}
|
||||
@ -2038,7 +2043,6 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu
|
||||
threadIDByNumber[thread.Number] = node.ThreadID
|
||||
}
|
||||
refIDsByThreadID := make(map[int64]map[int64]bool, len(nodes))
|
||||
threadsByReferencedNumber := map[int][]int64{}
|
||||
for _, node := range nodes {
|
||||
thread := threads[node.ThreadID]
|
||||
refNumbers := referencedThreadNumbers(thread)
|
||||
@ -2047,7 +2051,6 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu
|
||||
if referencedID, ok := threadIDByNumber[number]; ok && referencedID != node.ThreadID {
|
||||
refIDs[referencedID] = true
|
||||
}
|
||||
threadsByReferencedNumber[number] = append(threadsByReferencedNumber[number], node.ThreadID)
|
||||
}
|
||||
refIDsByThreadID[node.ThreadID] = refIDs
|
||||
}
|
||||
@ -2056,27 +2059,14 @@ func addDeterministicReferenceEdges(edges map[string]clusterer.Edge, nodes []clu
|
||||
upsertClusterEdge(edges, threadID, referencedID, deterministicRefScore)
|
||||
}
|
||||
}
|
||||
for _, threadIDs := range threadsByReferencedNumber {
|
||||
if len(threadIDs) < 2 || len(threadIDs) > sharedRefMaxBucketSize {
|
||||
continue
|
||||
}
|
||||
sort.Slice(threadIDs, func(i, j int) bool { return threadIDs[i] < threadIDs[j] })
|
||||
for left := 0; left < len(threadIDs); left++ {
|
||||
for right := left + 1; right < len(threadIDs); right++ {
|
||||
upsertClusterEdge(edges, threadIDs[left], threadIDs[right], deterministicRefScore)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func referencedThreadNumbers(thread store.Thread) map[int]bool {
|
||||
value := thread.Title + "\n" + thread.Body
|
||||
refs := map[int]bool{}
|
||||
for _, match := range threadReferencePattern.FindAllStringSubmatch(value, -1) {
|
||||
if len(match) < 2 {
|
||||
continue
|
||||
}
|
||||
number, err := strconv.Atoi(match[1])
|
||||
numberText := firstNonEmpty(match[1:]...)
|
||||
number, err := strconv.Atoi(numberText)
|
||||
if err != nil || number <= 0 || number == thread.Number {
|
||||
continue
|
||||
}
|
||||
@ -2085,6 +2075,33 @@ func referencedThreadNumbers(thread store.Thread) map[int]bool {
|
||||
return refs
|
||||
}
|
||||
|
||||
func titleTokenOverlap(left, right string) float64 {
|
||||
leftTokens := titleTokenSet(left)
|
||||
rightTokens := titleTokenSet(right)
|
||||
if len(leftTokens) == 0 || len(rightTokens) == 0 {
|
||||
return 0
|
||||
}
|
||||
overlap := 0
|
||||
for token := range leftTokens {
|
||||
if rightTokens[token] {
|
||||
overlap++
|
||||
}
|
||||
}
|
||||
base := len(leftTokens)
|
||||
if len(rightTokens) < base {
|
||||
base = len(rightTokens)
|
||||
}
|
||||
return float64(overlap) / float64(base)
|
||||
}
|
||||
|
||||
func titleTokenSet(value string) map[string]bool {
|
||||
out := map[string]bool{}
|
||||
for _, token := range titleTokenPattern.FindAllString(strings.ToLower(value), -1) {
|
||||
out[token] = true
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func keepTopEdges(edges []clusterer.Edge, fanout int) []clusterer.Edge {
|
||||
if fanout <= 0 || len(edges) == 0 {
|
||||
return edges
|
||||
|
||||
@ -1023,6 +1023,147 @@ func TestBuildDurableClusterInputsKeepsDeterministicReferenceEdges(t *testing.T)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildDurableClusterInputsIgnoresBareOneDigitProseRefs(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
st, err := store.Open(ctx, filepath.Join(t.TempDir(), "gitcrawl.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open store: %v", err)
|
||||
}
|
||||
defer st.Close()
|
||||
repoID, err := st.UpsertRepository(ctx, store.Repository{
|
||||
Owner: "openclaw",
|
||||
Name: "openclaw",
|
||||
FullName: "openclaw/openclaw",
|
||||
RawJSON: "{}",
|
||||
UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("seed repository: %v", err)
|
||||
}
|
||||
firstID, err := st.UpsertThread(ctx, store.Thread{
|
||||
RepoID: repoID,
|
||||
GitHubID: "401",
|
||||
Number: 401,
|
||||
Kind: "pull_request",
|
||||
State: "open",
|
||||
Title: "Background task notification",
|
||||
Body: "This is the #1 UX gap for orchestration.",
|
||||
HTMLURL: "https://github.com/openclaw/openclaw/pull/401",
|
||||
LabelsJSON: "[]",
|
||||
AssigneesJSON: "[]",
|
||||
RawJSON: "{}",
|
||||
ContentHash: "hash-401",
|
||||
UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("seed first thread: %v", err)
|
||||
}
|
||||
secondID, err := st.UpsertThread(ctx, store.Thread{
|
||||
RepoID: repoID,
|
||||
GitHubID: "402",
|
||||
Number: 402,
|
||||
Kind: "pull_request",
|
||||
State: "open",
|
||||
Title: "Plugin config overlay",
|
||||
Body: "This is #1 for locked-down deployments.",
|
||||
HTMLURL: "https://github.com/openclaw/openclaw/pull/402",
|
||||
LabelsJSON: "[]",
|
||||
AssigneesJSON: "[]",
|
||||
RawJSON: "{}",
|
||||
ContentHash: "hash-402",
|
||||
UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("seed second thread: %v", err)
|
||||
}
|
||||
inputs, edgeCount, err := buildDurableClusterInputs(ctx, st, repoID, []store.ThreadVector{
|
||||
{ThreadID: firstID, Vector: []float64{1, 0}},
|
||||
{ThreadID: secondID, Vector: []float64{0, 1}},
|
||||
}, clusterBuildOptions{
|
||||
Threshold: 0.99,
|
||||
MinSize: 2,
|
||||
MaxClusterSize: defaultClusterMaxSize,
|
||||
Fanout: 16,
|
||||
CrossKindThreshold: 0.99,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("build inputs: %v", err)
|
||||
}
|
||||
if edgeCount != 0 || len(inputs) != 0 {
|
||||
t.Fatalf("bare one-digit prose refs should not form evidence edges, edges=%d inputs=%#v", edgeCount, inputs)
|
||||
}
|
||||
}
|
||||
|
||||
func TestBuildDurableClusterInputsPrunesWeakGenericTitleEdges(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
st, err := store.Open(ctx, filepath.Join(t.TempDir(), "gitcrawl.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open store: %v", err)
|
||||
}
|
||||
defer st.Close()
|
||||
repoID, err := st.UpsertRepository(ctx, store.Repository{
|
||||
Owner: "openclaw",
|
||||
Name: "openclaw",
|
||||
FullName: "openclaw/openclaw",
|
||||
RawJSON: "{}",
|
||||
UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("seed repository: %v", err)
|
||||
}
|
||||
firstID, err := st.UpsertThread(ctx, store.Thread{
|
||||
RepoID: repoID,
|
||||
GitHubID: "501",
|
||||
Number: 501,
|
||||
Kind: "pull_request",
|
||||
State: "open",
|
||||
Title: "fix: improve error handling and logging for security-critical operations",
|
||||
HTMLURL: "https://github.com/openclaw/openclaw/pull/501",
|
||||
LabelsJSON: "[]",
|
||||
AssigneesJSON: "[]",
|
||||
RawJSON: "{}",
|
||||
ContentHash: "hash-501",
|
||||
UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("seed first thread: %v", err)
|
||||
}
|
||||
secondID, err := st.UpsertThread(ctx, store.Thread{
|
||||
RepoID: repoID,
|
||||
GitHubID: "502",
|
||||
Number: 502,
|
||||
Kind: "pull_request",
|
||||
State: "open",
|
||||
Title: "fix(gateway): isolate control-plane write rate limits by connection",
|
||||
HTMLURL: "https://github.com/openclaw/openclaw/pull/502",
|
||||
LabelsJSON: "[]",
|
||||
AssigneesJSON: "[]",
|
||||
RawJSON: "{}",
|
||||
ContentHash: "hash-502",
|
||||
UpdatedAt: time.Now().UTC().Format(time.RFC3339Nano),
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("seed second thread: %v", err)
|
||||
}
|
||||
vectors := []store.ThreadVector{
|
||||
{ThreadID: firstID, Vector: []float64{1, 0}},
|
||||
{ThreadID: secondID, Vector: []float64{0.84, 0.5425863986500217}},
|
||||
}
|
||||
inputs, edgeCount, err := buildDurableClusterInputs(ctx, st, repoID, vectors, clusterBuildOptions{
|
||||
Threshold: 0.82,
|
||||
MinSize: 2,
|
||||
MaxClusterSize: defaultClusterMaxSize,
|
||||
Fanout: 16,
|
||||
CrossKindThreshold: defaultCrossKindMinScore,
|
||||
})
|
||||
if err != nil {
|
||||
t.Fatalf("build inputs: %v", err)
|
||||
}
|
||||
if edgeCount != 0 || len(inputs) != 0 {
|
||||
t.Fatalf("weak generic title edge should be pruned, edges=%d inputs=%#v", edgeCount, inputs)
|
||||
}
|
||||
}
|
||||
|
||||
func TestKeepTopEdgesKeepsOneSidedNearestNeighbors(t *testing.T) {
|
||||
edges := keepTopEdges([]clusterer.Edge{
|
||||
{LeftThreadID: 1, RightThreadID: 2, Score: 0.95},
|
||||
|
||||
Loading…
Reference in New Issue
Block a user