fix(cluster): emit ghcrawl-shaped durable clusters

This commit is contained in:
Vincent Koc 2026-04-28 12:40:40 -07:00
parent 832cd09dc0
commit 6e9f4356b0
No known key found for this signature in database
6 changed files with 113 additions and 43 deletions

View File

@ -35,7 +35,7 @@ gitcrawl tui owner/repo
```
`gitcrawl clusters`, `gitcrawl durable-clusters`, and `gitcrawl tui` show active primary cluster memberships by default. Pass `--include-closed` to inspect closed rows and historical secondary memberships.
`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--threshold 0.80`, `--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges also need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters.
`gitcrawl cluster` and `gitcrawl refresh` build ghcrawl-shaped durable clusters by default (`--threshold 0.80`, `--min-size 1`, `--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`): every active vector-backed thread is represented, singleton rows use `singleton_orphan`, multi-member rows use `duplicate_candidate`, and stable IDs are derived from the representative thread. They also add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters.
`gitcrawl tui` infers the most recently updated local repository when `owner/repo` is omitted. `serve` is intentionally not part of `gitcrawl`.
`gitcrawl sync` fetches open issues and pull requests by default. Pass `--state all` or `--state closed` for explicit backfill workflows; incremental open syncs with `--since` also sweep recently closed items so local open state does not rot.
The TUI starts at `--min-size 5` so maintainer-significant active clusters are visible first; pass `--min-size 1` to include singletons. Mouse support is built in: click rows, scroll panes with the wheel, and right-click for copy, sort, filter, jump, link, neighbor, local close/reopen, and member triage actions. Press `a` to open the same action menu from the keyboard, `#` to jump directly to an issue or PR number, `p` to switch between repositories already present in the local store, or `n` to load neighbors for the selected issue or PR. Pressing Enter in the members pane also loads neighbors before opening the detail view. The TUI quietly refreshes from the local store every 15 seconds.

View File

@ -2,8 +2,6 @@ package cli
import (
"context"
"crypto/sha256"
"encoding/hex"
"encoding/json"
"errors"
"flag"
@ -252,7 +250,7 @@ func (a *App) runRefresh(ctx context.Context, args []string) error {
state := fs.String("state", "", "GitHub issue state: open|closed|all; default open")
limitRaw := fs.String("limit", "", "maximum sync or embedding rows")
thresholdRaw := fs.String("threshold", fmt.Sprintf("%.2f", defaultClusterThreshold), "minimum cluster cosine score")
minSizeRaw := fs.String("min-size", "2", "minimum cluster member count")
minSizeRaw := fs.String("min-size", "1", "minimum cluster member count")
maxClusterSizeRaw := fs.String("max-cluster-size", strconv.Itoa(defaultClusterMaxSize), "maximum members per generated cluster")
fanoutRaw := fs.String("k", strconv.Itoa(defaultClusterFanout), "nearest-neighbor fanout per thread")
crossKindThresholdRaw := fs.String("cross-kind-threshold", fmt.Sprintf("%.2f", defaultCrossKindMinScore), "minimum score for issue/pull request edges")
@ -547,7 +545,7 @@ func (a *App) runCluster(ctx context.Context, args []string) error {
fs := flag.NewFlagSet("cluster", flag.ContinueOnError)
fs.SetOutput(io.Discard)
thresholdRaw := fs.String("threshold", fmt.Sprintf("%.2f", defaultClusterThreshold), "minimum cosine score")
minSizeRaw := fs.String("min-size", "2", "minimum cluster member count")
minSizeRaw := fs.String("min-size", "1", "minimum cluster member count")
maxClusterSizeRaw := fs.String("max-cluster-size", strconv.Itoa(defaultClusterMaxSize), "maximum members per generated cluster")
fanoutRaw := fs.String("k", strconv.Itoa(defaultClusterFanout), "nearest-neighbor fanout per thread")
crossKindThresholdRaw := fs.String("cross-kind-threshold", fmt.Sprintf("%.2f", defaultCrossKindMinScore), "minimum score for issue/pull request edges")
@ -1937,7 +1935,7 @@ func parseClusterShapeOptions(command, maxClusterSizeRaw, fanoutRaw, crossKindTh
func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int64, storedVectors []store.ThreadVector, options clusterBuildOptions) ([]store.DurableClusterInput, int, error) {
if options.MinSize <= 0 {
options.MinSize = 2
options.MinSize = 1
}
if options.MaxClusterSize <= 0 {
options.MaxClusterSize = defaultClusterMaxSize
@ -2005,19 +2003,26 @@ func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int6
right := threads[builtCluster.Members[j]]
return left.Number < right.Number
})
rep := threads[builtCluster.RepresentativeThreadID]
identity := store.HumanKeyForValue(fmt.Sprintf("repo:%d:cluster-representative:%d", repoID, builtCluster.RepresentativeThreadID))
clusterType := "duplicate_candidate"
if len(builtCluster.Members) == 1 {
clusterType = "singleton_orphan"
}
input := store.DurableClusterInput{
StableKey: durableClusterStableKey(builtCluster.Members, threads),
StableSlug: durableClusterSlug(builtCluster.Members, threads),
StableKey: identity.Hash,
StableSlug: store.HumanKeyStableSlug(identity),
ClusterType: clusterType,
RepresentativeThreadID: builtCluster.RepresentativeThreadID,
Title: rep.Title,
Title: "Cluster " + identity.Slug,
Members: make([]store.DurableClusterMemberInput, 0, len(builtCluster.Members)),
}
for _, threadID := range builtCluster.Members {
role := "member"
role := "related"
var scorePtr *float64
if threadID == builtCluster.RepresentativeThreadID {
role = "representative"
role = "canonical"
scoreCopy := 1.0
scorePtr = &scoreCopy
} else if score, ok := pairScores[threadIDPairKey(threadID, builtCluster.RepresentativeThreadID)]; ok {
scoreCopy := score
scorePtr = &scoreCopy
@ -2194,23 +2199,6 @@ func clusterRepository(ctx context.Context, st *store.Store, repoID int64, store
}, nil
}
// durableClusterStableKey builds the textual identity of a cluster from its
// member list: the prefix "numbers:" followed by one comma-separated token per
// entry of threadIDs, in the order given.
//
// A member's token is its thread number when the thread is present in threads
// and that number is positive; otherwise the raw thread ID is used instead.
func durableClusterStableKey(threadIDs []int64, threads map[int64]store.Thread) string {
	var key strings.Builder
	key.WriteString("numbers:")
	for i, threadID := range threadIDs {
		if i > 0 {
			key.WriteByte(',')
		}
		// Default to the raw ID; prefer the thread number when one is known.
		token := strconv.FormatInt(threadID, 10)
		if known, found := threads[threadID]; found && known.Number > 0 {
			token = strconv.Itoa(known.Number)
		}
		key.WriteString(token)
	}
	return key.String()
}
// durableClusterSlug returns "cluster-" followed by the first 12 hex characters
// of the SHA-256 digest of the cluster's stable key (see
// durableClusterStableKey).
func durableClusterSlug(threadIDs []int64, threads map[int64]store.Thread) string {
	stableKey := durableClusterStableKey(threadIDs, threads)
	digest := sha256.Sum256([]byte(stableKey))
	hexDigest := hex.EncodeToString(digest[:])
	return "cluster-" + hexDigest[:12]
}
func threadIDPairKey(left, right int64) string {
if left > right {
left, right = right, left

View File

@ -9,6 +9,7 @@ import (
"net/http/httptest"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"testing"
@ -851,7 +852,7 @@ func TestClusterCommandPersistsDurableClusters(t *testing.T) {
if err := run.Run(ctx, []string{"--config", configPath, "cluster", "openclaw/openclaw", "--threshold", "0.90", "--json"}); err != nil {
t.Fatalf("cluster: %v", err)
}
if !strings.Contains(stdout.String(), `"cluster_count": 1`) {
if !strings.Contains(stdout.String(), `"cluster_count": 2`) {
t.Fatalf("cluster output = %q", stdout.String())
}
st, err = store.Open(ctx, dbPath)
@ -863,8 +864,13 @@ func TestClusterCommandPersistsDurableClusters(t *testing.T) {
if err != nil {
t.Fatalf("list clusters: %v", err)
}
if len(clusters) != 1 || clusters[0].MemberCount != 2 {
t.Fatalf("expected one durable cluster, got %#v", clusters)
memberCounts := []int{}
for _, cluster := range clusters {
memberCounts = append(memberCounts, cluster.MemberCount)
}
sort.Ints(memberCounts)
if len(memberCounts) != 2 || memberCounts[0] != 1 || memberCounts[1] != 2 {
t.Fatalf("expected duplicate cluster plus singleton, got %#v", clusters)
}
}
@ -1321,7 +1327,7 @@ func TestRefreshEmbedsAndClustersWithoutSync(t *testing.T) {
if !strings.Contains(out, `"embedded": 3`) {
t.Fatalf("refresh did not embed rows: %q", out)
}
if !strings.Contains(out, `"cluster_count": 1`) {
if !strings.Contains(out, `"cluster_count": 2`) {
t.Fatalf("refresh did not persist cluster: %q", out)
}
@ -1334,8 +1340,13 @@ func TestRefreshEmbedsAndClustersWithoutSync(t *testing.T) {
if err != nil {
t.Fatalf("list clusters: %v", err)
}
if len(clusters) != 1 || clusters[0].MemberCount != 2 {
t.Fatalf("expected one durable cluster, got %#v", clusters)
memberCounts := []int{}
for _, cluster := range clusters {
memberCounts = append(memberCounts, cluster.MemberCount)
}
sort.Ints(memberCounts)
if len(memberCounts) != 2 || memberCounts[0] != 1 || memberCounts[1] != 2 {
t.Fatalf("expected duplicate cluster plus singleton, got %#v", clusters)
}
}

View File

@ -72,6 +72,7 @@ type ClusterMemberOverride struct {
type DurableClusterInput struct {
StableKey string
StableSlug string
ClusterType string
RepresentativeThreadID int64
Title string
Members []DurableClusterMemberInput
@ -725,6 +726,15 @@ func (s *Store) SaveDurableClusters(ctx context.Context, repoID int64, inputs []
return err
}
}
if len(inputs) > 0 {
if _, err := tx.q().ExecContext(ctx, `
delete from cluster_groups
where repo_id = ?
and cluster_type = 'similarity'
`, repoID); err != nil {
return fmt.Errorf("delete legacy similarity clusters: %w", err)
}
}
if _, err := tx.q().ExecContext(ctx, `
update cluster_runs
set finished_at = ?, stats_json = ?
@ -945,12 +955,16 @@ func (s *Store) upsertDurableCluster(ctx context.Context, repoID, runID int64, i
if stableSlug == "" {
stableSlug = stableKey
}
clusterType := strings.TrimSpace(input.ClusterType)
if clusterType == "" {
clusterType = "duplicate_candidate"
}
var clusterID int64
if err := s.q().QueryRowContext(ctx, `
insert into cluster_groups(
repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at
)
values(?, ?, ?, 'active', 'similarity', ?, ?, ?, ?)
values(?, ?, ?, 'active', ?, ?, ?, ?, ?)
on conflict(repo_id, stable_key) do update set
stable_slug = excluded.stable_slug,
cluster_type = excluded.cluster_type,
@ -961,7 +975,7 @@ func (s *Store) upsertDurableCluster(ctx context.Context, repoID, runID int64, i
title = excluded.title,
updated_at = excluded.updated_at
returning id
`, repoID, stableKey, stableSlug, nullInt(input.RepresentativeThreadID), nullString(input.Title), now, now).Scan(&clusterID); err != nil {
`, repoID, stableKey, stableSlug, clusterType, nullInt(input.RepresentativeThreadID), nullString(input.Title), now, now).Scan(&clusterID); err != nil {
return 0, fmt.Errorf("upsert durable cluster: %w", err)
}
if _, err := s.q().ExecContext(ctx, `

View File

@ -2,7 +2,10 @@ package store
import (
"crypto/sha256"
"encoding/hex"
"fmt"
"strconv"
"strings"
)
var humanKeyWords = []string{
@ -44,10 +47,51 @@ func clusterHumanName(repoID, representativeThreadID, clusterID int64) string {
if representativeThreadID != 0 {
key = fmt.Sprintf("repo:%d:cluster-representative:%d", repoID, representativeThreadID)
}
hash := sha256.Sum256([]byte(key))
return fmt.Sprintf("%s-%s-%s",
humanKeyWords[int(hash[0])%len(humanKeyWords)],
humanKeyWords[int(hash[1])%len(humanKeyWords)],
humanKeyWords[int(hash[2])%len(humanKeyWords)],
)
return HumanKeyForValue(key).Slug
}
// HumanKey is a human-readable identity derived from a hash string by
// HumanKeyFromHash.
type HumanKey struct {
	// Hash is the lowercased hash string the key was derived from.
	Hash string
	// Slug is three entries of humanKeyWords joined by hyphens, selected by
	// the hex digit pairs at offsets 0, 2 and 4 of Hash.
	Slug string
	// Checksum is a four-character suffix: hex digits 6 through 11 of Hash
	// rendered in base 36 (zero-padded or truncated to the last four
	// characters), or "0000" when those digits do not parse.
	Checksum string
}
// StableHash returns the SHA-256 digest of value encoded as a 64-character
// lowercase hexadecimal string.
func StableHash(value string) string {
	digest := sha256.Sum256([]byte(value))
	encoded := make([]byte, hex.EncodedLen(len(digest)))
	hex.Encode(encoded, digest[:])
	return string(encoded)
}
// HumanKeyForValue hashes value with StableHash and converts the resulting
// digest into a HumanKey via HumanKeyFromHash.
func HumanKeyForValue(value string) HumanKey {
	digest := StableHash(value)
	return HumanKeyFromHash(digest)
}
// HumanKeyFromHash derives a HumanKey from a hexadecimal hash string.
//
// The hash is lowercased and stored in Hash. The hex digit pairs at offsets 0,
// 2 and 4 each select one entry of humanKeyWords (value modulo the list
// length); the three words joined by hyphens form Slug. Hex digits 6 through
// 11 are rendered in base 36 and zero-padded or truncated to their last four
// characters to form Checksum.
//
// Malformed input never panics: a field that lies beyond the end of the hash,
// or that is not plain unsigned hexadecimal (including a leading sign), selects
// word 0 for the slug or yields the checksum "0000".
func HumanKeyFromHash(hash string) HumanKey {
	normalized := strings.ToLower(hash)

	// hexField parses normalized[start:end] as unsigned hex. It reports false
	// when the hash is too short to contain the field or the digits are
	// invalid. ParseUint is used deliberately: it rejects "+"/"-" signs, so a
	// field can never produce a negative word index.
	hexField := func(start, end int) (uint64, bool) {
		if end > len(normalized) {
			return 0, false
		}
		value, err := strconv.ParseUint(normalized[start:end], 16, 64)
		if err != nil {
			return 0, false
		}
		return value, true
	}

	// word maps the hex pair at offset to a dictionary word; an unusable pair
	// leaves value at 0 and therefore selects the first word.
	word := func(offset int) string {
		value, _ := hexField(offset, offset+2)
		return humanKeyWords[value%uint64(len(humanKeyWords))]
	}

	checksum := "0000"
	if value, ok := hexField(6, 12); ok {
		checksum = strconv.FormatUint(value, 36)
		// Six hex digits need at most five base-36 digits; normalize the
		// width to exactly four characters.
		switch pad := 4 - len(checksum); {
		case pad > 0:
			checksum = strings.Repeat("0", pad) + checksum
		case pad < 0:
			checksum = checksum[len(checksum)-4:]
		}
	}

	return HumanKey{
		Hash:     normalized,
		Slug:     fmt.Sprintf("%s-%s-%s", word(0), word(2), word(4)),
		Checksum: checksum,
	}
}
// HumanKeyStableSlug returns the key's slug and checksum joined by a hyphen,
// in the form "<slug>-<checksum>".
func HumanKeyStableSlug(key HumanKey) string {
	return strings.Join([]string{key.Slug, key.Checksum}, "-")
}

View File

@ -0,0 +1,13 @@
package store
import "testing"
// TestHumanKeyForValueMatchesGhcrawlRepresentativeIdentity pins the hash and
// stable slug produced for a fixed representative-identity string, so any
// change to the hashing or word-selection scheme is caught.
func TestHumanKeyForValueMatchesGhcrawlRepresentativeIdentity(t *testing.T) {
	const (
		wantHash = "e77f18999d72cc6d27c5d3d0aa2c02cdc8cad3c1be077feb70062bc55eae98fd"
		wantSlug = "usage-matrix-binary-zrzm"
	)
	key := HumanKeyForValue("repo:1:cluster-representative:546")
	if got := key.Hash; got != wantHash {
		t.Fatalf("hash = %q", got)
	}
	if got := HumanKeyStableSlug(key); got != wantSlug {
		t.Fatalf("stable slug = %q", got)
	}
}