diff --git a/README.md b/README.md index 6c43010..74b5e94 100644 --- a/README.md +++ b/README.md @@ -35,7 +35,7 @@ gitcrawl tui owner/repo ``` `gitcrawl clusters`, `gitcrawl durable-clusters`, and `gitcrawl tui` show active primary cluster memberships by default. Pass `--include-closed` to inspect closed rows and historical secondary memberships. -`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--threshold 0.80`, `--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges also need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters. +`gitcrawl cluster` and `gitcrawl refresh` build ghcrawl-shaped durable clusters by default (`--threshold 0.80`, `--min-size 1`, `--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`): every active vector-backed thread is represented, singleton rows use `singleton_orphan`, multi-member rows use `duplicate_candidate`, and stable IDs are derived from the representative thread. They also add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters. `gitcrawl tui` infers the most recently updated local repository when `owner/repo` is omitted. `serve` is intentionally not part of `gitcrawl`. `gitcrawl sync` fetches open issues and pull requests by default. Pass `--state all` or `--state closed` for explicit backfill workflows; incremental open syncs with `--since` also sweep recently closed items so local open state does not rot. The TUI starts at `--min-size 5` so maintainer-significant active clusters are visible first; pass `--min-size 1` to include singletons. Mouse support is built in: click rows, wheel panes, and right-click for copy, sort, filter, jump, link, neighbor, local close/reopen, and member triage actions. Press `a` to open the same action menu from the keyboard, `#` to jump directly to an issue or PR number, `p` to switch between repositories already present in the local store, or `n` to load neighbors for the selected issue or PR. Enter from the members pane also loads neighbors before opening detail. The TUI quietly refreshes from the local store every 15 seconds. diff --git a/internal/cli/app.go b/internal/cli/app.go index f233279..2a5bf13 100644 --- a/internal/cli/app.go +++ b/internal/cli/app.go @@ -2,8 +2,6 @@ package cli import ( "context" - "crypto/sha256" - "encoding/hex" "encoding/json" "errors" "flag" @@ -252,7 +250,7 @@ func (a *App) runRefresh(ctx context.Context, args []string) error { state := fs.String("state", "", "GitHub issue state: open|closed|all; default open") limitRaw := fs.String("limit", "", "maximum sync or embedding rows") thresholdRaw := fs.String("threshold", fmt.Sprintf("%.2f", defaultClusterThreshold), "minimum cluster cosine score") - minSizeRaw := fs.String("min-size", "2", "minimum cluster member count") + minSizeRaw := fs.String("min-size", "1", "minimum cluster member count") maxClusterSizeRaw := fs.String("max-cluster-size", strconv.Itoa(defaultClusterMaxSize), "maximum members per generated cluster") fanoutRaw := fs.String("k", strconv.Itoa(defaultClusterFanout), "nearest-neighbor fanout per thread") crossKindThresholdRaw := fs.String("cross-kind-threshold", fmt.Sprintf("%.2f", defaultCrossKindMinScore), "minimum score for issue/pull request edges") @@ -547,7 +545,7 @@ func (a *App) runCluster(ctx context.Context, args []string) error { fs := flag.NewFlagSet("cluster", flag.ContinueOnError) fs.SetOutput(io.Discard) thresholdRaw := fs.String("threshold", fmt.Sprintf("%.2f", defaultClusterThreshold), "minimum cosine score") - minSizeRaw := fs.String("min-size", "2", "minimum cluster member count") + minSizeRaw := fs.String("min-size", "1", "minimum cluster member count") maxClusterSizeRaw := fs.String("max-cluster-size", strconv.Itoa(defaultClusterMaxSize), "maximum members per generated cluster") fanoutRaw := fs.String("k", strconv.Itoa(defaultClusterFanout), "nearest-neighbor fanout per thread") crossKindThresholdRaw := fs.String("cross-kind-threshold", fmt.Sprintf("%.2f", defaultCrossKindMinScore), "minimum score for issue/pull request edges") @@ -1937,7 +1935,7 @@ func parseClusterShapeOptions(command, maxClusterSizeRaw, fanoutRaw, crossKindTh func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int64, storedVectors []store.ThreadVector, options clusterBuildOptions) ([]store.DurableClusterInput, int, error) { if options.MinSize <= 0 { - options.MinSize = 2 + options.MinSize = 1 } if options.MaxClusterSize <= 0 { options.MaxClusterSize = defaultClusterMaxSize @@ -2005,19 +2003,26 @@ func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int6 right := threads[builtCluster.Members[j]] return left.Number < right.Number }) - rep := threads[builtCluster.RepresentativeThreadID] + identity := store.HumanKeyForValue(fmt.Sprintf("repo:%d:cluster-representative:%d", repoID, builtCluster.RepresentativeThreadID)) + clusterType := "duplicate_candidate" + if len(builtCluster.Members) == 1 { + clusterType = "singleton_orphan" + } input := store.DurableClusterInput{ - StableKey: durableClusterStableKey(builtCluster.Members, threads), - StableSlug: durableClusterSlug(builtCluster.Members, threads), + StableKey: identity.Hash, + StableSlug: store.HumanKeyStableSlug(identity), + ClusterType: clusterType, RepresentativeThreadID: builtCluster.RepresentativeThreadID, - Title: rep.Title, + Title: "Cluster " + identity.Slug, Members: make([]store.DurableClusterMemberInput, 0, len(builtCluster.Members)), } for _, threadID := range builtCluster.Members { - role := "member" + role := "related" var scorePtr *float64 if threadID == builtCluster.RepresentativeThreadID { - role = "representative" + role = "canonical" + scoreCopy := 1.0 + scorePtr = &scoreCopy } else if score, ok := pairScores[threadIDPairKey(threadID, builtCluster.RepresentativeThreadID)]; ok { scoreCopy := score scorePtr = &scoreCopy @@ -2194,23 +2199,6 @@ func clusterRepository(ctx context.Context, st *store.Store, repoID int64, store }, nil } -func durableClusterStableKey(threadIDs []int64, threads map[int64]store.Thread) string { - parts := make([]string, 0, len(threadIDs)) - for _, id := range threadIDs { - if thread, ok := threads[id]; ok && thread.Number > 0 { - parts = append(parts, strconv.Itoa(thread.Number)) - continue - } - parts = append(parts, strconv.FormatInt(id, 10)) - } - return "numbers:" + strings.Join(parts, ",") -} - -func durableClusterSlug(threadIDs []int64, threads map[int64]store.Thread) string { - sum := sha256.Sum256([]byte(durableClusterStableKey(threadIDs, threads))) - return "cluster-" + hex.EncodeToString(sum[:])[:12] -} - func threadIDPairKey(left, right int64) string { if left > right { left, right = right, left diff --git a/internal/cli/app_test.go b/internal/cli/app_test.go index 10674e6..08530a8 100644 --- a/internal/cli/app_test.go +++ b/internal/cli/app_test.go @@ -9,6 +9,7 @@ import ( "net/http/httptest" "os" "path/filepath" + "sort" "strconv" "strings" "testing" @@ -851,7 +852,7 @@ func TestClusterCommandPersistsDurableClusters(t *testing.T) { if err := run.Run(ctx, []string{"--config", configPath, "cluster", "openclaw/openclaw", "--threshold", "0.90", "--json"}); err != nil { t.Fatalf("cluster: %v", err) } - if !strings.Contains(stdout.String(), `"cluster_count": 1`) { + if !strings.Contains(stdout.String(), `"cluster_count": 2`) { t.Fatalf("cluster output = %q", stdout.String()) } st, err = store.Open(ctx, dbPath) @@ -863,8 +864,13 @@ func TestClusterCommandPersistsDurableClusters(t *testing.T) { if err != nil { t.Fatalf("list clusters: %v", err) } - if len(clusters) != 1 || clusters[0].MemberCount != 2 { - t.Fatalf("expected one durable cluster, got %#v", clusters) + memberCounts := []int{} + for _, cluster := range clusters { + memberCounts = append(memberCounts, cluster.MemberCount) + } + sort.Ints(memberCounts) + if len(memberCounts) != 2 || memberCounts[0] != 1 || memberCounts[1] != 2 { + t.Fatalf("expected duplicate cluster plus singleton, got %#v", clusters) } } @@ -1321,7 +1327,7 @@ func TestRefreshEmbedsAndClustersWithoutSync(t *testing.T) { if !strings.Contains(out, `"embedded": 3`) { t.Fatalf("refresh did not embed rows: %q", out) } - if !strings.Contains(out, `"cluster_count": 1`) { + if !strings.Contains(out, `"cluster_count": 2`) { t.Fatalf("refresh did not persist cluster: %q", out) } @@ -1334,8 +1340,13 @@ func TestRefreshEmbedsAndClustersWithoutSync(t *testing.T) { if err != nil { t.Fatalf("list clusters: %v", err) } - if len(clusters) != 1 || clusters[0].MemberCount != 2 { - t.Fatalf("expected one durable cluster, got %#v", clusters) + memberCounts := []int{} + for _, cluster := range clusters { + memberCounts = append(memberCounts, cluster.MemberCount) + } + sort.Ints(memberCounts) + if len(memberCounts) != 2 || memberCounts[0] != 1 || memberCounts[1] != 2 { + t.Fatalf("expected duplicate cluster plus singleton, got %#v", clusters) } } diff --git a/internal/store/clusters.go b/internal/store/clusters.go index ab62f96..a66992d 100644 --- a/internal/store/clusters.go +++ b/internal/store/clusters.go @@ -72,6 +72,7 @@ type ClusterMemberOverride struct { type DurableClusterInput struct { StableKey string StableSlug string + ClusterType string RepresentativeThreadID int64 Title string Members []DurableClusterMemberInput @@ -725,6 +726,15 @@ func (s *Store) SaveDurableClusters(ctx context.Context, repoID int64, inputs [] return err } } + if len(inputs) > 0 { + if _, err := tx.q().ExecContext(ctx, ` + delete from cluster_groups + where repo_id = ? + and cluster_type = 'similarity' + `, repoID); err != nil { + return fmt.Errorf("delete legacy similarity clusters: %w", err) + } + } if _, err := tx.q().ExecContext(ctx, ` update cluster_runs set finished_at = ?, stats_json = ? @@ -945,12 +955,16 @@ func (s *Store) upsertDurableCluster(ctx context.Context, repoID, runID int64, i if stableSlug == "" { stableSlug = stableKey } + clusterType := strings.TrimSpace(input.ClusterType) + if clusterType == "" { + clusterType = "duplicate_candidate" + } var clusterID int64 if err := s.q().QueryRowContext(ctx, ` insert into cluster_groups( repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at ) - values(?, ?, ?, 'active', 'similarity', ?, ?, ?, ?) + values(?, ?, ?, 'active', ?, ?, ?, ?, ?) on conflict(repo_id, stable_key) do update set stable_slug = excluded.stable_slug, cluster_type = excluded.cluster_type, @@ -961,7 +975,7 @@ func (s *Store) upsertDurableCluster(ctx context.Context, repoID, runID int64, i title = excluded.title, updated_at = excluded.updated_at returning id - `, repoID, stableKey, stableSlug, nullInt(input.RepresentativeThreadID), nullString(input.Title), now, now).Scan(&clusterID); err != nil { + `, repoID, stableKey, stableSlug, clusterType, nullInt(input.RepresentativeThreadID), nullString(input.Title), now, now).Scan(&clusterID); err != nil { return 0, fmt.Errorf("upsert durable cluster: %w", err) } if _, err := s.q().ExecContext(ctx, ` diff --git a/internal/store/human_key.go b/internal/store/human_key.go index 16a0a17..06759bd 100644 --- a/internal/store/human_key.go +++ b/internal/store/human_key.go @@ -2,7 +2,10 @@ package store import ( "crypto/sha256" + "encoding/hex" "fmt" + "strconv" + "strings" ) var humanKeyWords = []string{ @@ -44,10 +47,51 @@ func clusterHumanName(repoID, representativeThreadID, clusterID int64) string { if representativeThreadID != 0 { key = fmt.Sprintf("repo:%d:cluster-representative:%d", repoID, representativeThreadID) } - hash := sha256.Sum256([]byte(key)) - return fmt.Sprintf("%s-%s-%s", - humanKeyWords[int(hash[0])%len(humanKeyWords)], - humanKeyWords[int(hash[1])%len(humanKeyWords)], - humanKeyWords[int(hash[2])%len(humanKeyWords)], - ) + return HumanKeyForValue(key).Slug +} + +type HumanKey struct { + Hash string + Slug string + Checksum string +} + +func StableHash(value string) string { + sum := sha256.Sum256([]byte(value)) + return hex.EncodeToString(sum[:]) +} + +func HumanKeyForValue(value string) HumanKey { + return HumanKeyFromHash(StableHash(value)) +} + +func HumanKeyFromHash(hash string) HumanKey { + normalized := strings.ToLower(hash) + index := func(offset int) int { + value, err := strconv.ParseInt(normalized[offset:offset+2], 16, 64) + if err != nil { + return 0 + } + return int(value) % len(humanKeyWords) + } + checksumValue, err := strconv.ParseInt(normalized[6:12], 16, 64) + checksum := "0000" + if err == nil { + checksum = strconv.FormatInt(checksumValue, 36) + if len(checksum) < 4 { + checksum = strings.Repeat("0", 4-len(checksum)) + checksum + } + if len(checksum) > 4 { + checksum = checksum[len(checksum)-4:] + } + } + return HumanKey{ + Hash: normalized, + Slug: fmt.Sprintf("%s-%s-%s", humanKeyWords[index(0)], humanKeyWords[index(2)], humanKeyWords[index(4)]), + Checksum: checksum, + } +} + +func HumanKeyStableSlug(key HumanKey) string { + return key.Slug + "-" + key.Checksum } diff --git a/internal/store/human_key_test.go b/internal/store/human_key_test.go new file mode 100644 index 0000000..477215e --- /dev/null +++ b/internal/store/human_key_test.go @@ -0,0 +1,13 @@ +package store + +import "testing" + +func TestHumanKeyForValueMatchesGhcrawlRepresentativeIdentity(t *testing.T) { + key := HumanKeyForValue("repo:1:cluster-representative:546") + if key.Hash != "e77f18999d72cc6d27c5d3d0aa2c02cdc8cad3c1be077feb70062bc55eae98fd" { + t.Fatalf("hash = %q", key.Hash) + } + if HumanKeyStableSlug(key) != "usage-matrix-binary-zrzm" { + t.Fatalf("stable slug = %q", HumanKeyStableSlug(key)) + } +}