fix(cluster): emit ghcrawl-shaped durable clusters
This commit is contained in:
parent
832cd09dc0
commit
6e9f4356b0
@ -35,7 +35,7 @@ gitcrawl tui owner/repo
|
||||
```
|
||||
|
||||
`gitcrawl clusters`, `gitcrawl durable-clusters`, and `gitcrawl tui` show active primary cluster memberships by default. Pass `--include-closed` to inspect closed rows and historical secondary memberships.
|
||||
`gitcrawl cluster` and `gitcrawl refresh` build bounded nearest-neighbor clusters by default (`--threshold 0.80`, `--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`) and add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges also need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters.
|
||||
`gitcrawl cluster` and `gitcrawl refresh` build ghcrawl-shaped durable clusters by default (`--threshold 0.80`, `--min-size 1`, `--max-cluster-size 40`, `--k 16`, `--cross-kind-threshold 0.93`): every active vector-backed thread is represented, singleton rows use `singleton_orphan`, multi-member rows use `duplicate_candidate`, and stable IDs are derived from the representative thread. They also add deterministic GitHub reference evidence for direct issue/PR links such as `#123`, `issues/123`, and `pull/123`. Weak embedding edges need concrete title-token overlap unless their similarity is already high, which keeps generic low-confidence bridges from forming unrelated clusters.
|
||||
`gitcrawl tui` infers the most recently updated local repository when `owner/repo` is omitted. `serve` is intentionally not part of `gitcrawl`.
|
||||
`gitcrawl sync` fetches open issues and pull requests by default. Pass `--state all` or `--state closed` for explicit backfill workflows; incremental open syncs with `--since` also sweep recently closed items so local open state does not rot.
|
||||
The TUI starts at `--min-size 5` so maintainer-significant active clusters are visible first; pass `--min-size 1` to include singletons. Mouse support is built in: click rows, wheel panes, and right-click for copy, sort, filter, jump, link, neighbor, local close/reopen, and member triage actions. Press `a` to open the same action menu from the keyboard, `#` to jump directly to an issue or PR number, `p` to switch between repositories already present in the local store, or `n` to load neighbors for the selected issue or PR. Enter from the members pane also loads neighbors before opening detail. The TUI quietly refreshes from the local store every 15 seconds.
|
||||
|
||||
@ -2,8 +2,6 @@ package cli
|
||||
|
||||
import (
|
||||
"context"
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"flag"
|
||||
@ -252,7 +250,7 @@ func (a *App) runRefresh(ctx context.Context, args []string) error {
|
||||
state := fs.String("state", "", "GitHub issue state: open|closed|all; default open")
|
||||
limitRaw := fs.String("limit", "", "maximum sync or embedding rows")
|
||||
thresholdRaw := fs.String("threshold", fmt.Sprintf("%.2f", defaultClusterThreshold), "minimum cluster cosine score")
|
||||
minSizeRaw := fs.String("min-size", "2", "minimum cluster member count")
|
||||
minSizeRaw := fs.String("min-size", "1", "minimum cluster member count")
|
||||
maxClusterSizeRaw := fs.String("max-cluster-size", strconv.Itoa(defaultClusterMaxSize), "maximum members per generated cluster")
|
||||
fanoutRaw := fs.String("k", strconv.Itoa(defaultClusterFanout), "nearest-neighbor fanout per thread")
|
||||
crossKindThresholdRaw := fs.String("cross-kind-threshold", fmt.Sprintf("%.2f", defaultCrossKindMinScore), "minimum score for issue/pull request edges")
|
||||
@ -547,7 +545,7 @@ func (a *App) runCluster(ctx context.Context, args []string) error {
|
||||
fs := flag.NewFlagSet("cluster", flag.ContinueOnError)
|
||||
fs.SetOutput(io.Discard)
|
||||
thresholdRaw := fs.String("threshold", fmt.Sprintf("%.2f", defaultClusterThreshold), "minimum cosine score")
|
||||
minSizeRaw := fs.String("min-size", "2", "minimum cluster member count")
|
||||
minSizeRaw := fs.String("min-size", "1", "minimum cluster member count")
|
||||
maxClusterSizeRaw := fs.String("max-cluster-size", strconv.Itoa(defaultClusterMaxSize), "maximum members per generated cluster")
|
||||
fanoutRaw := fs.String("k", strconv.Itoa(defaultClusterFanout), "nearest-neighbor fanout per thread")
|
||||
crossKindThresholdRaw := fs.String("cross-kind-threshold", fmt.Sprintf("%.2f", defaultCrossKindMinScore), "minimum score for issue/pull request edges")
|
||||
@ -1937,7 +1935,7 @@ func parseClusterShapeOptions(command, maxClusterSizeRaw, fanoutRaw, crossKindTh
|
||||
|
||||
func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int64, storedVectors []store.ThreadVector, options clusterBuildOptions) ([]store.DurableClusterInput, int, error) {
|
||||
if options.MinSize <= 0 {
|
||||
options.MinSize = 2
|
||||
options.MinSize = 1
|
||||
}
|
||||
if options.MaxClusterSize <= 0 {
|
||||
options.MaxClusterSize = defaultClusterMaxSize
|
||||
@ -2005,19 +2003,26 @@ func buildDurableClusterInputs(ctx context.Context, st *store.Store, repoID int6
|
||||
right := threads[builtCluster.Members[j]]
|
||||
return left.Number < right.Number
|
||||
})
|
||||
rep := threads[builtCluster.RepresentativeThreadID]
|
||||
identity := store.HumanKeyForValue(fmt.Sprintf("repo:%d:cluster-representative:%d", repoID, builtCluster.RepresentativeThreadID))
|
||||
clusterType := "duplicate_candidate"
|
||||
if len(builtCluster.Members) == 1 {
|
||||
clusterType = "singleton_orphan"
|
||||
}
|
||||
input := store.DurableClusterInput{
|
||||
StableKey: durableClusterStableKey(builtCluster.Members, threads),
|
||||
StableSlug: durableClusterSlug(builtCluster.Members, threads),
|
||||
StableKey: identity.Hash,
|
||||
StableSlug: store.HumanKeyStableSlug(identity),
|
||||
ClusterType: clusterType,
|
||||
RepresentativeThreadID: builtCluster.RepresentativeThreadID,
|
||||
Title: rep.Title,
|
||||
Title: "Cluster " + identity.Slug,
|
||||
Members: make([]store.DurableClusterMemberInput, 0, len(builtCluster.Members)),
|
||||
}
|
||||
for _, threadID := range builtCluster.Members {
|
||||
role := "member"
|
||||
role := "related"
|
||||
var scorePtr *float64
|
||||
if threadID == builtCluster.RepresentativeThreadID {
|
||||
role = "representative"
|
||||
role = "canonical"
|
||||
scoreCopy := 1.0
|
||||
scorePtr = &scoreCopy
|
||||
} else if score, ok := pairScores[threadIDPairKey(threadID, builtCluster.RepresentativeThreadID)]; ok {
|
||||
scoreCopy := score
|
||||
scorePtr = &scoreCopy
|
||||
@ -2194,23 +2199,6 @@ func clusterRepository(ctx context.Context, st *store.Store, repoID int64, store
|
||||
}, nil
|
||||
}
|
||||
|
||||
func durableClusterStableKey(threadIDs []int64, threads map[int64]store.Thread) string {
|
||||
parts := make([]string, 0, len(threadIDs))
|
||||
for _, id := range threadIDs {
|
||||
if thread, ok := threads[id]; ok && thread.Number > 0 {
|
||||
parts = append(parts, strconv.Itoa(thread.Number))
|
||||
continue
|
||||
}
|
||||
parts = append(parts, strconv.FormatInt(id, 10))
|
||||
}
|
||||
return "numbers:" + strings.Join(parts, ",")
|
||||
}
|
||||
|
||||
func durableClusterSlug(threadIDs []int64, threads map[int64]store.Thread) string {
|
||||
sum := sha256.Sum256([]byte(durableClusterStableKey(threadIDs, threads)))
|
||||
return "cluster-" + hex.EncodeToString(sum[:])[:12]
|
||||
}
|
||||
|
||||
func threadIDPairKey(left, right int64) string {
|
||||
if left > right {
|
||||
left, right = right, left
|
||||
|
||||
@ -9,6 +9,7 @@ import (
|
||||
"net/http/httptest"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
@ -851,7 +852,7 @@ func TestClusterCommandPersistsDurableClusters(t *testing.T) {
|
||||
if err := run.Run(ctx, []string{"--config", configPath, "cluster", "openclaw/openclaw", "--threshold", "0.90", "--json"}); err != nil {
|
||||
t.Fatalf("cluster: %v", err)
|
||||
}
|
||||
if !strings.Contains(stdout.String(), `"cluster_count": 1`) {
|
||||
if !strings.Contains(stdout.String(), `"cluster_count": 2`) {
|
||||
t.Fatalf("cluster output = %q", stdout.String())
|
||||
}
|
||||
st, err = store.Open(ctx, dbPath)
|
||||
@ -863,8 +864,13 @@ func TestClusterCommandPersistsDurableClusters(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("list clusters: %v", err)
|
||||
}
|
||||
if len(clusters) != 1 || clusters[0].MemberCount != 2 {
|
||||
t.Fatalf("expected one durable cluster, got %#v", clusters)
|
||||
memberCounts := []int{}
|
||||
for _, cluster := range clusters {
|
||||
memberCounts = append(memberCounts, cluster.MemberCount)
|
||||
}
|
||||
sort.Ints(memberCounts)
|
||||
if len(memberCounts) != 2 || memberCounts[0] != 1 || memberCounts[1] != 2 {
|
||||
t.Fatalf("expected duplicate cluster plus singleton, got %#v", clusters)
|
||||
}
|
||||
}
|
||||
|
||||
@ -1321,7 +1327,7 @@ func TestRefreshEmbedsAndClustersWithoutSync(t *testing.T) {
|
||||
if !strings.Contains(out, `"embedded": 3`) {
|
||||
t.Fatalf("refresh did not embed rows: %q", out)
|
||||
}
|
||||
if !strings.Contains(out, `"cluster_count": 1`) {
|
||||
if !strings.Contains(out, `"cluster_count": 2`) {
|
||||
t.Fatalf("refresh did not persist cluster: %q", out)
|
||||
}
|
||||
|
||||
@ -1334,8 +1340,13 @@ func TestRefreshEmbedsAndClustersWithoutSync(t *testing.T) {
|
||||
if err != nil {
|
||||
t.Fatalf("list clusters: %v", err)
|
||||
}
|
||||
if len(clusters) != 1 || clusters[0].MemberCount != 2 {
|
||||
t.Fatalf("expected one durable cluster, got %#v", clusters)
|
||||
memberCounts := []int{}
|
||||
for _, cluster := range clusters {
|
||||
memberCounts = append(memberCounts, cluster.MemberCount)
|
||||
}
|
||||
sort.Ints(memberCounts)
|
||||
if len(memberCounts) != 2 || memberCounts[0] != 1 || memberCounts[1] != 2 {
|
||||
t.Fatalf("expected duplicate cluster plus singleton, got %#v", clusters)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@ -72,6 +72,7 @@ type ClusterMemberOverride struct {
|
||||
type DurableClusterInput struct {
|
||||
StableKey string
|
||||
StableSlug string
|
||||
ClusterType string
|
||||
RepresentativeThreadID int64
|
||||
Title string
|
||||
Members []DurableClusterMemberInput
|
||||
@ -725,6 +726,15 @@ func (s *Store) SaveDurableClusters(ctx context.Context, repoID int64, inputs []
|
||||
return err
|
||||
}
|
||||
}
|
||||
if len(inputs) > 0 {
|
||||
if _, err := tx.q().ExecContext(ctx, `
|
||||
delete from cluster_groups
|
||||
where repo_id = ?
|
||||
and cluster_type = 'similarity'
|
||||
`, repoID); err != nil {
|
||||
return fmt.Errorf("delete legacy similarity clusters: %w", err)
|
||||
}
|
||||
}
|
||||
if _, err := tx.q().ExecContext(ctx, `
|
||||
update cluster_runs
|
||||
set finished_at = ?, stats_json = ?
|
||||
@ -945,12 +955,16 @@ func (s *Store) upsertDurableCluster(ctx context.Context, repoID, runID int64, i
|
||||
if stableSlug == "" {
|
||||
stableSlug = stableKey
|
||||
}
|
||||
clusterType := strings.TrimSpace(input.ClusterType)
|
||||
if clusterType == "" {
|
||||
clusterType = "duplicate_candidate"
|
||||
}
|
||||
var clusterID int64
|
||||
if err := s.q().QueryRowContext(ctx, `
|
||||
insert into cluster_groups(
|
||||
repo_id, stable_key, stable_slug, status, cluster_type, representative_thread_id, title, created_at, updated_at
|
||||
)
|
||||
values(?, ?, ?, 'active', 'similarity', ?, ?, ?, ?)
|
||||
values(?, ?, ?, 'active', ?, ?, ?, ?, ?)
|
||||
on conflict(repo_id, stable_key) do update set
|
||||
stable_slug = excluded.stable_slug,
|
||||
cluster_type = excluded.cluster_type,
|
||||
@ -961,7 +975,7 @@ func (s *Store) upsertDurableCluster(ctx context.Context, repoID, runID int64, i
|
||||
title = excluded.title,
|
||||
updated_at = excluded.updated_at
|
||||
returning id
|
||||
`, repoID, stableKey, stableSlug, nullInt(input.RepresentativeThreadID), nullString(input.Title), now, now).Scan(&clusterID); err != nil {
|
||||
`, repoID, stableKey, stableSlug, clusterType, nullInt(input.RepresentativeThreadID), nullString(input.Title), now, now).Scan(&clusterID); err != nil {
|
||||
return 0, fmt.Errorf("upsert durable cluster: %w", err)
|
||||
}
|
||||
if _, err := s.q().ExecContext(ctx, `
|
||||
|
||||
@ -2,7 +2,10 @@ package store
|
||||
|
||||
import (
|
||||
"crypto/sha256"
|
||||
"encoding/hex"
|
||||
"fmt"
|
||||
"strconv"
|
||||
"strings"
|
||||
)
|
||||
|
||||
var humanKeyWords = []string{
|
||||
@ -44,10 +47,51 @@ func clusterHumanName(repoID, representativeThreadID, clusterID int64) string {
|
||||
if representativeThreadID != 0 {
|
||||
key = fmt.Sprintf("repo:%d:cluster-representative:%d", repoID, representativeThreadID)
|
||||
}
|
||||
hash := sha256.Sum256([]byte(key))
|
||||
return fmt.Sprintf("%s-%s-%s",
|
||||
humanKeyWords[int(hash[0])%len(humanKeyWords)],
|
||||
humanKeyWords[int(hash[1])%len(humanKeyWords)],
|
||||
humanKeyWords[int(hash[2])%len(humanKeyWords)],
|
||||
)
|
||||
return HumanKeyForValue(key).Slug
|
||||
}
|
||||
|
||||
// HumanKey bundles a hash with the human-readable identifiers derived from it.
type HumanKey struct {
	// Hash is the lowercased hash string the key was derived from.
	Hash string
	// Slug is three words joined by "-", each chosen from the word list by
	// one of the leading hex bytes of Hash.
	Slug string
	// Checksum is a four-character base-36 suffix derived from Hash; it is
	// "0000" when the relevant hex digits cannot be parsed.
	Checksum string
}
|
||||
|
||||
// StableHash returns the SHA-256 digest of value encoded as a lowercase
// hexadecimal string.
func StableHash(value string) string {
	hasher := sha256.New()
	hasher.Write([]byte(value))
	return hex.EncodeToString(hasher.Sum(nil))
}
|
||||
|
||||
func HumanKeyForValue(value string) HumanKey {
|
||||
return HumanKeyFromHash(StableHash(value))
|
||||
}
|
||||
|
||||
func HumanKeyFromHash(hash string) HumanKey {
|
||||
normalized := strings.ToLower(hash)
|
||||
index := func(offset int) int {
|
||||
value, err := strconv.ParseInt(normalized[offset:offset+2], 16, 64)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return int(value) % len(humanKeyWords)
|
||||
}
|
||||
checksumValue, err := strconv.ParseInt(normalized[6:12], 16, 64)
|
||||
checksum := "0000"
|
||||
if err == nil {
|
||||
checksum = strconv.FormatInt(checksumValue, 36)
|
||||
if len(checksum) < 4 {
|
||||
checksum = strings.Repeat("0", 4-len(checksum)) + checksum
|
||||
}
|
||||
if len(checksum) > 4 {
|
||||
checksum = checksum[len(checksum)-4:]
|
||||
}
|
||||
}
|
||||
return HumanKey{
|
||||
Hash: normalized,
|
||||
Slug: fmt.Sprintf("%s-%s-%s", humanKeyWords[index(0)], humanKeyWords[index(2)], humanKeyWords[index(4)]),
|
||||
Checksum: checksum,
|
||||
}
|
||||
}
|
||||
|
||||
func HumanKeyStableSlug(key HumanKey) string {
|
||||
return key.Slug + "-" + key.Checksum
|
||||
}
|
||||
|
||||
13
internal/store/human_key_test.go
Normal file
13
internal/store/human_key_test.go
Normal file
@ -0,0 +1,13 @@
|
||||
package store
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestHumanKeyForValueMatchesGhcrawlRepresentativeIdentity(t *testing.T) {
|
||||
key := HumanKeyForValue("repo:1:cluster-representative:546")
|
||||
if key.Hash != "e77f18999d72cc6d27c5d3d0aa2c02cdc8cad3c1be077feb70062bc55eae98fd" {
|
||||
t.Fatalf("hash = %q", key.Hash)
|
||||
}
|
||||
if HumanKeyStableSlug(key) != "usage-matrix-binary-zrzm" {
|
||||
t.Fatalf("stable slug = %q", HumanKeyStableSlug(key))
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user