From aa74be7b7968914531a65dbcf85c718ff60e5fc4 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Tue, 21 Apr 2026 21:45:36 -0700 Subject: [PATCH] perf: harden normalized content and add read-path indexes (#35) * perf(store): add read-path indexes for messages and mentions * fix(syncer): sanitize normalized message text before indexing * docs: note query hardening improvements in changelog --- CHANGELOG.md | 2 + go.mod | 1 + go.sum | 2 + internal/store/store.go | 37 +++++++++++++- internal/store/store_test.go | 77 +++++++++++++++++++++++++++++ internal/syncer/records.go | 38 +++++++++++++- internal/syncer/syncer_tail_test.go | 20 ++++++++ 7 files changed, 174 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index be73589..e544f55 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to `discrawl` will be documented in this file. ## 0.4.0 - Unreleased - Git-backed snapshot imports are now much faster on large archives by using import-only SQLite pragmas and bulk-load FTS5 settings during search index rebuilds +- `messages` and `mentions` now use composite read-path indexes so larger archives spend less time sorting/filtering common guild, channel, and author queries +- normalized message text is now sanitized before it reaches SQLite and FTS5, repairing malformed UTF-8 and stripping invisible/control-character noise that can poison search content ## 0.3.0 - 2026-04-21 diff --git a/go.mod b/go.mod index 7969a41..3ce3d90 100644 --- a/go.mod +++ b/go.mod @@ -7,6 +7,7 @@ require ( github.com/gorilla/websocket v1.5.3 github.com/pelletier/go-toml/v2 v2.3.0 github.com/stretchr/testify v1.11.1 + golang.org/x/text v0.35.0 modernc.org/sqlite v1.49.1 ) diff --git a/go.sum b/go.sum index 9403b82..6d0f4c6 100644 --- a/go.sum +++ b/go.sum @@ -39,6 +39,8 @@ golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term 
v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= diff --git a/internal/store/store.go b/internal/store/store.go index aeb1d53..c06dfb4 100644 --- a/internal/store/store.go +++ b/internal/store/store.go @@ -18,7 +18,7 @@ const ( timeLayout = time.RFC3339Nano messageFTSVersion = "2" memberFTSVersion = "1" - storeSchemaVersion = 1 + storeSchemaVersion = 2 ) type Store struct { @@ -199,6 +199,15 @@ func (s *Store) migrate(ctx context.Context) error { if err := s.applyBaselineSchema(ctx); err != nil { return err } + if err := s.setSchemaVersion(ctx, 1); err != nil { + return err + } + currentVersion = 1 + } + if currentVersion < 2 { + if err := s.applyQueryIndexMigration(ctx); err != nil { + return err + } if err := s.setSchemaVersion(ctx, storeSchemaVersion); err != nil { return err } @@ -382,10 +391,15 @@ func (s *Store) applyBaselineSchema(ctx context.Context) error { `create index if not exists idx_members_guild_id on members(guild_id);`, `create index if not exists idx_messages_channel_id on messages(channel_id);`, `create index if not exists idx_messages_guild_id on messages(guild_id);`, + `create index if not exists idx_messages_guild_created_id on messages(guild_id, created_at, id);`, + `create index if not exists idx_messages_channel_created_id on messages(channel_id, created_at, id);`, + `create index if not exists idx_messages_author_created_id on messages(author_id, created_at, id);`, `create index if not exists 
idx_events_message_id on message_events(message_id);`, `create index if not exists idx_attachments_message_id on message_attachments(message_id);`, `create index if not exists idx_attachments_channel_id on message_attachments(channel_id);`, `create index if not exists idx_mentions_message_id on mention_events(message_id);`, + `create index if not exists idx_mentions_guild_event on mention_events(guild_id, event_at, event_id);`, + `create index if not exists idx_mentions_channel_event on mention_events(channel_id, event_at, event_id);`, `create index if not exists idx_mentions_target on mention_events(target_type, target_id, event_at);`, `create index if not exists idx_mentions_author on mention_events(author_id, event_at);`, } @@ -397,6 +411,27 @@ func (s *Store) applyBaselineSchema(ctx context.Context) error { return tx.Commit() } +func (s *Store) applyQueryIndexMigration(ctx context.Context) error { + tx, err := s.db.BeginTx(ctx, nil) + if err != nil { + return err + } + defer rollback(tx) + stmts := []string{ + `create index if not exists idx_messages_guild_created_id on messages(guild_id, created_at, id);`, + `create index if not exists idx_messages_channel_created_id on messages(channel_id, created_at, id);`, + `create index if not exists idx_messages_author_created_id on messages(author_id, created_at, id);`, + `create index if not exists idx_mentions_guild_event on mention_events(guild_id, event_at, event_id);`, + `create index if not exists idx_mentions_channel_event on mention_events(channel_id, event_at, event_id);`, + } + for _, stmt := range stmts { + if _, err := tx.ExecContext(ctx, stmt); err != nil { + return fmt.Errorf("migrate query indexes: %w", err) + } + } + return tx.Commit() +} + func (s *Store) ensureFTSRowIDs(ctx context.Context) error { var version sql.NullString err := s.db.QueryRowContext(ctx, ` diff --git a/internal/store/store_test.go b/internal/store/store_test.go index ad2600f..48ad8a9 100644 --- a/internal/store/store_test.go +++ 
b/internal/store/store_test.go @@ -427,6 +427,83 @@ func TestOpenTightensDBFilePerms(t *testing.T) { require.Equal(t, os.FileMode(0o600), info.Mode().Perm()) } +func TestOpenCreatesQueryIndexes(t *testing.T) { + t.Parallel() + + ctx := context.Background() + s, err := Open(ctx, filepath.Join(t.TempDir(), "discrawl.db")) + require.NoError(t, err) + defer func() { _ = s.Close() }() + + messageIndexes := indexNames(t, ctx, s.DB(), "messages") + require.Contains(t, messageIndexes, "idx_messages_guild_created_id") + require.Contains(t, messageIndexes, "idx_messages_channel_created_id") + require.Contains(t, messageIndexes, "idx_messages_author_created_id") + + mentionIndexes := indexNames(t, ctx, s.DB(), "mention_events") + require.Contains(t, mentionIndexes, "idx_mentions_guild_event") + require.Contains(t, mentionIndexes, "idx_mentions_channel_event") +} + +func TestOpenMigratesLegacyQueryIndexes(t *testing.T) { + t.Parallel() + + ctx := context.Background() + dbPath := filepath.Join(t.TempDir(), "discrawl.db") + + sqlDB, err := sql.Open("sqlite", dbPath) + require.NoError(t, err) + legacy := &Store{db: sqlDB, path: dbPath} + require.NoError(t, legacy.applyBaselineSchema(ctx)) + require.NoError(t, legacy.setSchemaVersion(ctx, 1)) + for _, indexName := range []string{ + "idx_messages_guild_created_id", + "idx_messages_channel_created_id", + "idx_messages_author_created_id", + "idx_mentions_guild_event", + "idx_mentions_channel_event", + } { + _, err = sqlDB.ExecContext(ctx, `drop index if exists `+indexName) + require.NoError(t, err) + } + require.NoError(t, sqlDB.Close()) + + s, err := Open(ctx, dbPath) + require.NoError(t, err) + defer func() { _ = s.Close() }() + + version, err := s.schemaVersion(ctx) + require.NoError(t, err) + require.Equal(t, storeSchemaVersion, version) + require.Contains(t, indexNames(t, ctx, s.DB(), "messages"), "idx_messages_channel_created_id") + require.Contains(t, indexNames(t, ctx, s.DB(), "mention_events"), "idx_mentions_guild_event") +} 
+ +func indexNames(t *testing.T, ctx context.Context, db *sql.DB, table string) []string { + t.Helper() + + rows, err := db.QueryContext(ctx, `pragma index_list(`+quoteSQLString(table)+`)`) + require.NoError(t, err) + defer func() { _ = rows.Close() }() + + var out []string + for rows.Next() { + var seq int + var name string + var unique int + var origin string + var partial int + require.NoError(t, rows.Scan(&seq, &name, &unique, &origin, &partial)) + out = append(out, name) + } + require.NoError(t, rows.Err()) + return out +} + +func quoteSQLString(value string) string { + return "'" + value + "'" +} + func TestEventsSyncStateAndHelpers(t *testing.T) { t.Parallel() diff --git a/internal/syncer/records.go b/internal/syncer/records.go index 88df191..0eff966 100644 --- a/internal/syncer/records.go +++ b/internal/syncer/records.go @@ -5,9 +5,11 @@ import ( "strconv" "strings" "time" + "unicode" "github.com/bwmarrin/discordgo" "github.com/steipete/discrawl/internal/store" + "golang.org/x/text/unicode/norm" ) func toMemberRecord(guildID string, member *discordgo.Member) store.MemberRecord { @@ -72,7 +74,7 @@ func normalizeMessage(message *discordgo.Message) string { } func normalizeMessageParts(message *discordgo.Message, attachmentParts []string) string { - parts := []string{strings.TrimSpace(message.Content)} + parts := []string{message.Content} if len(attachmentParts) != 0 { parts = append(parts, attachmentParts...) 
} else { @@ -106,7 +108,7 @@ func normalizeMessageParts(message *discordgo.Message, attachmentParts []string) } filtered := make([]string, 0, len(parts)) for _, part := range parts { - part = strings.TrimSpace(part) + part = sanitizeNormalizedPart(part) if part != "" { filtered = append(filtered, part) } @@ -114,6 +116,38 @@ func normalizeMessageParts(message *discordgo.Message, attachmentParts []string) return strings.Join(filtered, "\n") } +func sanitizeNormalizedPart(raw string) string { + raw = strings.ToValidUTF8(raw, "") + raw = norm.NFKC.String(raw) + + var b strings.Builder + b.Grow(len(raw)) + spacePending := false + for _, r := range raw { + switch { + case isDroppedNormalizedRune(r): + continue + case unicode.IsSpace(r): + spacePending = b.Len() > 0 + default: + if spacePending { + b.WriteByte(' ') + spacePending = false + } + b.WriteRune(r) + } + } + return strings.TrimSpace(b.String()) +} + +func isDroppedNormalizedRune(r rune) bool { + switch r { + case '\u200b', '\u200c', '\u200d', '\ufeff': + return true + } + return unicode.IsControl(r) && !unicode.IsSpace(r) /* whitespace controls (\t, \n, \r) are not dropped; the caller collapses them into a single space */ +} + func displayName(member *discordgo.Member) string { if member == nil || member.User == nil { return "" diff --git a/internal/syncer/syncer_tail_test.go b/internal/syncer/syncer_tail_test.go index ee9659b..72a8a25 100644 --- a/internal/syncer/syncer_tail_test.go +++ b/internal/syncer/syncer_tail_test.go @@ -44,6 +44,26 @@ func TestNormalizeMessageIncludesRichFields(t *testing.T) { require.Contains(t, content, "answer") } +func TestNormalizeMessageSanitizesMalformedUnicodeAndWhitespace(t *testing.T) { + t.Parallel() + + message := &discordgo.Message{ + Content: string([]byte{'h', 'i', 0xff, ' ', 't', 'h', 'e', 'r', 'e'}) + "\u200b", + Attachments: []*discordgo.MessageAttachment{ + {Filename: "Foo\u200d.txt"}, + }, + Embeds: []*discordgo.MessageEmbed{ + {Title: " spaced\u00a0out ", Description: "line\u0000break"}, + }, + ReferencedMessage: &discordgo.Message{Content: "prior reply"}, + } + + content := 
normalizeMessage(message) + require.Equal(t, "hi there\nFoo.txt\nspaced out\nlinebreak\nreply:prior reply", content) + require.NotContains(t, content, "\u200b") + require.NotContains(t, content, "\u200d") +} + func TestTailHandlerWritesEvents(t *testing.T) { t.Parallel()