perf: harden normalized content and add read-path indexes (#35)

* perf(store): add read-path indexes for messages and mentions

* fix(syncer): sanitize normalized message text before indexing

* docs: note query hardening improvements in changelog
This commit is contained in:
Vincent Koc 2026-04-21 21:45:36 -07:00 committed by GitHub
parent 9e2fd991ba
commit aa74be7b79
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
7 changed files with 174 additions and 3 deletions

View File

@ -5,6 +5,8 @@ All notable changes to `discrawl` will be documented in this file.
## 0.4.0 - Unreleased
- Git-backed snapshot imports are now much faster on large archives by using import-only SQLite pragmas and bulk-load FTS5 settings during search index rebuilds
- `messages` and `mentions` now use composite read-path indexes so larger archives spend less time sorting/filtering common guild, channel, and author queries
- Normalized message text is now sanitized before it reaches SQLite and FTS5, repairing malformed UTF-8 and stripping invisible/control-character noise that can poison search content
## 0.3.0 - 2026-04-21

1
go.mod
View File

@ -7,6 +7,7 @@ require (
github.com/gorilla/websocket v1.5.3
github.com/pelletier/go-toml/v2 v2.3.0
github.com/stretchr/testify v1.11.1
golang.org/x/text v0.35.0
modernc.org/sqlite v1.49.1
)

2
go.sum
View File

@ -39,6 +39,8 @@ golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=

View File

@ -18,7 +18,7 @@ const (
timeLayout = time.RFC3339Nano
messageFTSVersion = "2"
memberFTSVersion = "1"
storeSchemaVersion = 1
storeSchemaVersion = 2
)
type Store struct {
@ -199,6 +199,15 @@ func (s *Store) migrate(ctx context.Context) error {
if err := s.applyBaselineSchema(ctx); err != nil {
return err
}
if err := s.setSchemaVersion(ctx, 1); err != nil {
return err
}
currentVersion = 1
}
if currentVersion < 2 {
if err := s.applyQueryIndexMigration(ctx); err != nil {
return err
}
if err := s.setSchemaVersion(ctx, storeSchemaVersion); err != nil {
return err
}
@ -382,10 +391,15 @@ func (s *Store) applyBaselineSchema(ctx context.Context) error {
`create index if not exists idx_members_guild_id on members(guild_id);`,
`create index if not exists idx_messages_channel_id on messages(channel_id);`,
`create index if not exists idx_messages_guild_id on messages(guild_id);`,
`create index if not exists idx_messages_guild_created_id on messages(guild_id, created_at, id);`,
`create index if not exists idx_messages_channel_created_id on messages(channel_id, created_at, id);`,
`create index if not exists idx_messages_author_created_id on messages(author_id, created_at, id);`,
`create index if not exists idx_events_message_id on message_events(message_id);`,
`create index if not exists idx_attachments_message_id on message_attachments(message_id);`,
`create index if not exists idx_attachments_channel_id on message_attachments(channel_id);`,
`create index if not exists idx_mentions_message_id on mention_events(message_id);`,
`create index if not exists idx_mentions_guild_event on mention_events(guild_id, event_at, event_id);`,
`create index if not exists idx_mentions_channel_event on mention_events(channel_id, event_at, event_id);`,
`create index if not exists idx_mentions_target on mention_events(target_type, target_id, event_at);`,
`create index if not exists idx_mentions_author on mention_events(author_id, event_at);`,
}
@ -397,6 +411,27 @@ func (s *Store) applyBaselineSchema(ctx context.Context) error {
return tx.Commit()
}
// applyQueryIndexMigration creates the composite read-path indexes on the
// messages and mention_events tables inside a single transaction.
//
// Every statement is written as "create index if not exists", so indexes that
// are already present are left alone. The first failing statement aborts the
// migration and is returned wrapped with the "migrate query indexes" prefix;
// otherwise the result of committing the transaction is returned.
func (s *Store) applyQueryIndexMigration(ctx context.Context) error {
	indexDDL := [...]string{
		`create index if not exists idx_messages_guild_created_id on messages(guild_id, created_at, id);`,
		`create index if not exists idx_messages_channel_created_id on messages(channel_id, created_at, id);`,
		`create index if not exists idx_messages_author_created_id on messages(author_id, created_at, id);`,
		`create index if not exists idx_mentions_guild_event on mention_events(guild_id, event_at, event_id);`,
		`create index if not exists idx_mentions_channel_event on mention_events(channel_id, event_at, event_id);`,
	}

	tx, err := s.db.BeginTx(ctx, nil)
	if err != nil {
		return err
	}
	defer rollback(tx)

	for i := range indexDDL {
		_, execErr := tx.ExecContext(ctx, indexDDL[i])
		if execErr != nil {
			return fmt.Errorf("migrate query indexes: %w", execErr)
		}
	}
	return tx.Commit()
}
func (s *Store) ensureFTSRowIDs(ctx context.Context) error {
var version sql.NullString
err := s.db.QueryRowContext(ctx, `

View File

@ -427,6 +427,83 @@ func TestOpenTightensDBFilePerms(t *testing.T) {
require.Equal(t, os.FileMode(0o600), info.Mode().Perm())
}
// TestOpenCreatesQueryIndexes checks that opening a brand-new database leaves
// the composite read-path indexes in place on messages and mention_events.
func TestOpenCreatesQueryIndexes(t *testing.T) {
	t.Parallel()

	ctx := context.Background()
	dbPath := filepath.Join(t.TempDir(), "discrawl.db")
	s, err := Open(ctx, dbPath)
	require.NoError(t, err)
	defer func() { _ = s.Close() }()

	onMessages := indexNames(t, ctx, s.DB(), "messages")
	for _, want := range []string{
		"idx_messages_guild_created_id",
		"idx_messages_channel_created_id",
		"idx_messages_author_created_id",
	} {
		require.Contains(t, onMessages, want)
	}

	onMentions := indexNames(t, ctx, s.DB(), "mention_events")
	for _, want := range []string{
		"idx_mentions_guild_event",
		"idx_mentions_channel_event",
	} {
		require.Contains(t, onMentions, want)
	}
}
// TestOpenMigratesLegacyQueryIndexes builds a schema-version-1 database by
// hand, removes the composite query indexes from it, and then checks that
// Open brings the schema version up to storeSchemaVersion and that the
// indexes exist again afterwards.
func TestOpenMigratesLegacyQueryIndexes(t *testing.T) {
	t.Parallel()

	ctx := context.Background()
	dbPath := filepath.Join(t.TempDir(), "discrawl.db")

	// Prepare the legacy database directly, bypassing Open.
	rawDB, err := sql.Open("sqlite", dbPath)
	require.NoError(t, err)
	legacy := &Store{db: rawDB, path: dbPath}
	require.NoError(t, legacy.applyBaselineSchema(ctx))
	require.NoError(t, legacy.setSchemaVersion(ctx, 1))

	// Drop the query indexes so the database looks like one created before
	// they were introduced.
	dropped := []string{
		"idx_messages_guild_created_id",
		"idx_messages_channel_created_id",
		"idx_messages_author_created_id",
		"idx_mentions_guild_event",
		"idx_mentions_channel_event",
	}
	for i := range dropped {
		_, err = rawDB.ExecContext(ctx, `drop index if exists `+dropped[i])
		require.NoError(t, err)
	}
	require.NoError(t, rawDB.Close())

	// Reopen through the regular entry point, which runs the migration.
	s, err := Open(ctx, dbPath)
	require.NoError(t, err)
	defer func() { _ = s.Close() }()

	gotVersion, err := s.schemaVersion(ctx)
	require.NoError(t, err)
	require.Equal(t, storeSchemaVersion, gotVersion)

	onMessages := indexNames(t, ctx, s.DB(), "messages")
	require.Contains(t, onMessages, "idx_messages_channel_created_id")
	onMentions := indexNames(t, ctx, s.DB(), "mention_events")
	require.Contains(t, onMentions, "idx_mentions_guild_event")
}
// indexNames returns the index names that `pragma index_list` reports for
// table, in the order the rows come back. Any query, scan, or iteration error
// fails the test immediately.
func indexNames(t *testing.T, ctx context.Context, db *sql.DB, table string) []string {
	t.Helper()

	query := `pragma index_list(` + quoteSQLString(table) + `)`
	rows, err := db.QueryContext(ctx, query)
	require.NoError(t, err)
	defer func() { _ = rows.Close() }()

	var names []string
	for rows.Next() {
		// Each row is scanned into five columns; only the name is kept.
		var (
			name, origin         string
			seq, unique, partial int
		)
		require.NoError(t, rows.Scan(&seq, &name, &unique, &origin, &partial))
		names = append(names, name)
	}
	require.NoError(t, rows.Err())
	return names
}
// quoteSQLString renders value as a single-quoted SQL string literal.
//
// Embedded single quotes are doubled, which is how SQL escapes a quote inside
// a string literal, so a value containing "'" cannot terminate the literal
// early. The empty string yields "''".
func quoteSQLString(value string) string {
	quoted := make([]byte, 0, len(value)+2)
	quoted = append(quoted, '\'')
	for i := 0; i < len(value); i++ {
		if value[i] == '\'' {
			// Double the quote so it stays inside the literal.
			quoted = append(quoted, '\'')
		}
		quoted = append(quoted, value[i])
	}
	quoted = append(quoted, '\'')
	return string(quoted)
}
func TestEventsSyncStateAndHelpers(t *testing.T) {
t.Parallel()

View File

@ -5,9 +5,11 @@ import (
"strconv"
"strings"
"time"
"unicode"
"github.com/bwmarrin/discordgo"
"github.com/steipete/discrawl/internal/store"
"golang.org/x/text/unicode/norm"
)
func toMemberRecord(guildID string, member *discordgo.Member) store.MemberRecord {
@ -72,7 +74,7 @@ func normalizeMessage(message *discordgo.Message) string {
}
func normalizeMessageParts(message *discordgo.Message, attachmentParts []string) string {
parts := []string{strings.TrimSpace(message.Content)}
parts := []string{message.Content}
if len(attachmentParts) != 0 {
parts = append(parts, attachmentParts...)
} else {
@ -106,7 +108,7 @@ func normalizeMessageParts(message *discordgo.Message, attachmentParts []string)
}
filtered := make([]string, 0, len(parts))
for _, part := range parts {
part = strings.TrimSpace(part)
part = sanitizeNormalizedPart(part)
if part != "" {
filtered = append(filtered, part)
}
@ -114,6 +116,38 @@ func normalizeMessageParts(message *discordgo.Message, attachmentParts []string)
return strings.Join(filtered, "\n")
}
// sanitizeNormalizedPart cleans one fragment of normalized message text.
//
// It performs, in order:
//  1. removal of invalid UTF-8 byte sequences,
//  2. NFKC normalization,
//  3. a single pass that collapses every run of whitespace into one ASCII
//     space and discards the runes rejected by isDroppedNormalizedRune.
//
// The result never starts or ends with whitespace; a fragment that contains
// nothing but whitespace and dropped runes comes back as "".
func sanitizeNormalizedPart(raw string) string {
	raw = strings.ToValidUTF8(raw, "")
	raw = norm.NFKC.String(raw)

	var b strings.Builder
	b.Grow(len(raw))
	spacePending := false
	for _, r := range raw {
		switch {
		case unicode.IsSpace(r):
			// Whitespace must be recognized before the drop check: tab, line
			// feed, carriage return and friends are control characters too,
			// and dropping them would glue neighbouring words together
			// ("foo\nbar" -> "foobar"). Only remember a separator once some
			// text has been written, so leading whitespace disappears.
			spacePending = b.Len() > 0
		case isDroppedNormalizedRune(r):
			// Discarded without affecting a pending separator.
			continue
		default:
			if spacePending {
				b.WriteByte(' ')
				spacePending = false
			}
			b.WriteRune(r)
		}
	}
	// A pending separator is never flushed at the end, so the builder cannot
	// carry leading or trailing whitespace and no final trim is required.
	return b.String()
}
// isDroppedNormalizedRune reports whether r should be removed outright from
// normalized message text.
//
// Dropped runes are the zero-width space (U+200B), zero-width non-joiner
// (U+200C), zero-width joiner (U+200D), the byte-order mark (U+FEFF), and
// control characters that are not whitespace. Whitespace control characters
// such as tab, line feed, and carriage return are deliberately NOT reported:
// they separate words and must be treated as spacing by the caller rather
// than deleted.
func isDroppedNormalizedRune(r rune) bool {
	switch r {
	case '\u200b', '\u200c', '\u200d', '\ufeff':
		return true
	}
	return unicode.IsControl(r) && !unicode.IsSpace(r)
}
func displayName(member *discordgo.Member) string {
if member == nil || member.User == nil {
return ""

View File

@ -44,6 +44,26 @@ func TestNormalizeMessageIncludesRichFields(t *testing.T) {
require.Contains(t, content, "answer")
}
// TestNormalizeMessageSanitizesMalformedUnicodeAndWhitespace feeds
// normalizeMessage a message whose fields contain an invalid UTF-8 byte,
// zero-width characters, fullwidth letters, a no-break space, surrounding
// whitespace, and a NUL, and checks the exact sanitized output.
func TestNormalizeMessageSanitizesMalformedUnicodeAndWhitespace(t *testing.T) {
	t.Parallel()

	message := &discordgo.Message{
		// 0xff is not valid UTF-8; the trailing U+200B is a zero-width space.
		Content: string([]byte{'h', 'i', 0xff, ' ', 't', 'h', 'e', 'r', 'e'}) + "\u200b",
		Attachments: []*discordgo.MessageAttachment{
			// Fullwidth "Foo" (U+FF26 U+FF4F U+FF4F) followed by a zero-width
			// joiner. Written as escapes so the fixture survives any tooling
			// that mangles non-ASCII text; the expected output below requires
			// the name to come out as plain "Foo.txt".
			{Filename: "\uff26\uff4f\uff4f\u200d.txt"},
		},
		Embeds: []*discordgo.MessageEmbed{
			{Title: " spaced\u00a0out ", Description: "line\u0000break"},
		},
		ReferencedMessage: &discordgo.Message{Content: "prior reply"},
	}

	content := normalizeMessage(message)
	require.Equal(t, "hi there\nFoo.txt\nspaced out\nlinebreak\nreply:prior reply", content)
	require.NotContains(t, content, "\u200b")
	require.NotContains(t, content, "\u200d")
}
func TestTailHandlerWritesEvents(t *testing.T) {
t.Parallel()