perf: harden normalized content and add read-path indexes (#35)
* perf(store): add read-path indexes for messages and mentions
* fix(syncer): sanitize normalized message text before indexing
* docs: note query hardening improvements in changelog
This commit is contained in:
parent
9e2fd991ba
commit
aa74be7b79
@ -5,6 +5,8 @@ All notable changes to `discrawl` will be documented in this file.
|
||||
## 0.4.0 - Unreleased
|
||||
|
||||
- Git-backed snapshot imports are now much faster on large archives by using import-only SQLite pragmas and bulk-load FTS5 settings during search index rebuilds
|
||||
- `messages` and `mentions` now use composite read-path indexes so larger archives spend less time sorting/filtering common guild, channel, and author queries
|
||||
- Normalized message text is now sanitized before it reaches SQLite and FTS5, repairing malformed UTF-8 and stripping invisible/control-character noise that can poison search content
|
||||
|
||||
## 0.3.0 - 2026-04-21
|
||||
|
||||
|
||||
1
go.mod
1
go.mod
@ -7,6 +7,7 @@ require (
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
github.com/pelletier/go-toml/v2 v2.3.0
|
||||
github.com/stretchr/testify v1.11.1
|
||||
golang.org/x/text v0.35.0
|
||||
modernc.org/sqlite v1.49.1
|
||||
)
|
||||
|
||||
|
||||
2
go.sum
2
go.sum
@ -39,6 +39,8 @@ golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo=
|
||||
golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ=
|
||||
golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8=
|
||||
golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA=
|
||||
golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ=
|
||||
golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k=
|
||||
golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0=
|
||||
|
||||
@ -18,7 +18,7 @@ const (
|
||||
timeLayout = time.RFC3339Nano
|
||||
messageFTSVersion = "2"
|
||||
memberFTSVersion = "1"
|
||||
storeSchemaVersion = 1
|
||||
storeSchemaVersion = 2
|
||||
)
|
||||
|
||||
type Store struct {
|
||||
@ -199,6 +199,15 @@ func (s *Store) migrate(ctx context.Context) error {
|
||||
if err := s.applyBaselineSchema(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.setSchemaVersion(ctx, 1); err != nil {
|
||||
return err
|
||||
}
|
||||
currentVersion = 1
|
||||
}
|
||||
if currentVersion < 2 {
|
||||
if err := s.applyQueryIndexMigration(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := s.setSchemaVersion(ctx, storeSchemaVersion); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -382,10 +391,15 @@ func (s *Store) applyBaselineSchema(ctx context.Context) error {
|
||||
`create index if not exists idx_members_guild_id on members(guild_id);`,
|
||||
`create index if not exists idx_messages_channel_id on messages(channel_id);`,
|
||||
`create index if not exists idx_messages_guild_id on messages(guild_id);`,
|
||||
`create index if not exists idx_messages_guild_created_id on messages(guild_id, created_at, id);`,
|
||||
`create index if not exists idx_messages_channel_created_id on messages(channel_id, created_at, id);`,
|
||||
`create index if not exists idx_messages_author_created_id on messages(author_id, created_at, id);`,
|
||||
`create index if not exists idx_events_message_id on message_events(message_id);`,
|
||||
`create index if not exists idx_attachments_message_id on message_attachments(message_id);`,
|
||||
`create index if not exists idx_attachments_channel_id on message_attachments(channel_id);`,
|
||||
`create index if not exists idx_mentions_message_id on mention_events(message_id);`,
|
||||
`create index if not exists idx_mentions_guild_event on mention_events(guild_id, event_at, event_id);`,
|
||||
`create index if not exists idx_mentions_channel_event on mention_events(channel_id, event_at, event_id);`,
|
||||
`create index if not exists idx_mentions_target on mention_events(target_type, target_id, event_at);`,
|
||||
`create index if not exists idx_mentions_author on mention_events(author_id, event_at);`,
|
||||
}
|
||||
@ -397,6 +411,27 @@ func (s *Store) applyBaselineSchema(ctx context.Context) error {
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
func (s *Store) applyQueryIndexMigration(ctx context.Context) error {
|
||||
tx, err := s.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer rollback(tx)
|
||||
stmts := []string{
|
||||
`create index if not exists idx_messages_guild_created_id on messages(guild_id, created_at, id);`,
|
||||
`create index if not exists idx_messages_channel_created_id on messages(channel_id, created_at, id);`,
|
||||
`create index if not exists idx_messages_author_created_id on messages(author_id, created_at, id);`,
|
||||
`create index if not exists idx_mentions_guild_event on mention_events(guild_id, event_at, event_id);`,
|
||||
`create index if not exists idx_mentions_channel_event on mention_events(channel_id, event_at, event_id);`,
|
||||
}
|
||||
for _, stmt := range stmts {
|
||||
if _, err := tx.ExecContext(ctx, stmt); err != nil {
|
||||
return fmt.Errorf("migrate query indexes: %w", err)
|
||||
}
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
func (s *Store) ensureFTSRowIDs(ctx context.Context) error {
|
||||
var version sql.NullString
|
||||
err := s.db.QueryRowContext(ctx, `
|
||||
|
||||
@ -427,6 +427,83 @@ func TestOpenTightensDBFilePerms(t *testing.T) {
|
||||
require.Equal(t, os.FileMode(0o600), info.Mode().Perm())
|
||||
}
|
||||
|
||||
func TestOpenCreatesQueryIndexes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ctx := context.Background()
|
||||
s, err := Open(ctx, filepath.Join(t.TempDir(), "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = s.Close() }()
|
||||
|
||||
messageIndexes := indexNames(t, ctx, s.DB(), "messages")
|
||||
require.Contains(t, messageIndexes, "idx_messages_guild_created_id")
|
||||
require.Contains(t, messageIndexes, "idx_messages_channel_created_id")
|
||||
require.Contains(t, messageIndexes, "idx_messages_author_created_id")
|
||||
|
||||
mentionIndexes := indexNames(t, ctx, s.DB(), "mention_events")
|
||||
require.Contains(t, mentionIndexes, "idx_mentions_guild_event")
|
||||
require.Contains(t, mentionIndexes, "idx_mentions_channel_event")
|
||||
}
|
||||
|
||||
func TestOpenMigratesLegacyQueryIndexes(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
ctx := context.Background()
|
||||
dbPath := filepath.Join(t.TempDir(), "discrawl.db")
|
||||
|
||||
sqlDB, err := sql.Open("sqlite", dbPath)
|
||||
require.NoError(t, err)
|
||||
legacy := &Store{db: sqlDB, path: dbPath}
|
||||
require.NoError(t, legacy.applyBaselineSchema(ctx))
|
||||
require.NoError(t, legacy.setSchemaVersion(ctx, 1))
|
||||
for _, indexName := range []string{
|
||||
"idx_messages_guild_created_id",
|
||||
"idx_messages_channel_created_id",
|
||||
"idx_messages_author_created_id",
|
||||
"idx_mentions_guild_event",
|
||||
"idx_mentions_channel_event",
|
||||
} {
|
||||
_, err = sqlDB.ExecContext(ctx, `drop index if exists `+indexName)
|
||||
require.NoError(t, err)
|
||||
}
|
||||
require.NoError(t, sqlDB.Close())
|
||||
|
||||
s, err := Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = s.Close() }()
|
||||
|
||||
version, err := s.schemaVersion(ctx)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, storeSchemaVersion, version)
|
||||
require.Contains(t, indexNames(t, ctx, s.DB(), "messages"), "idx_messages_channel_created_id")
|
||||
require.Contains(t, indexNames(t, ctx, s.DB(), "mention_events"), "idx_mentions_guild_event")
|
||||
}
|
||||
|
||||
func indexNames(t *testing.T, ctx context.Context, db *sql.DB, table string) []string {
|
||||
t.Helper()
|
||||
|
||||
rows, err := db.QueryContext(ctx, `pragma index_list(`+quoteSQLString(table)+`)`)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = rows.Close() }()
|
||||
|
||||
var out []string
|
||||
for rows.Next() {
|
||||
var seq int
|
||||
var name string
|
||||
var unique int
|
||||
var origin string
|
||||
var partial int
|
||||
require.NoError(t, rows.Scan(&seq, &name, &unique, &origin, &partial))
|
||||
out = append(out, name)
|
||||
}
|
||||
require.NoError(t, rows.Err())
|
||||
return out
|
||||
}
|
||||
|
||||
// quoteSQLString renders value as a single-quoted SQL string literal.
//
// Embedded single quotes are doubled (' becomes ''), which is how SQL escapes
// a quote inside a string literal. Without that, a value containing a quote
// would end the literal early and produce malformed SQL.
func quoteSQLString(value string) string {
	// Two bytes for the surrounding quotes; append grows the slice if any
	// embedded quotes need doubling.
	quoted := make([]byte, 0, len(value)+2)
	quoted = append(quoted, '\'')
	for i := 0; i < len(value); i++ {
		// Byte-wise scanning is safe for UTF-8 input: the byte 0x27 never
		// occurs inside a multi-byte sequence.
		if value[i] == '\'' {
			quoted = append(quoted, '\'')
		}
		quoted = append(quoted, value[i])
	}
	quoted = append(quoted, '\'')
	return string(quoted)
}
|
||||
|
||||
func TestEventsSyncStateAndHelpers(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
@ -5,9 +5,11 @@ import (
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
|
||||
"github.com/bwmarrin/discordgo"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
"golang.org/x/text/unicode/norm"
|
||||
)
|
||||
|
||||
func toMemberRecord(guildID string, member *discordgo.Member) store.MemberRecord {
|
||||
@ -72,7 +74,7 @@ func normalizeMessage(message *discordgo.Message) string {
|
||||
}
|
||||
|
||||
func normalizeMessageParts(message *discordgo.Message, attachmentParts []string) string {
|
||||
parts := []string{strings.TrimSpace(message.Content)}
|
||||
parts := []string{message.Content}
|
||||
if len(attachmentParts) != 0 {
|
||||
parts = append(parts, attachmentParts...)
|
||||
} else {
|
||||
@ -106,7 +108,7 @@ func normalizeMessageParts(message *discordgo.Message, attachmentParts []string)
|
||||
}
|
||||
filtered := make([]string, 0, len(parts))
|
||||
for _, part := range parts {
|
||||
part = strings.TrimSpace(part)
|
||||
part = sanitizeNormalizedPart(part)
|
||||
if part != "" {
|
||||
filtered = append(filtered, part)
|
||||
}
|
||||
@ -114,6 +116,38 @@ func normalizeMessageParts(message *discordgo.Message, attachmentParts []string)
|
||||
return strings.Join(filtered, "\n")
|
||||
}
|
||||
|
||||
func sanitizeNormalizedPart(raw string) string {
|
||||
raw = strings.ToValidUTF8(raw, "")
|
||||
raw = norm.NFKC.String(raw)
|
||||
|
||||
var b strings.Builder
|
||||
b.Grow(len(raw))
|
||||
spacePending := false
|
||||
for _, r := range raw {
|
||||
switch {
|
||||
case isDroppedNormalizedRune(r):
|
||||
continue
|
||||
case unicode.IsSpace(r):
|
||||
spacePending = b.Len() > 0
|
||||
default:
|
||||
if spacePending {
|
||||
b.WriteByte(' ')
|
||||
spacePending = false
|
||||
}
|
||||
b.WriteRune(r)
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
// isDroppedNormalizedRune reports whether r should be removed from normalized
// message text. That covers every control character (per unicode.IsControl)
// plus four invisible code points: U+200B zero width space, U+200C zero width
// non-joiner, U+200D zero width joiner, and U+FEFF (byte order mark / zero
// width no-break space).
func isDroppedNormalizedRune(r rune) bool {
	if unicode.IsControl(r) {
		return true
	}
	return r == '\u200b' || r == '\u200c' || r == '\u200d' || r == '\ufeff'
}
|
||||
|
||||
func displayName(member *discordgo.Member) string {
|
||||
if member == nil || member.User == nil {
|
||||
return ""
|
||||
|
||||
@ -44,6 +44,26 @@ func TestNormalizeMessageIncludesRichFields(t *testing.T) {
|
||||
require.Contains(t, content, "answer")
|
||||
}
|
||||
|
||||
func TestNormalizeMessageSanitizesMalformedUnicodeAndWhitespace(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
message := &discordgo.Message{
|
||||
Content: string([]byte{'h', 'i', 0xff, ' ', 't', 'h', 'e', 'r', 'e'}) + "\u200b",
|
||||
Attachments: []*discordgo.MessageAttachment{
|
||||
{Filename: "Foo\u200d.txt"},
|
||||
},
|
||||
Embeds: []*discordgo.MessageEmbed{
|
||||
{Title: " spaced\u00a0out ", Description: "line\u0000break"},
|
||||
},
|
||||
ReferencedMessage: &discordgo.Message{Content: "prior reply"},
|
||||
}
|
||||
|
||||
content := normalizeMessage(message)
|
||||
require.Equal(t, "hi there\nFoo.txt\nspaced out\nlinebreak\nreply:prior reply", content)
|
||||
require.NotContains(t, content, "\u200b")
|
||||
require.NotContains(t, content, "\u200d")
|
||||
}
|
||||
|
||||
func TestTailHandlerWritesEvents(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user