discrawl/internal/discorddesktop/import_pipeline_test.go
2026-05-05 01:22:48 +01:00

383 lines
16 KiB
Go

package discorddesktop
import (
"context"
"fmt"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/store"
)
// TestImportFastCacheSkipsUnroutedCacheDataUnlessFullCache seeds
// Cache/Cache_Data with a single entry that holds channel and message JSON but
// no Discord URL line. It expects a default import to count the entry as
// fast-skipped and store nothing, a repeated default import to report the
// entry as unchanged, and an import with FullCache set to scan the entry and
// store its message.
func TestImportFastCacheSkipsUnroutedCacheDataUnlessFullCache(t *testing.T) {
	const searchQuery = "unrouted historical"
	// The fixture starts with an empty line and contains no URL.
	const entryBody = `
{"id":"111111111111111121","guild_id":"999999999999999996","type":0,"name":"slow-cache"}
{"id":"333333333333333346","channel_id":"111111111111111121","content":"unrouted historical cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`

	ctx := context.Background()
	root := t.TempDir()
	cacheDir := filepath.Join(root, "Cache", "Cache_Data")
	require.NoError(t, os.MkdirAll(cacheDir, 0o755))
	require.NoError(t, os.WriteFile(filepath.Join(cacheDir, "entry_0"), []byte(entryBody), 0o600))

	defaultOpts := Options{Path: root}
	fullOpts := Options{Path: root, FullCache: true}
	search := store.SearchOptions{Query: searchQuery, Limit: 10}

	fastDB, err := store.Open(ctx, filepath.Join(root, "fast.db"))
	require.NoError(t, err)
	t.Cleanup(func() { _ = fastDB.Close() })

	// First default pass: the entry is skipped without being scanned.
	firstPass, err := Import(ctx, fastDB, defaultOpts)
	require.NoError(t, err)
	require.Equal(t, 0, firstPass.FilesScanned)
	require.Equal(t, 1, firstPass.CacheFilesFastSkipped)
	require.Equal(t, 0, firstPass.Messages)

	fastHits, err := fastDB.SearchMessages(ctx, search)
	require.NoError(t, err)
	require.Empty(t, fastHits)

	// Second default pass: the same entry is now reported as unchanged.
	secondPass, err := Import(ctx, fastDB, defaultOpts)
	require.NoError(t, err)
	require.Equal(t, 0, secondPass.FilesScanned)
	require.Equal(t, 0, secondPass.CacheFilesFastSkipped)
	require.Equal(t, 1, secondPass.FilesUnchanged)

	// Switching the same store to FullCache scans the entry after all.
	upgraded, err := Import(ctx, fastDB, fullOpts)
	require.NoError(t, err)
	require.Equal(t, 1, upgraded.FilesScanned)
	require.Equal(t, 1, upgraded.Messages)

	// A fresh store imported with FullCache from the start sees the message.
	fullDB, err := store.Open(ctx, filepath.Join(root, "full.db"))
	require.NoError(t, err)
	t.Cleanup(func() { _ = fullDB.Close() })

	fullPass, err := Import(ctx, fullDB, fullOpts)
	require.NoError(t, err)
	require.Equal(t, 1, fullPass.FilesScanned)
	require.Equal(t, 0, fullPass.CacheFilesFastSkipped)
	require.Equal(t, 1, fullPass.Messages)

	fullHits, err := fullDB.SearchMessages(ctx, search)
	require.NoError(t, err)
	require.Len(t, fullHits, 1)
	require.Equal(t, "slow-cache", fullHits[0].ChannelName)
}
// TestImportCheckpointsCacheBatches writes checkpointEveryFiles+1 cache
// entries, each carrying a channel URL and one message, and expects a default
// import to scan every file, store one message per file and record at least
// two checkpoints. A second import must scan nothing and report every file as
// unchanged.
func TestImportCheckpointsCacheBatches(t *testing.T) {
	const (
		channelID = "111111111111111121"
		// Typed as int64 so the 18-digit literal also compiles on platforms
		// where int is 32 bits wide; an untyped constant added to an int
		// would overflow there.
		firstMessageID int64 = 333333333333333346
	)

	ctx := context.Background()
	dir := t.TempDir()
	cachePath := filepath.Join(dir, "Cache", "Cache_Data")
	require.NoError(t, os.MkdirAll(cachePath, 0o755))

	// One file more than a checkpoint batch holds.
	fileCount := checkpointEveryFiles + 1
	for i := range fileCount {
		messageID := firstMessageID + int64(i)
		body := fmt.Appendf(nil, `https://discord.com/channels/999999999999999996/%s
{"id":"%d","channel_id":"%s","content":"checkpoint cache %d","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, channelID, messageID, channelID, i)
		require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), body, 0o600))
	}

	st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
	require.NoError(t, err)
	defer func() { _ = st.Close() }()

	stats, err := Import(ctx, st, Options{Path: dir})
	require.NoError(t, err)
	require.Equal(t, fileCount, stats.FilesScanned)
	require.Equal(t, fileCount, stats.Messages)
	require.GreaterOrEqual(t, stats.Checkpoints, 2)

	// Re-importing the untouched directory must not rescan anything.
	stats, err = Import(ctx, st, Options{Path: dir})
	require.NoError(t, err)
	require.Equal(t, 0, stats.FilesScanned)
	require.Equal(t, fileCount, stats.FilesUnchanged)
}
// TestImportUsesLaterCacheMetadataBeforeCheckpointingEarlierBatch places a
// message in the first cache entry and the channel record it needs in the
// entry numbered checkpointEveryFiles, with URL-only filler entries between
// them. It expects the message to be stored with the guild ID and channel
// name from the later entry, exactly one message_events row, and an unchanged
// result on re-import.
func TestImportUsesLaterCacheMetadataBeforeCheckpointingEarlierBatch(t *testing.T) {
	const (
		channelID = "111111111111111121"
		guildID   = "999999999999999996"
	)

	ctx := context.Background()
	root := t.TempDir()
	cacheDir := filepath.Join(root, "Cache", "Cache_Data")
	require.NoError(t, os.MkdirAll(cacheDir, 0o755))

	// writeEntry stores content as Cache_Data/entry_NNN.
	writeEntry := func(index int, content string) {
		name := fmt.Sprintf("entry_%03d", index)
		require.NoError(t, os.WriteFile(filepath.Join(cacheDir, name), []byte(content), 0o600))
	}

	// Entry 0: the message, with no channel record alongside it.
	writeEntry(0, fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"333333333333333346","channel_id":"%s","content":"needs later channel metadata","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, channelID, channelID))

	// Entries 1..checkpointEveryFiles-1: the route URL only.
	routeOnly := fmt.Sprintf(
		"https://discord.com/api/v9/channels/%s/messages?limit=50\n",
		channelID,
	)
	for index := 1; index < checkpointEveryFiles; index++ {
		writeEntry(index, routeOnly)
	}

	// Final entry: the channel record naming the guild and channel.
	writeEntry(checkpointEveryFiles, fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"%s","guild_id":"%s","type":0,"name":"later-metadata"}
`, channelID, channelID, guildID))

	db, err := store.Open(ctx, filepath.Join(root, "discrawl.db"))
	require.NoError(t, err)
	t.Cleanup(func() { _ = db.Close() })

	opts := Options{Path: root}
	initial, err := Import(ctx, db, opts)
	require.NoError(t, err)
	// NOTE(review): the extra checkpointEveryFiles presumably counts a rescan
	// of the first batch after the later metadata is seen; confirm against Import.
	require.Equal(t, checkpointEveryFiles+1+checkpointEveryFiles, initial.FilesScanned)
	require.Equal(t, 1, initial.Messages)
	require.GreaterOrEqual(t, initial.Checkpoints, 2)

	hits, err := db.SearchMessages(ctx, store.SearchOptions{Query: "needs later channel metadata", Limit: 10})
	require.NoError(t, err)
	require.Len(t, hits, 1)
	found := hits[0]
	require.Equal(t, guildID, found.GuildID)
	require.Equal(t, "later-metadata", found.ChannelName)
	requireMessageCount(t, ctx, db, "message_events", 1)

	// A second run must neither rescan files nor add another event row.
	repeat, err := Import(ctx, db, opts)
	require.NoError(t, err)
	require.Equal(t, 0, repeat.FilesScanned)
	require.Equal(t, checkpointEveryFiles+1, repeat.FilesUnchanged)
	requireMessageCount(t, ctx, db, "message_events", 1)
}
// TestImportCheckpointsPartiallyResolvedRetryBatch puts two messages for two
// different channels into the first cache entry, while only one of those
// channels gets a channel record, in the entry numbered checkpointEveryFiles.
// It expects one message stored, one skipped, a single message_events row,
// and an unchanged result on re-import.
func TestImportCheckpointsPartiallyResolvedRetryBatch(t *testing.T) {
	const (
		resolvedChannelID   = "111111111111111121"
		unresolvedChannelID = "111111111111111122"
		guildID             = "999999999999999996"
	)

	ctx := context.Background()
	root := t.TempDir()
	cacheDir := filepath.Join(root, "Cache", "Cache_Data")
	require.NoError(t, os.MkdirAll(cacheDir, 0o755))

	// writeEntry stores content as Cache_Data/entry_NNN.
	writeEntry := func(index int, content string) {
		name := fmt.Sprintf("entry_%03d", index)
		require.NoError(t, os.WriteFile(filepath.Join(cacheDir, name), []byte(content), 0o600))
	}

	// Entry 0: routes and messages for both channels.
	writeEntry(0, fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"333333333333333346","channel_id":"%s","content":"partially resolved retry message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
{"id":"333333333333333347","channel_id":"%s","content":"still unresolved retry message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222232","username":"alice"}}
`, resolvedChannelID, unresolvedChannelID, resolvedChannelID, unresolvedChannelID))

	// Entries 1..checkpointEveryFiles-1: the resolved channel's route URL only.
	routeOnly := fmt.Sprintf(
		"https://discord.com/api/v9/channels/%s/messages?limit=50\n",
		resolvedChannelID,
	)
	for index := 1; index < checkpointEveryFiles; index++ {
		writeEntry(index, routeOnly)
	}

	// Final entry: a channel record for the resolved channel only.
	writeEntry(checkpointEveryFiles, fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"%s","guild_id":"%s","type":0,"name":"partially-resolved"}
`, resolvedChannelID, resolvedChannelID, guildID))

	db, err := store.Open(ctx, filepath.Join(root, "discrawl.db"))
	require.NoError(t, err)
	t.Cleanup(func() { _ = db.Close() })

	opts := Options{Path: root}
	initial, err := Import(ctx, db, opts)
	require.NoError(t, err)
	// NOTE(review): the extra checkpointEveryFiles presumably counts a rescan
	// of the first batch during the retry; confirm against Import.
	require.Equal(t, checkpointEveryFiles+1+checkpointEveryFiles, initial.FilesScanned)
	require.Equal(t, 1, initial.Messages)
	require.Equal(t, 1, initial.SkippedMessages)
	require.GreaterOrEqual(t, initial.Checkpoints, 2)

	resolvedHits, err := db.SearchMessages(ctx, store.SearchOptions{Query: "partially resolved retry", Limit: 10})
	require.NoError(t, err)
	require.Len(t, resolvedHits, 1)
	require.Equal(t, "partially-resolved", resolvedHits[0].ChannelName)

	unresolvedHits, err := db.SearchMessages(ctx, store.SearchOptions{Query: "still unresolved retry", Limit: 10})
	require.NoError(t, err)
	require.Empty(t, unresolvedHits)
	requireMessageCount(t, ctx, db, "message_events", 1)

	// A second run must neither rescan files nor add another event row.
	repeat, err := Import(ctx, db, opts)
	require.NoError(t, err)
	require.Equal(t, 0, repeat.FilesScanned)
	require.Equal(t, checkpointEveryFiles+1, repeat.FilesUnchanged)
	requireMessageCount(t, ctx, db, "message_events", 1)
}
// TestImportCheckpointsUnresolvableRouteBearingCacheMisses writes one cache
// entry that has a channel messages URL and a message but no channel record
// anywhere. It expects the file to be scanned once, the message to be counted
// as skipped, one checkpoint to be recorded, and a re-import to report the
// file as unchanged.
func TestImportCheckpointsUnresolvableRouteBearingCacheMisses(t *testing.T) {
	const channelID = "111111111111111121"

	ctx := context.Background()
	root := t.TempDir()
	cacheDir := filepath.Join(root, "Cache", "Cache_Data")
	require.NoError(t, os.MkdirAll(cacheDir, 0o755))

	entry := fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"333333333333333346","channel_id":"%s","content":"permanent unresolved cache miss","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, channelID, channelID)
	require.NoError(t, os.WriteFile(filepath.Join(cacheDir, "entry_000"), []byte(entry), 0o600))

	db, err := store.Open(ctx, filepath.Join(root, "discrawl.db"))
	require.NoError(t, err)
	t.Cleanup(func() { _ = db.Close() })

	opts := Options{Path: root}
	initial, err := Import(ctx, db, opts)
	require.NoError(t, err)
	require.Equal(t, 1, initial.FilesScanned)
	require.Equal(t, 1, initial.SkippedMessages)
	require.Equal(t, 1, initial.Checkpoints)

	hits, err := db.SearchMessages(ctx, store.SearchOptions{Query: "permanent unresolved", Limit: 10})
	require.NoError(t, err)
	require.Empty(t, hits)

	// The skipped file must still count as processed on the next run.
	repeat, err := Import(ctx, db, opts)
	require.NoError(t, err)
	require.Equal(t, 0, repeat.FilesScanned)
	require.Equal(t, 1, repeat.FilesUnchanged)
}
// TestImportDoesNotAppendEventsForSkippedMixedBatch writes one cache entry
// with two messages: one for a channel named by a guild/channel URL and one
// for a channel that only appears in an API messages URL. It expects the
// first message to be searchable, the second to be absent, and the
// message_events table to stay empty across two imports.
func TestImportDoesNotAppendEventsForSkippedMixedBatch(t *testing.T) {
	const (
		guildID             = "999999999999999996"
		resolvedChannelID   = "111111111111111121"
		unresolvedChannelID = "111111111111111122"
	)

	ctx := context.Background()
	root := t.TempDir()
	cacheDir := filepath.Join(root, "Cache", "Cache_Data")
	require.NoError(t, os.MkdirAll(cacheDir, 0o755))

	entry := fmt.Sprintf(`https://discord.com/channels/%s/%s
https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"333333333333333346","channel_id":"%s","content":"mixed resolved message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
{"id":"333333333333333347","channel_id":"%s","content":"mixed unresolved message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222232","username":"alice"}}
`, guildID, resolvedChannelID, unresolvedChannelID, resolvedChannelID, unresolvedChannelID)
	require.NoError(t, os.WriteFile(filepath.Join(cacheDir, "entry_000"), []byte(entry), 0o600))

	db, err := store.Open(ctx, filepath.Join(root, "discrawl.db"))
	require.NoError(t, err)
	t.Cleanup(func() { _ = db.Close() })

	opts := Options{Path: root}
	initial, err := Import(ctx, db, opts)
	require.NoError(t, err)
	require.Equal(t, 1, initial.FilesScanned)
	require.Equal(t, 1, initial.Checkpoints)
	requireMessageCount(t, ctx, db, "message_events", 0)

	resolvedHits, err := db.SearchMessages(ctx, store.SearchOptions{Query: "mixed resolved", Limit: 10})
	require.NoError(t, err)
	require.Len(t, resolvedHits, 1)

	unresolvedHits, err := db.SearchMessages(ctx, store.SearchOptions{Query: "mixed unresolved", Limit: 10})
	require.NoError(t, err)
	require.Empty(t, unresolvedHits)

	// A second run must leave the file untouched and the event table empty.
	repeat, err := Import(ctx, db, opts)
	require.NoError(t, err)
	require.Equal(t, 0, repeat.FilesScanned)
	require.Equal(t, 1, repeat.FilesUnchanged)
	requireMessageCount(t, ctx, db, "message_events", 0)
}
// TestImportDoesNotDuplicateEventsWhenSwitchingFullCacheModes imports the same
// cache entry twice into one store, once with FullCache set and once without,
// in both orders. In each order the first import must scan the file and
// produce one message_events row, and the second must report the file as
// unchanged and leave that row count at one.
func TestImportDoesNotDuplicateEventsWhenSwitchingFullCacheModes(t *testing.T) {
	const (
		channelID = "111111111111111121"
		guildID   = "999999999999999996"
	)

	ctx := context.Background()
	root := t.TempDir()
	cacheDir := filepath.Join(root, "Cache", "Cache_Data")
	require.NoError(t, os.MkdirAll(cacheDir, 0o755))

	entry := fmt.Sprintf(`https://discord.com/channels/%s/%s
{"id":"%s","guild_id":"%s","type":0,"name":"mode-switch"}
{"id":"333333333333333346","channel_id":"%s","content":"mode switch event once","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, guildID, channelID, channelID, guildID, channelID)
	require.NoError(t, os.WriteFile(filepath.Join(cacheDir, "entry_000"), []byte(entry), 0o600))

	// Each case uses its own database file so the two orders stay independent.
	cases := []struct {
		name   string
		dbFile string
		first  Options
		second Options
	}{
		{
			name:   "full then default",
			dbFile: "full-first.db",
			first:  Options{Path: root, FullCache: true},
			second: Options{Path: root},
		},
		{
			name:   "default then full",
			dbFile: "default-first.db",
			first:  Options{Path: root},
			second: Options{Path: root, FullCache: true},
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			db, err := store.Open(ctx, filepath.Join(root, tc.dbFile))
			require.NoError(t, err)
			t.Cleanup(func() { _ = db.Close() })

			initial, err := Import(ctx, db, tc.first)
			require.NoError(t, err)
			require.Equal(t, 1, initial.FilesScanned)
			require.Equal(t, 1, initial.Messages)
			requireMessageCount(t, ctx, db, "message_events", 1)

			// Switching modes must not re-read the file or add an event.
			switched, err := Import(ctx, db, tc.second)
			require.NoError(t, err)
			require.Equal(t, 0, switched.FilesScanned)
			require.Equal(t, 1, switched.FilesUnchanged)
			requireMessageCount(t, ctx, db, "message_events", 1)
		})
	}
}
// TestImportFastCachePreservesKnownChannelMetadataAcrossBatches writes a
// type-11 channel record into Local Storage/leveldb and a message for that
// channel into Cache/Cache_Data. After a default import it expects the stored
// channel to keep the name "known-thread", the kind "thread_public", and a
// raw_json column that still contains `"type":11`.
func TestImportFastCachePreservesKnownChannelMetadataAcrossBatches(t *testing.T) {
	const (
		channelID = "111111111111111121"
		guildID   = "999999999999999996"
	)

	ctx := context.Background()
	root := t.TempDir()
	levelDBDir := filepath.Join(root, "Local Storage", "leveldb")
	cacheDir := filepath.Join(root, "Cache", "Cache_Data")
	for _, dir := range []string{levelDBDir, cacheDir} {
		require.NoError(t, os.MkdirAll(dir, 0o755))
	}

	// The channel record lives in the leveldb log.
	channelRecord := fmt.Sprintf(
		`{"id":"%s","guild_id":"%s","type":11,"name":"known-thread","thread_metadata":{"archived":false}}`,
		channelID,
		guildID,
	)
	require.NoError(t, os.WriteFile(filepath.Join(levelDBDir, "000001.log"), []byte(channelRecord), 0o600))

	// The cache entry carries only the route URL and the message.
	cacheEntry := fmt.Sprintf(`https://discord.com/channels/%s/%s
{"id":"333333333333333346","channel_id":"%s","content":"thread metadata cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, guildID, channelID, channelID)
	require.NoError(t, os.WriteFile(filepath.Join(cacheDir, "entry_0"), []byte(cacheEntry), 0o600))

	db, err := store.Open(ctx, filepath.Join(root, "discrawl.db"))
	require.NoError(t, err)
	t.Cleanup(func() { _ = db.Close() })

	result, err := Import(ctx, db, Options{Path: root})
	require.NoError(t, err)
	require.Equal(t, 1, result.Messages)

	stored, err := db.Channels(ctx, guildID)
	require.NoError(t, err)
	require.Len(t, stored, 1)
	thread := stored[0]
	require.Equal(t, "known-thread", thread.Name)
	require.Equal(t, "thread_public", thread.Kind)

	// The persisted raw JSON must still be the thread record.
	_, rawRows, err := db.ReadOnlyQuery(ctx, "select raw_json from channels where id = '111111111111111121'")
	require.NoError(t, err)
	require.Len(t, rawRows, 1)
	require.Contains(t, rawRows[0][0], `"type":11`)
}
// TestImportFastCacheRouteFiltersServiceWorkerCacheStorage writes a file with
// channel and message JSON but no URL under Service Worker/CacheStorage and
// expects a default import to count it as fast-skipped, scan nothing, and
// leave the message unsearchable.
func TestImportFastCacheRouteFiltersServiceWorkerCacheStorage(t *testing.T) {
	// The fixture starts with an empty line and contains no URL.
	const entryBody = `
{"id":"111111111111111121","guild_id":"999999999999999996","type":0,"name":"service-worker-cache"}
{"id":"333333333333333346","channel_id":"111111111111111121","content":"service worker historical cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`

	ctx := context.Background()
	root := t.TempDir()
	storageDir := filepath.Join(root, "Service Worker", "CacheStorage", "cache-id")
	require.NoError(t, os.MkdirAll(storageDir, 0o755))
	require.NoError(t, os.WriteFile(filepath.Join(storageDir, "unrouted"), []byte(entryBody), 0o600))

	db, err := store.Open(ctx, filepath.Join(root, "discrawl.db"))
	require.NoError(t, err)
	t.Cleanup(func() { _ = db.Close() })

	result, err := Import(ctx, db, Options{Path: root})
	require.NoError(t, err)
	require.Equal(t, 0, result.FilesScanned)
	require.Equal(t, 1, result.CacheFilesFastSkipped)

	hits, err := db.SearchMessages(ctx, store.SearchOptions{Query: "service worker historical", Limit: 10})
	require.NoError(t, err)
	require.Empty(t, hits)
}
// requireMessageCount runs `select count(*)` against table through
// st.ReadOnlyQuery and fails the test unless the result is exactly one row
// with one column whose value is expected in decimal form. The table name is
// spliced into the SQL text unescaped, so callers must pass only fixed names.
func requireMessageCount(t *testing.T, ctx context.Context, st *store.Store, table string, expected int) {
	t.Helper()

	query := "select count(*) from " + table
	_, result, err := st.ReadOnlyQuery(ctx, query)
	require.NoError(t, err)

	// require.Len aborts the test on mismatch, so the indexing below is safe.
	require.Len(t, result, 1)
	countRow := result[0]
	require.Len(t, countRow, 1)

	// The count is compared against its decimal text form.
	want := fmt.Sprintf("%d", expected)
	require.Equal(t, want, countRow[0])
}