fix: stabilize wiretap cache checkpointing
This commit is contained in:
parent
68b49c90a5
commit
78fcca8204
@ -2,6 +2,12 @@
|
|||||||
|
|
||||||
All notable changes to `discrawl` will be documented in this file.
|
All notable changes to `discrawl` will be documented in this file.
|
||||||
|
|
||||||
|
## Unreleased
|
||||||
|
|
||||||
|
### Fixes
|
||||||
|
|
||||||
|
- `wiretap` now uses a fast default path for Discord Chromium cache imports: it scans cheap context files plus route-bearing HTTP cache entries, checkpoints file progress in batches, and leaves exhaustive historical cache archaeology behind `--full-cache` / `desktop.full_cache`.
|
||||||
|
|
||||||
## 0.6.5 - 2026-05-03
|
## 0.6.5 - 2026-05-03
|
||||||
|
|
||||||
### Fixes
|
### Fixes
|
||||||
|
|||||||
@ -253,6 +253,7 @@ This is the path for searchable DMs because bot tokens cannot read personal dire
|
|||||||
discrawl wiretap
|
discrawl wiretap
|
||||||
discrawl wiretap --path "$HOME/Library/Application Support/discord"
|
discrawl wiretap --path "$HOME/Library/Application Support/discord"
|
||||||
discrawl wiretap --dry-run
|
discrawl wiretap --dry-run
|
||||||
|
discrawl wiretap --full-cache
|
||||||
discrawl wiretap --watch-every 2m
|
discrawl wiretap --watch-every 2m
|
||||||
```
|
```
|
||||||
|
|
||||||
@ -264,7 +265,8 @@ Notes:
|
|||||||
- preserves existing local `@me` guilds, channels, messages, and attachments when importing a Git snapshot, so a shared guild mirror refresh does not wipe local wiretap DM search
|
- preserves existing local `@me` guilds, channels, messages, and attachments when importing a Git snapshot, so a shared guild mirror refresh does not wipe local wiretap DM search
|
||||||
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs; dropped rows are counted as `skipped_messages`
|
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs; dropped rows are counted as `skipped_messages`
|
||||||
- imports what Discord Desktop has cached locally, not complete live DM history
|
- imports what Discord Desktop has cached locally, not complete live DM history
|
||||||
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON
|
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON, plus route-bearing Chromium HTTP cache entries by default
|
||||||
|
- use `--full-cache` or `desktop.full_cache = true` for exhaustive Chromium cache import when you want slower historical guild-cache archaeology
|
||||||
- does not extract, store, or print Discord auth tokens
|
- does not extract, store, or print Discord auth tokens
|
||||||
- `--max-file-bytes` skips unusually large files; default is 64 MiB
|
- `--max-file-bytes` skips unusually large files; default is 64 MiB
|
||||||
|
|
||||||
@ -573,6 +575,7 @@ attachment_text = true
|
|||||||
[desktop]
|
[desktop]
|
||||||
path = "~/.config/discord" # macOS default: "~/Library/Application Support/discord"
|
path = "~/.config/discord" # macOS default: "~/Library/Application Support/discord"
|
||||||
max_file_bytes = 67108864
|
max_file_bytes = 67108864
|
||||||
|
full_cache = false
|
||||||
|
|
||||||
[search]
|
[search]
|
||||||
default_mode = "fts"
|
default_mode = "fts"
|
||||||
|
|||||||
2
SPEC.md
2
SPEC.md
@ -465,12 +465,14 @@ Expected flags:
|
|||||||
- `--dry-run`
|
- `--dry-run`
|
||||||
- `--watch-every <duration>`
|
- `--watch-every <duration>`
|
||||||
- `--max-file-bytes <bytes>`
|
- `--max-file-bytes <bytes>`
|
||||||
|
- `--full-cache`
|
||||||
|
|
||||||
Requirements:
|
Requirements:
|
||||||
|
|
||||||
- never use Discord user tokens
|
- never use Discord user tokens
|
||||||
- never extract or persist auth tokens from desktop cache
|
- never extract or persist auth tokens from desktop cache
|
||||||
- scan bounded local files only
|
- scan bounded local files only
|
||||||
|
- default to route-bearing HTTP cache entries; exhaustive Chromium cache scans require explicit full-cache mode
|
||||||
- store sanitized raw metadata, not full arbitrary cache blobs
|
- store sanitized raw metadata, not full arbitrary cache blobs
|
||||||
|
|
||||||
### `search`
|
### `search`
|
||||||
|
|||||||
@ -181,6 +181,7 @@ func (r *runtime) runSyncLocked(sources syncSources, opts syncer.SyncOptions) er
|
|||||||
stats, err := discorddesktop.Import(r.ctx, r.store, discorddesktop.Options{
|
stats, err := discorddesktop.Import(r.ctx, r.store, discorddesktop.Options{
|
||||||
Path: r.cfg.Desktop.Path,
|
Path: r.cfg.Desktop.Path,
|
||||||
MaxFileBytes: r.cfg.Desktop.MaxFileBytes,
|
MaxFileBytes: r.cfg.Desktop.MaxFileBytes,
|
||||||
|
FullCache: r.cfg.Desktop.FullCache,
|
||||||
Now: r.now,
|
Now: r.now,
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@ -262,6 +263,7 @@ func (r *runtime) runWiretap(args []string) error {
|
|||||||
fs.SetOutput(io.Discard)
|
fs.SetOutput(io.Discard)
|
||||||
path := fs.String("path", r.cfg.Desktop.Path, "")
|
path := fs.String("path", r.cfg.Desktop.Path, "")
|
||||||
maxFileBytes := fs.Int64("max-file-bytes", r.cfg.Desktop.MaxFileBytes, "")
|
maxFileBytes := fs.Int64("max-file-bytes", r.cfg.Desktop.MaxFileBytes, "")
|
||||||
|
fullCache := fs.Bool("full-cache", r.cfg.Desktop.FullCache, "")
|
||||||
dryRun := fs.Bool("dry-run", false, "")
|
dryRun := fs.Bool("dry-run", false, "")
|
||||||
watchEvery := fs.Duration("watch-every", 0, "")
|
watchEvery := fs.Duration("watch-every", 0, "")
|
||||||
if err := fs.Parse(args); err != nil {
|
if err := fs.Parse(args); err != nil {
|
||||||
@ -277,6 +279,7 @@ func (r *runtime) runWiretap(args []string) error {
|
|||||||
stats, err := discorddesktop.Import(ctx, r.store, discorddesktop.Options{
|
stats, err := discorddesktop.Import(ctx, r.store, discorddesktop.Options{
|
||||||
Path: *path,
|
Path: *path,
|
||||||
MaxFileBytes: *maxFileBytes,
|
MaxFileBytes: *maxFileBytes,
|
||||||
|
FullCache: *fullCache,
|
||||||
DryRun: *dryRun,
|
DryRun: *dryRun,
|
||||||
Now: r.now,
|
Now: r.now,
|
||||||
})
|
})
|
||||||
|
|||||||
@ -142,8 +142,8 @@ func printHuman(w io.Writer, value any) error {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
if v.Wiretap != nil {
|
if v.Wiretap != nil {
|
||||||
if _, err := fmt.Fprintf(w, "wiretap_files=%d\nwiretap_unchanged=%d\nwiretap_messages=%d\nwiretap_dm_messages=%d\nwiretap_dm_channels=%d\nwiretap_guild_messages=%d\nwiretap_skipped_messages=%d\nwiretap_skipped_channels=%d\n",
|
if _, err := fmt.Fprintf(w, "wiretap_visited=%d\nwiretap_files=%d\nwiretap_unchanged=%d\nwiretap_fast_skipped=%d\nwiretap_messages=%d\nwiretap_dm_messages=%d\nwiretap_dm_channels=%d\nwiretap_guild_messages=%d\nwiretap_skipped_messages=%d\nwiretap_skipped_channels=%d\nwiretap_checkpoints=%d\n",
|
||||||
v.Wiretap.FilesScanned, v.Wiretap.FilesUnchanged, v.Wiretap.Messages, v.Wiretap.DMMessages, v.Wiretap.DMChannels, v.Wiretap.GuildMessages, v.Wiretap.SkippedMessages, v.Wiretap.SkippedChannels); err != nil {
|
v.Wiretap.FilesVisited, v.Wiretap.FilesScanned, v.Wiretap.FilesUnchanged, v.Wiretap.CacheFilesFastSkipped, v.Wiretap.Messages, v.Wiretap.DMMessages, v.Wiretap.DMChannels, v.Wiretap.GuildMessages, v.Wiretap.SkippedMessages, v.Wiretap.SkippedChannels, v.Wiretap.Checkpoints); err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -152,8 +152,8 @@ func printHuman(w io.Writer, value any) error {
|
|||||||
_, err := fmt.Fprintf(w, "guilds=%d channels=%d threads=%d members=%d messages=%d\n", v.Guilds, v.Channels, v.Threads, v.Members, v.Messages)
|
_, err := fmt.Fprintf(w, "guilds=%d channels=%d threads=%d members=%d messages=%d\n", v.Guilds, v.Channels, v.Threads, v.Members, v.Messages)
|
||||||
return err
|
return err
|
||||||
case discorddesktop.Stats:
|
case discorddesktop.Stats:
|
||||||
_, err := fmt.Fprintf(w, "path=%s\nfiles=%d\nskipped=%d\nunchanged=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ndry_run=%t\n",
|
_, err := fmt.Fprintf(w, "path=%s\nvisited=%d\nfiles=%d\nskipped=%d\nunchanged=%d\nfast_skipped=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ncheckpoints=%d\nfull_cache=%t\ndry_run=%t\n",
|
||||||
v.Path, v.FilesScanned, v.FilesSkipped, v.FilesUnchanged, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.DryRun)
|
v.Path, v.FilesVisited, v.FilesScanned, v.FilesSkipped, v.FilesUnchanged, v.CacheFilesFastSkipped, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.Checkpoints, v.FullCache, v.DryRun)
|
||||||
return err
|
return err
|
||||||
case store.Status:
|
case store.Status:
|
||||||
_, err := fmt.Fprintf(w, "db=%s\nguilds=%d\nchannels=%d\nthreads=%d\nmessages=%d\nmembers=%d\nembedding_backlog=%d\nlast_sync=%s\nlast_tail_event=%s\n",
|
_, err := fmt.Fprintf(w, "db=%s\nguilds=%d\nchannels=%d\nthreads=%d\nmessages=%d\nmembers=%d\nembedding_backlog=%d\nlast_sync=%s\nlast_tail_event=%s\n",
|
||||||
|
|||||||
@ -44,6 +44,7 @@ type DiscordConfig struct {
|
|||||||
type DesktopConfig struct {
|
type DesktopConfig struct {
|
||||||
Path string `toml:"path"`
|
Path string `toml:"path"`
|
||||||
MaxFileBytes int64 `toml:"max_file_bytes"`
|
MaxFileBytes int64 `toml:"max_file_bytes"`
|
||||||
|
FullCache bool `toml:"full_cache"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type SyncConfig struct {
|
type SyncConfig struct {
|
||||||
|
|||||||
@ -27,35 +27,43 @@ const (
|
|||||||
DirectMessageGuildName = "Discord Direct Messages"
|
DirectMessageGuildName = "Discord Direct Messages"
|
||||||
defaultMaxFileBytes = 64 << 20
|
defaultMaxFileBytes = 64 << 20
|
||||||
maxObjectBytes = 4 << 20
|
maxObjectBytes = 4 << 20
|
||||||
|
cacheSniffBytes = 1 << 20
|
||||||
|
checkpointEveryFiles = 256
|
||||||
)
|
)
|
||||||
|
|
||||||
var channelRouteRE = regexp.MustCompile(`/channels/(@me|[0-9]{12,24})/([0-9]{12,24})`)
|
var channelRouteRE = regexp.MustCompile(`/channels/(@me|[0-9]{12,24})/([0-9]{12,24})`)
|
||||||
|
var apiMessagesRouteRE = regexp.MustCompile(`/api/v[0-9]+/channels/[0-9]{12,24}/messages`)
|
||||||
|
|
||||||
type Options struct {
|
type Options struct {
|
||||||
Path string
|
Path string
|
||||||
MaxFileBytes int64
|
MaxFileBytes int64
|
||||||
DryRun bool
|
DryRun bool
|
||||||
|
FullCache bool
|
||||||
Now func() time.Time
|
Now func() time.Time
|
||||||
}
|
}
|
||||||
|
|
||||||
type Stats struct {
|
type Stats struct {
|
||||||
Path string `json:"path"`
|
Path string `json:"path"`
|
||||||
FilesScanned int `json:"files_scanned"`
|
FilesVisited int `json:"files_visited"`
|
||||||
FilesSkipped int `json:"files_skipped"`
|
FilesScanned int `json:"files_scanned"`
|
||||||
FilesUnchanged int `json:"files_unchanged"`
|
FilesSkipped int `json:"files_skipped"`
|
||||||
BytesScanned int64 `json:"bytes_scanned"`
|
FilesUnchanged int `json:"files_unchanged"`
|
||||||
JSONObjects int `json:"json_objects"`
|
CacheFilesFastSkipped int `json:"cache_files_fast_skipped"`
|
||||||
Guilds int `json:"guilds"`
|
BytesScanned int64 `json:"bytes_scanned"`
|
||||||
Channels int `json:"channels"`
|
JSONObjects int `json:"json_objects"`
|
||||||
Messages int `json:"messages"`
|
Guilds int `json:"guilds"`
|
||||||
DMMessages int `json:"dm_messages"`
|
Channels int `json:"channels"`
|
||||||
DMChannels int `json:"dm_channels"`
|
Messages int `json:"messages"`
|
||||||
GuildMessages int `json:"guild_messages"`
|
DMMessages int `json:"dm_messages"`
|
||||||
SkippedMessages int `json:"skipped_messages"`
|
DMChannels int `json:"dm_channels"`
|
||||||
SkippedChannels int `json:"skipped_channels"`
|
GuildMessages int `json:"guild_messages"`
|
||||||
DryRun bool `json:"dry_run,omitempty"`
|
SkippedMessages int `json:"skipped_messages"`
|
||||||
StartedAt time.Time `json:"started_at"`
|
SkippedChannels int `json:"skipped_channels"`
|
||||||
FinishedAt time.Time `json:"finished_at"`
|
Checkpoints int `json:"checkpoints"`
|
||||||
|
DryRun bool `json:"dry_run,omitempty"`
|
||||||
|
FullCache bool `json:"full_cache,omitempty"`
|
||||||
|
StartedAt time.Time `json:"started_at"`
|
||||||
|
FinishedAt time.Time `json:"finished_at"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type snapshot struct {
|
type snapshot struct {
|
||||||
@ -67,8 +75,9 @@ type snapshot struct {
|
|||||||
}
|
}
|
||||||
|
|
||||||
type fileFingerprint struct {
|
type fileFingerprint struct {
|
||||||
Size int64 `json:"size"`
|
Size int64 `json:"size"`
|
||||||
ModUnixNS int64 `json:"mod_unix_ns"`
|
ModUnixNS int64 `json:"mod_unix_ns"`
|
||||||
|
Status string `json:"status,omitempty"`
|
||||||
}
|
}
|
||||||
|
|
||||||
type scanState struct {
|
type scanState struct {
|
||||||
@ -77,8 +86,42 @@ type scanState struct {
|
|||||||
channels map[string]store.ChannelRecord
|
channels map[string]store.ChannelRecord
|
||||||
}
|
}
|
||||||
|
|
||||||
|
type fileSource int
|
||||||
|
|
||||||
|
const (
|
||||||
|
fileSourceContext fileSource = iota
|
||||||
|
fileSourceCacheData
|
||||||
|
)
|
||||||
|
|
||||||
|
type fileCandidate struct {
|
||||||
|
absPath string
|
||||||
|
relPath string
|
||||||
|
relKey string
|
||||||
|
source fileSource
|
||||||
|
info fs.FileInfo
|
||||||
|
fingerprint fileFingerprint
|
||||||
|
}
|
||||||
|
|
||||||
|
type scanTotals struct {
|
||||||
|
guilds map[string]struct{}
|
||||||
|
channels map[string]struct{}
|
||||||
|
messages map[string]struct{}
|
||||||
|
dmMessages map[string]struct{}
|
||||||
|
guildMessages map[string]struct{}
|
||||||
|
dmChannels map[string]struct{}
|
||||||
|
skippedMessages map[string]struct{}
|
||||||
|
skippedChannels map[string]struct{}
|
||||||
|
}
|
||||||
|
|
||||||
|
type unresolvedMessages map[string]string
|
||||||
|
|
||||||
const wiretapFileIndexScope = "wiretap:file_index:v1"
|
const wiretapFileIndexScope = "wiretap:file_index:v1"
|
||||||
|
|
||||||
|
const (
|
||||||
|
fileStatusImported = "imported"
|
||||||
|
fileStatusSkipped = "skipped"
|
||||||
|
)
|
||||||
|
|
||||||
func DefaultPath() string {
|
func DefaultPath() string {
|
||||||
home, _ := os.UserHomeDir()
|
home, _ := os.UserHomeDir()
|
||||||
switch runtime.GOOS {
|
switch runtime.GOOS {
|
||||||
@ -105,25 +148,29 @@ func Import(ctx context.Context, st *store.Store, opts Options) (Stats, error) {
|
|||||||
if err != nil {
|
if err != nil {
|
||||||
return Stats{}, err
|
return Stats{}, err
|
||||||
}
|
}
|
||||||
stats, snap, err := scan(ctx, opts, state)
|
if opts.FullCache {
|
||||||
|
stats, snap, err := scanFullCache(ctx, opts, state)
|
||||||
|
if err != nil {
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
stats.DryRun = opts.DryRun
|
||||||
|
if opts.DryRun {
|
||||||
|
return stats, nil
|
||||||
|
}
|
||||||
|
if err := writeSnapshot(ctx, st, snap, len(state.previous) == 0); err != nil {
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
stats.Checkpoints = 1
|
||||||
|
return stats, nil
|
||||||
|
}
|
||||||
|
stats, err := scanAndImport(ctx, st, opts, state)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return stats, err
|
return stats, err
|
||||||
}
|
}
|
||||||
stats.DryRun = opts.DryRun
|
stats.DryRun = opts.DryRun
|
||||||
if opts.DryRun {
|
|
||||||
return stats, nil
|
|
||||||
}
|
|
||||||
fullScan := len(state.previous) == 0
|
|
||||||
if snapshotHasChanges(snap) || fullScan {
|
|
||||||
if err := writeSnapshot(ctx, st, snap, fullScan); err != nil {
|
|
||||||
return stats, err
|
|
||||||
}
|
|
||||||
} else if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
|
|
||||||
return stats, err
|
|
||||||
}
|
|
||||||
if err := saveFileIndex(ctx, st, state.current); err != nil {
|
|
||||||
return stats, err
|
|
||||||
}
|
|
||||||
return stats, nil
|
return stats, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
@ -136,7 +183,7 @@ func loadScanState(ctx context.Context, st *store.Store, opts Options) (scanStat
|
|||||||
if st == nil || opts.DryRun {
|
if st == nil || opts.DryRun {
|
||||||
return state, nil
|
return state, nil
|
||||||
}
|
}
|
||||||
raw, err := st.GetSyncState(ctx, wiretapFileIndexScope)
|
raw, err := st.GetSyncState(ctx, fileIndexScope(opts))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return state, err
|
return state, err
|
||||||
}
|
}
|
||||||
@ -160,19 +207,107 @@ func loadScanState(ctx context.Context, st *store.Store, opts Options) (scanStat
|
|||||||
return state, nil
|
return state, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func saveFileIndex(ctx context.Context, st *store.Store, index map[string]fileFingerprint) error {
|
func fileIndexScope(Options) string {
|
||||||
|
return wiretapFileIndexScope
|
||||||
|
}
|
||||||
|
|
||||||
|
func saveFileIndex(ctx context.Context, st *store.Store, opts Options, index map[string]fileFingerprint) error {
|
||||||
body, err := json.Marshal(index)
|
body, err := json.Marshal(index)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
return err
|
return err
|
||||||
}
|
}
|
||||||
return st.SetSyncState(ctx, wiretapFileIndexScope, string(body))
|
return st.SetSyncState(ctx, fileIndexScope(opts), string(body))
|
||||||
|
}
|
||||||
|
|
||||||
|
func sameFileFingerprint(a, b fileFingerprint) bool {
|
||||||
|
return a.Size == b.Size && a.ModUnixNS == b.ModUnixNS
|
||||||
|
}
|
||||||
|
|
||||||
|
func isImportedFingerprint(fingerprint fileFingerprint) bool {
|
||||||
|
return fingerprint.Status == "" || fingerprint.Status == fileStatusImported
|
||||||
|
}
|
||||||
|
|
||||||
|
func importedFingerprint(fingerprint fileFingerprint) fileFingerprint {
|
||||||
|
fingerprint.Status = fileStatusImported
|
||||||
|
return fingerprint
|
||||||
|
}
|
||||||
|
|
||||||
|
func skippedFingerprint(fingerprint fileFingerprint) fileFingerprint {
|
||||||
|
fingerprint.Status = fileStatusSkipped
|
||||||
|
return fingerprint
|
||||||
}
|
}
|
||||||
|
|
||||||
func snapshotHasChanges(snap snapshot) bool {
|
func snapshotHasChanges(snap snapshot) bool {
|
||||||
return len(snap.guilds) > 0 || len(snap.channels) > 0 || len(snap.messages) > 0
|
return len(snap.guilds) > 0 || len(snap.channels) > 0 || len(snap.messages) > 0
|
||||||
}
|
}
|
||||||
|
|
||||||
func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot, error) {
|
func scanAndImport(ctx context.Context, st *store.Store, opts Options, state scanState) (Stats, error) {
|
||||||
|
now := opts.Now
|
||||||
|
if now == nil {
|
||||||
|
now = time.Now
|
||||||
|
}
|
||||||
|
root := strings.TrimSpace(opts.Path)
|
||||||
|
if root == "" {
|
||||||
|
root = DefaultPath()
|
||||||
|
}
|
||||||
|
stats := Stats{Path: root, FullCache: opts.FullCache, StartedAt: now().UTC()}
|
||||||
|
rootFS, err := os.OpenRoot(root)
|
||||||
|
if err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, ignoreCacheFileError(err)
|
||||||
|
}
|
||||||
|
defer func() { _ = rootFS.Close() }()
|
||||||
|
contextFiles, cacheFiles, err := discoverCandidates(ctx, root, rootFS, opts, state, &stats)
|
||||||
|
if err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
fullScan := len(state.previous) == 0
|
||||||
|
if fullScan && !opts.DryRun {
|
||||||
|
if err := st.DeleteGuildData(ctx, "@unknown"); err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
run := newImportRun(ctx, st, opts, state, rootFS, &stats)
|
||||||
|
if err := run.scanContext(contextFiles); err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
if err := collectCacheRouteHints(ctx, rootFS, cacheFiles, run.base); err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
if err := run.scanCacheBatches(cacheFiles); err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
if err := run.retryPending(); err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
if !opts.DryRun {
|
||||||
|
if len(contextFiles) == 0 && len(cacheFiles) == 0 {
|
||||||
|
if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
stats.Checkpoints++
|
||||||
|
}
|
||||||
|
if err := st.DeleteOrphanChannels(ctx, DirectMessageGuildID); err != nil {
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, err
|
||||||
|
}
|
||||||
|
}
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func scanFullCache(ctx context.Context, opts Options, state scanState) (Stats, snapshot, error) {
|
||||||
now := opts.Now
|
now := opts.Now
|
||||||
if now == nil {
|
if now == nil {
|
||||||
now = time.Now
|
now = time.Now
|
||||||
@ -185,14 +320,8 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
|
|||||||
if maxBytes <= 0 {
|
if maxBytes <= 0 {
|
||||||
maxBytes = defaultMaxFileBytes
|
maxBytes = defaultMaxFileBytes
|
||||||
}
|
}
|
||||||
stats := Stats{Path: root, StartedAt: now().UTC()}
|
stats := Stats{Path: root, FullCache: true, StartedAt: now().UTC()}
|
||||||
snap := snapshot{
|
snap := newSnapshot()
|
||||||
guilds: map[string]store.GuildRecord{},
|
|
||||||
channels: map[string]store.ChannelRecord{},
|
|
||||||
messages: map[string]store.MessageMutation{},
|
|
||||||
routes: map[string]string{},
|
|
||||||
userLabels: map[string]userLabel{},
|
|
||||||
}
|
|
||||||
rootFS, err := os.OpenRoot(root)
|
rootFS, err := os.OpenRoot(root)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
stats.FinishedAt = now().UTC()
|
stats.FinishedAt = now().UTC()
|
||||||
@ -212,6 +341,7 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
|
|||||||
}
|
}
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
|
stats.FilesVisited++
|
||||||
info, err := entry.Info()
|
info, err := entry.Info()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
stats.FilesSkipped++
|
stats.FilesSkipped++
|
||||||
@ -231,8 +361,8 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
|
|||||||
Size: info.Size(),
|
Size: info.Size(),
|
||||||
ModUnixNS: info.ModTime().UnixNano(),
|
ModUnixNS: info.ModTime().UnixNano(),
|
||||||
}
|
}
|
||||||
state.current[relKey] = fingerprint
|
state.current[relKey] = importedFingerprint(fingerprint)
|
||||||
if previous, ok := state.previous[relKey]; ok && previous == fingerprint {
|
if previous, ok := state.previous[relKey]; ok && sameFileFingerprint(previous, fingerprint) && isImportedFingerprint(previous) {
|
||||||
stats.FilesUnchanged++
|
stats.FilesUnchanged++
|
||||||
return nil
|
return nil
|
||||||
}
|
}
|
||||||
@ -267,15 +397,181 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
|
|||||||
}); err != nil {
|
}); err != nil {
|
||||||
return stats, snap, err
|
return stats, snap, err
|
||||||
}
|
}
|
||||||
reconcileMessages(snap, state.channels)
|
totals := newScanTotals()
|
||||||
|
finalizeSnapshot(snap, state.channels, totals, &stats, true)
|
||||||
|
stats.FinishedAt = now().UTC()
|
||||||
|
return stats, snap, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func discoverCandidates(ctx context.Context, root string, rootFS *os.Root, opts Options, state scanState, stats *Stats) ([]fileCandidate, []fileCandidate, error) {
|
||||||
|
var contextFiles []fileCandidate
|
||||||
|
var cacheFiles []fileCandidate
|
||||||
|
maxBytes := opts.MaxFileBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = defaultMaxFileBytes
|
||||||
|
}
|
||||||
|
err := filepath.WalkDir(root, func(path string, entry fs.DirEntry, err error) error {
|
||||||
|
if err != nil {
|
||||||
|
return ignoreCacheFileError(err)
|
||||||
|
}
|
||||||
|
if ctx.Err() != nil {
|
||||||
|
return ctx.Err()
|
||||||
|
}
|
||||||
|
if entry.IsDir() {
|
||||||
|
if shouldSkipDir(entry.Name()) && path != root {
|
||||||
|
return filepath.SkipDir
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
stats.FilesVisited++
|
||||||
|
info, err := entry.Info()
|
||||||
|
if err != nil {
|
||||||
|
stats.FilesSkipped++
|
||||||
|
return ignoreCacheFileError(err)
|
||||||
|
}
|
||||||
|
if !isCandidateFile(path) || info.Size() <= 0 || info.Size() > maxBytes {
|
||||||
|
stats.FilesSkipped++
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
relPath, err := filepath.Rel(root, path)
|
||||||
|
if err != nil {
|
||||||
|
stats.FilesSkipped++
|
||||||
|
return ignoreCacheFileError(err)
|
||||||
|
}
|
||||||
|
relKey := filepath.ToSlash(relPath)
|
||||||
|
fingerprint := fileFingerprint{
|
||||||
|
Size: info.Size(),
|
||||||
|
ModUnixNS: info.ModTime().UnixNano(),
|
||||||
|
}
|
||||||
|
candidate := fileCandidate{
|
||||||
|
absPath: path,
|
||||||
|
relPath: relPath,
|
||||||
|
relKey: relKey,
|
||||||
|
source: sourceForPath(root, path, relPath),
|
||||||
|
info: info,
|
||||||
|
fingerprint: fingerprint,
|
||||||
|
}
|
||||||
|
if candidate.source == fileSourceCacheData {
|
||||||
|
if previous, ok := state.previous[relKey]; ok && sameFileFingerprint(previous, fingerprint) {
|
||||||
|
if !opts.FullCache || isImportedFingerprint(previous) {
|
||||||
|
state.current[relKey] = previous
|
||||||
|
stats.FilesUnchanged++
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if !opts.FullCache {
|
||||||
|
ok, err := cacheFileHasRouteHint(rootFS, relPath)
|
||||||
|
if err != nil {
|
||||||
|
stats.FilesSkipped++
|
||||||
|
return ignoreCacheFileError(err)
|
||||||
|
}
|
||||||
|
if !ok {
|
||||||
|
state.current[relKey] = skippedFingerprint(fingerprint)
|
||||||
|
stats.FilesSkipped++
|
||||||
|
stats.CacheFilesFastSkipped++
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
}
|
||||||
|
cacheFiles = append(cacheFiles, candidate)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if previous, ok := state.previous[relKey]; ok && sameFileFingerprint(previous, fingerprint) {
|
||||||
|
state.current[relKey] = previous
|
||||||
|
stats.FilesUnchanged++
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
contextFiles = append(contextFiles, candidate)
|
||||||
|
return nil
|
||||||
|
})
|
||||||
|
return contextFiles, cacheFiles, err
|
||||||
|
}
|
||||||
|
|
||||||
|
func scanCandidates(ctx context.Context, rootFS *os.Root, opts Options, candidates []fileCandidate, snap snapshot, channelLookup map[string]store.ChannelRecord, stats *Stats) error {
|
||||||
|
maxBytes := opts.MaxFileBytes
|
||||||
|
if maxBytes <= 0 {
|
||||||
|
maxBytes = defaultMaxFileBytes
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := rootFS.ReadFile(candidate.relPath)
|
||||||
|
if err != nil {
|
||||||
|
stats.FilesSkipped++
|
||||||
|
if err := ignoreCacheFileError(err); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
stats.FilesScanned++
|
||||||
|
stats.BytesScanned += int64(len(data))
|
||||||
|
collectChannelRoutes(snap, bytes.ToValidUTF8(data, nil))
|
||||||
|
objects := extractJSONValues(bytes.ToValidUTF8(data, nil))
|
||||||
|
for _, payload := range extractGzipPayloads(data, maxBytes) {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
collectChannelRoutes(snap, bytes.ToValidUTF8(payload, nil))
|
||||||
|
objects = append(objects, extractJSONValues(bytes.ToValidUTF8(payload, nil))...)
|
||||||
|
}
|
||||||
|
stats.JSONObjects += len(objects)
|
||||||
|
for _, raw := range objects {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
var value any
|
||||||
|
if err := json.Unmarshal(raw, &value); err != nil {
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
collectValue(snap, channelLookup, value, candidate.info.ModTime().UTC())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func collectCacheRouteHints(ctx context.Context, rootFS *os.Root, candidates []fileCandidate, snap snapshot) error {
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
if err := ctx.Err(); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
data, err := readFilePrefix(rootFS, candidate.relPath)
|
||||||
|
if err != nil {
|
||||||
|
if err := ignoreCacheFileError(err); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
continue
|
||||||
|
}
|
||||||
|
collectChannelRoutes(snap, bytes.ToValidUTF8(data, nil))
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func newScanTotals() scanTotals {
|
||||||
|
return scanTotals{
|
||||||
|
guilds: map[string]struct{}{},
|
||||||
|
channels: map[string]struct{}{},
|
||||||
|
messages: map[string]struct{}{},
|
||||||
|
dmMessages: map[string]struct{}{},
|
||||||
|
guildMessages: map[string]struct{}{},
|
||||||
|
dmChannels: map[string]struct{}{},
|
||||||
|
skippedMessages: map[string]struct{}{},
|
||||||
|
skippedChannels: map[string]struct{}{},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func finalizeSnapshot(snap snapshot, channelLookup map[string]store.ChannelRecord, totals scanTotals, stats *Stats, recordSkipped bool) unresolvedMessages {
|
||||||
|
reconcileMessages(snap, channelLookup)
|
||||||
inferDirectMessageNames(snap)
|
inferDirectMessageNames(snap)
|
||||||
reconcileMessages(snap, state.channels)
|
reconcileMessages(snap, channelLookup)
|
||||||
skippedChannels := map[string]struct{}{}
|
unresolved := unresolvedMessages{}
|
||||||
for id, msg := range snap.messages {
|
for id, msg := range snap.messages {
|
||||||
guildID := msg.Record.GuildID
|
guildID := msg.Record.GuildID
|
||||||
if guildID == "" {
|
if guildID == "" {
|
||||||
stats.SkippedMessages++
|
unresolved[id] = msg.Record.ChannelID
|
||||||
skippedChannels[msg.Record.ChannelID] = struct{}{}
|
if recordSkipped {
|
||||||
|
totals.skippedMessages[id] = struct{}{}
|
||||||
|
totals.skippedChannels[msg.Record.ChannelID] = struct{}{}
|
||||||
|
}
|
||||||
delete(snap.messages, id)
|
delete(snap.messages, id)
|
||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
@ -283,34 +579,195 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
|
|||||||
snap.guilds[guildID] = syntheticGuild(guildID, guildName(guildID))
|
snap.guilds[guildID] = syntheticGuild(guildID, guildName(guildID))
|
||||||
}
|
}
|
||||||
if _, ok := snap.channels[msg.Record.ChannelID]; !ok {
|
if _, ok := snap.channels[msg.Record.ChannelID]; !ok {
|
||||||
snap.channels[msg.Record.ChannelID] = syntheticChannel(msg.Record.ChannelID, guildID, msg.Record.ChannelName)
|
if channel, ok := channelLookup[msg.Record.ChannelID]; ok && channel.GuildID != "" {
|
||||||
|
snap.channels[msg.Record.ChannelID] = channel
|
||||||
|
} else {
|
||||||
|
snap.channels[msg.Record.ChannelID] = syntheticChannel(msg.Record.ChannelID, guildID, msg.Record.ChannelName)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
snap.messages[id] = msg
|
snap.messages[id] = msg
|
||||||
}
|
}
|
||||||
messageChannels := map[string]struct{}{}
|
|
||||||
dmChannels := map[string]struct{}{}
|
|
||||||
for _, msg := range snap.messages {
|
for _, msg := range snap.messages {
|
||||||
messageChannels[msg.Record.ChannelID] = struct{}{}
|
totals.messages[msg.Record.ID] = struct{}{}
|
||||||
switch msg.Record.GuildID {
|
switch msg.Record.GuildID {
|
||||||
case DirectMessageGuildID:
|
case DirectMessageGuildID:
|
||||||
stats.DMMessages++
|
totals.dmMessages[msg.Record.ID] = struct{}{}
|
||||||
dmChannels[msg.Record.ChannelID] = struct{}{}
|
totals.dmChannels[msg.Record.ChannelID] = struct{}{}
|
||||||
default:
|
default:
|
||||||
stats.GuildMessages++
|
totals.guildMessages[msg.Record.ID] = struct{}{}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
for id := range snap.channels {
|
for id, channel := range snap.channels {
|
||||||
if _, ok := messageChannels[id]; !ok {
|
channelLookup[id] = channel
|
||||||
delete(snap.channels, id)
|
totals.channels[id] = struct{}{}
|
||||||
}
|
|
||||||
}
|
}
|
||||||
stats.DMChannels = len(dmChannels)
|
for id := range snap.guilds {
|
||||||
stats.SkippedChannels = len(skippedChannels)
|
totals.guilds[id] = struct{}{}
|
||||||
stats.Guilds = len(snap.guilds)
|
}
|
||||||
stats.Channels = len(snap.channels)
|
stats.DMChannels = len(totals.dmChannels)
|
||||||
stats.Messages = len(snap.messages)
|
stats.SkippedChannels = len(totals.skippedChannels)
|
||||||
stats.FinishedAt = now().UTC()
|
stats.Guilds = len(totals.guilds)
|
||||||
return stats, snap, nil
|
stats.Channels = len(totals.channels)
|
||||||
|
stats.Messages = len(totals.messages)
|
||||||
|
stats.DMMessages = len(totals.dmMessages)
|
||||||
|
stats.GuildMessages = len(totals.guildMessages)
|
||||||
|
stats.SkippedMessages = len(totals.skippedMessages)
|
||||||
|
return unresolved
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeUnresolved(dst, src unresolvedMessages) {
|
||||||
|
for messageID, channelID := range src {
|
||||||
|
dst[messageID] = channelID
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func recordUnresolved(unresolved unresolvedMessages, totals scanTotals, stats *Stats) {
|
||||||
|
for messageID, channelID := range unresolved {
|
||||||
|
totals.skippedMessages[messageID] = struct{}{}
|
||||||
|
totals.skippedChannels[channelID] = struct{}{}
|
||||||
|
}
|
||||||
|
stats.SkippedChannels = len(totals.skippedChannels)
|
||||||
|
stats.SkippedMessages = len(totals.skippedMessages)
|
||||||
|
}
|
||||||
|
|
||||||
|
func commitSnapshot(ctx context.Context, st *store.Store, opts Options, state scanState, candidates []fileCandidate, snap snapshot, checkpoint bool, stats *Stats) error {
|
||||||
|
if opts.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if !checkpoint {
|
||||||
|
if snapshotHasChanges(snap) {
|
||||||
|
return writeSnapshot(ctx, st, snapshotWithoutMessageEvents(snap), false)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if snapshotHasChanges(snap) {
|
||||||
|
if err := writeSnapshot(ctx, st, snap, false); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
} else if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
state.current[candidate.relKey] = importedFingerprint(candidate.fingerprint)
|
||||||
|
}
|
||||||
|
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
stats.Checkpoints++
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func checkpointScannedCandidates(ctx context.Context, st *store.Store, opts Options, state scanState, candidates []fileCandidate, stats *Stats) error {
|
||||||
|
if opts.DryRun {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
for _, candidate := range candidates {
|
||||||
|
state.current[candidate.relKey] = importedFingerprint(candidate.fingerprint)
|
||||||
|
}
|
||||||
|
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
stats.Checkpoints++
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func snapshotWithoutMessageEvents(snap snapshot) snapshot {
|
||||||
|
out := snapshot{
|
||||||
|
guilds: snap.guilds,
|
||||||
|
channels: snap.channels,
|
||||||
|
messages: make(map[string]store.MessageMutation, len(snap.messages)),
|
||||||
|
routes: snap.routes,
|
||||||
|
userLabels: snap.userLabels,
|
||||||
|
}
|
||||||
|
for id, message := range snap.messages {
|
||||||
|
message.Options.AppendEvent = false
|
||||||
|
out.messages[id] = message
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func newSnapshot() snapshot {
|
||||||
|
return snapshot{
|
||||||
|
guilds: map[string]store.GuildRecord{},
|
||||||
|
channels: map[string]store.ChannelRecord{},
|
||||||
|
messages: map[string]store.MessageMutation{},
|
||||||
|
routes: map[string]string{},
|
||||||
|
userLabels: map[string]userLabel{},
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func newSnapshotWithContext(base snapshot) snapshot {
|
||||||
|
snap := newSnapshot()
|
||||||
|
for channelID, guildID := range base.routes {
|
||||||
|
snap.routes[channelID] = guildID
|
||||||
|
}
|
||||||
|
for userID, label := range base.userLabels {
|
||||||
|
snap.userLabels[userID] = label
|
||||||
|
}
|
||||||
|
return snap
|
||||||
|
}
|
||||||
|
|
||||||
|
func mergeSnapshotContext(base snapshot, next snapshot) {
|
||||||
|
for channelID, guildID := range next.routes {
|
||||||
|
collectChannelRoute(base, channelID, guildID)
|
||||||
|
}
|
||||||
|
for userID, label := range next.userLabels {
|
||||||
|
base.userLabels[userID] = label
|
||||||
|
}
|
||||||
|
for channelID, channel := range next.channels {
|
||||||
|
base.channels[channelID] = channel
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func copyChannelLookup(in map[string]store.ChannelRecord) map[string]store.ChannelRecord {
|
||||||
|
out := make(map[string]store.ChannelRecord, len(in))
|
||||||
|
for id, channel := range in {
|
||||||
|
out[id] = channel
|
||||||
|
}
|
||||||
|
return out
|
||||||
|
}
|
||||||
|
|
||||||
|
func sourceForPath(root, path, relPath string) fileSource {
|
||||||
|
if isRouteFilteredCachePath(root, path, relPath) {
|
||||||
|
return fileSourceCacheData
|
||||||
|
}
|
||||||
|
return fileSourceContext
|
||||||
|
}
|
||||||
|
|
||||||
|
func isRouteFilteredCachePath(root, path, relPath string) bool {
|
||||||
|
cleanRoot := filepath.ToSlash(root)
|
||||||
|
cleanPath := filepath.ToSlash(path)
|
||||||
|
cleanRel := filepath.ToSlash(relPath)
|
||||||
|
return filepath.Base(cleanRoot) == "Cache_Data" ||
|
||||||
|
filepath.Base(cleanRoot) == "CacheStorage" ||
|
||||||
|
strings.Contains(cleanPath, "/Cache/Cache_Data/") ||
|
||||||
|
strings.Contains(cleanPath, "/Service Worker/CacheStorage/") ||
|
||||||
|
strings.HasPrefix(cleanRel, "Cache_Data/") ||
|
||||||
|
strings.HasPrefix(cleanRel, "Service Worker/CacheStorage/")
|
||||||
|
}
|
||||||
|
|
||||||
|
func cacheFileHasRouteHint(rootFS *os.Root, relPath string) (bool, error) {
|
||||||
|
data, err := readFilePrefix(rootFS, relPath)
|
||||||
|
if err != nil {
|
||||||
|
return false, err
|
||||||
|
}
|
||||||
|
return channelRouteRE.Match(data) || apiMessagesRouteRE.Match(data), nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func readFilePrefix(rootFS *os.Root, relPath string) ([]byte, error) {
|
||||||
|
file, err := rootFS.Open(relPath)
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
defer func() { _ = file.Close() }()
|
||||||
|
data, err := io.ReadAll(io.LimitReader(file, cacheSniffBytes))
|
||||||
|
if err != nil {
|
||||||
|
return nil, err
|
||||||
|
}
|
||||||
|
return data, nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func ignoreCacheFileError(error) error {
|
func ignoreCacheFileError(error) error {
|
||||||
|
|||||||
382
internal/discorddesktop/import_pipeline_test.go
Normal file
382
internal/discorddesktop/import_pipeline_test.go
Normal file
@ -0,0 +1,382 @@
|
|||||||
|
package discorddesktop
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"fmt"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"github.com/stretchr/testify/require"
|
||||||
|
|
||||||
|
"github.com/steipete/discrawl/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestImportFastCacheSkipsUnroutedCacheDataUnlessFullCache(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_0"), []byte(`
|
||||||
|
{"id":"111111111111111121","guild_id":"999999999999999996","type":0,"name":"slow-cache"}
|
||||||
|
{"id":"333333333333333346","channel_id":"111111111111111121","content":"unrouted historical cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`), 0o600))
|
||||||
|
|
||||||
|
fastStore, err := store.Open(ctx, filepath.Join(dir, "fast.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = fastStore.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, fastStore, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.CacheFilesFastSkipped)
|
||||||
|
require.Equal(t, 0, stats.Messages)
|
||||||
|
|
||||||
|
results, err := fastStore.SearchMessages(ctx, store.SearchOptions{Query: "unrouted historical", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Empty(t, results)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, fastStore, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, 0, stats.CacheFilesFastSkipped)
|
||||||
|
require.Equal(t, 1, stats.FilesUnchanged)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, fastStore, Options{Path: dir, FullCache: true})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 1, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.Messages)
|
||||||
|
|
||||||
|
fullStore, err := store.Open(ctx, filepath.Join(dir, "full.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = fullStore.Close() }()
|
||||||
|
|
||||||
|
stats, err = Import(ctx, fullStore, Options{Path: dir, FullCache: true})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 1, stats.FilesScanned)
|
||||||
|
require.Equal(t, 0, stats.CacheFilesFastSkipped)
|
||||||
|
require.Equal(t, 1, stats.Messages)
|
||||||
|
|
||||||
|
results, err = fullStore.SearchMessages(ctx, store.SearchOptions{Query: "unrouted historical", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, results, 1)
|
||||||
|
require.Equal(t, "slow-cache", results[0].ChannelName)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImportCheckpointsCacheBatches(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
for i := range checkpointEveryFiles + 1 {
|
||||||
|
channelID := "111111111111111121"
|
||||||
|
messageID := 333333333333333346 + i
|
||||||
|
body := []byte(fmt.Sprintf(`https://discord.com/channels/999999999999999996/%s
|
||||||
|
{"id":"%d","channel_id":"%s","content":"checkpoint cache %d","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`, channelID, messageID, channelID, i))
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), body, 0o600))
|
||||||
|
}
|
||||||
|
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, checkpointEveryFiles+1, stats.FilesScanned)
|
||||||
|
require.Equal(t, checkpointEveryFiles+1, stats.Messages)
|
||||||
|
require.GreaterOrEqual(t, stats.Checkpoints, 2)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImportUsesLaterCacheMetadataBeforeCheckpointingEarlierBatch(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
|
||||||
|
channelID := "111111111111111121"
|
||||||
|
guildID := "999999999999999996"
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), []byte(fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||||
|
{"id":"333333333333333346","channel_id":"%s","content":"needs later channel metadata","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`, channelID, channelID)), 0o600))
|
||||||
|
for i := 1; i < checkpointEveryFiles; i++ {
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), []byte(fmt.Sprintf(
|
||||||
|
"https://discord.com/api/v9/channels/%s/messages?limit=50\n",
|
||||||
|
channelID,
|
||||||
|
)), 0o600))
|
||||||
|
}
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", checkpointEveryFiles)), []byte(fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||||
|
{"id":"%s","guild_id":"%s","type":0,"name":"later-metadata"}
|
||||||
|
`, channelID, channelID, guildID)), 0o600))
|
||||||
|
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, checkpointEveryFiles+1+checkpointEveryFiles, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.Messages)
|
||||||
|
require.GreaterOrEqual(t, stats.Checkpoints, 2)
|
||||||
|
|
||||||
|
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "needs later channel metadata", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, results, 1)
|
||||||
|
require.Equal(t, guildID, results[0].GuildID)
|
||||||
|
require.Equal(t, "later-metadata", results[0].ChannelName)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImportCheckpointsPartiallyResolvedRetryBatch(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
|
||||||
|
resolvedChannelID := "111111111111111121"
|
||||||
|
unresolvedChannelID := "111111111111111122"
|
||||||
|
guildID := "999999999999999996"
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), []byte(fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||||
|
https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||||
|
{"id":"333333333333333346","channel_id":"%s","content":"partially resolved retry message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
{"id":"333333333333333347","channel_id":"%s","content":"still unresolved retry message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`, resolvedChannelID, unresolvedChannelID, resolvedChannelID, unresolvedChannelID)), 0o600))
|
||||||
|
for i := 1; i < checkpointEveryFiles; i++ {
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), []byte(fmt.Sprintf(
|
||||||
|
"https://discord.com/api/v9/channels/%s/messages?limit=50\n",
|
||||||
|
resolvedChannelID,
|
||||||
|
)), 0o600))
|
||||||
|
}
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", checkpointEveryFiles)), []byte(fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||||
|
{"id":"%s","guild_id":"%s","type":0,"name":"partially-resolved"}
|
||||||
|
`, resolvedChannelID, resolvedChannelID, guildID)), 0o600))
|
||||||
|
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, checkpointEveryFiles+1+checkpointEveryFiles, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.Messages)
|
||||||
|
require.Equal(t, 1, stats.SkippedMessages)
|
||||||
|
require.GreaterOrEqual(t, stats.Checkpoints, 2)
|
||||||
|
|
||||||
|
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "partially resolved retry", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, results, 1)
|
||||||
|
require.Equal(t, "partially-resolved", results[0].ChannelName)
|
||||||
|
results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "still unresolved retry", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Empty(t, results)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImportCheckpointsUnresolvableRouteBearingCacheMisses(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
|
||||||
|
channelID := "111111111111111121"
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), []byte(fmt.Sprintf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||||
|
{"id":"333333333333333346","channel_id":"%s","content":"permanent unresolved cache miss","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`, channelID, channelID)), 0o600))
|
||||||
|
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 1, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.SkippedMessages)
|
||||||
|
require.Equal(t, 1, stats.Checkpoints)
|
||||||
|
|
||||||
|
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "permanent unresolved", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Empty(t, results)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.FilesUnchanged)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImportDoesNotAppendEventsForSkippedMixedBatch(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
|
||||||
|
guildID := "999999999999999996"
|
||||||
|
resolvedChannelID := "111111111111111121"
|
||||||
|
unresolvedChannelID := "111111111111111122"
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), []byte(fmt.Sprintf(`https://discord.com/channels/%s/%s
|
||||||
|
https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||||
|
{"id":"333333333333333346","channel_id":"%s","content":"mixed resolved message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
{"id":"333333333333333347","channel_id":"%s","content":"mixed unresolved message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`, guildID, resolvedChannelID, unresolvedChannelID, resolvedChannelID, unresolvedChannelID)), 0o600))
|
||||||
|
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 1, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.Checkpoints)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 0)
|
||||||
|
|
||||||
|
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "mixed resolved", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, results, 1)
|
||||||
|
results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "mixed unresolved", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Empty(t, results)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.FilesUnchanged)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 0)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImportDoesNotDuplicateEventsWhenSwitchingFullCacheModes(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
|
||||||
|
channelID := "111111111111111121"
|
||||||
|
guildID := "999999999999999996"
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), []byte(fmt.Sprintf(`https://discord.com/channels/%s/%s
|
||||||
|
{"id":"%s","guild_id":"%s","type":0,"name":"mode-switch"}
|
||||||
|
{"id":"333333333333333346","channel_id":"%s","content":"mode switch event once","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`, guildID, channelID, channelID, guildID, channelID)), 0o600))
|
||||||
|
|
||||||
|
t.Run("full then default", func(t *testing.T) {
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "full-first.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir, FullCache: true})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 1, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.Messages)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.FilesUnchanged)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||||
|
})
|
||||||
|
|
||||||
|
t.Run("default then full", func(t *testing.T) {
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "default-first.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 1, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.Messages)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||||
|
|
||||||
|
stats, err = Import(ctx, st, Options{Path: dir, FullCache: true})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.FilesUnchanged)
|
||||||
|
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImportFastCachePreservesKnownChannelMetadataAcrossBatches(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
leveldbPath := filepath.Join(dir, "Local Storage", "leveldb")
|
||||||
|
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||||
|
require.NoError(t, os.MkdirAll(leveldbPath, 0o755))
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
|
||||||
|
channelID := "111111111111111121"
|
||||||
|
guildID := "999999999999999996"
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(leveldbPath, "000001.log"), []byte(fmt.Sprintf(
|
||||||
|
`{"id":"%s","guild_id":"%s","type":11,"name":"known-thread","thread_metadata":{"archived":false}}`,
|
||||||
|
channelID,
|
||||||
|
guildID,
|
||||||
|
)), 0o600))
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_0"), []byte(fmt.Sprintf(`https://discord.com/channels/%s/%s
|
||||||
|
{"id":"333333333333333346","channel_id":"%s","content":"thread metadata cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`, guildID, channelID, channelID)), 0o600))
|
||||||
|
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 1, stats.Messages)
|
||||||
|
|
||||||
|
channels, err := st.Channels(ctx, guildID)
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, channels, 1)
|
||||||
|
require.Equal(t, "known-thread", channels[0].Name)
|
||||||
|
require.Equal(t, "thread_public", channels[0].Kind)
|
||||||
|
|
||||||
|
_, rows, err := st.ReadOnlyQuery(ctx, "select raw_json from channels where id = '111111111111111121'")
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, rows, 1)
|
||||||
|
require.Contains(t, rows[0][0], `"type":11`)
|
||||||
|
}
|
||||||
|
|
||||||
|
func TestImportFastCacheRouteFiltersServiceWorkerCacheStorage(t *testing.T) {
|
||||||
|
ctx := context.Background()
|
||||||
|
dir := t.TempDir()
|
||||||
|
cachePath := filepath.Join(dir, "Service Worker", "CacheStorage", "cache-id")
|
||||||
|
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||||
|
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "unrouted"), []byte(`
|
||||||
|
{"id":"111111111111111121","guild_id":"999999999999999996","type":0,"name":"service-worker-cache"}
|
||||||
|
{"id":"333333333333333346","channel_id":"111111111111111121","content":"service worker historical cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||||
|
`), 0o600))
|
||||||
|
|
||||||
|
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||||
|
require.NoError(t, err)
|
||||||
|
defer func() { _ = st.Close() }()
|
||||||
|
|
||||||
|
stats, err := Import(ctx, st, Options{Path: dir})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Equal(t, 0, stats.FilesScanned)
|
||||||
|
require.Equal(t, 1, stats.CacheFilesFastSkipped)
|
||||||
|
|
||||||
|
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "service worker historical", Limit: 10})
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Empty(t, results)
|
||||||
|
}
|
||||||
|
|
||||||
|
func requireMessageCount(t *testing.T, ctx context.Context, st *store.Store, table string, expected int) {
|
||||||
|
t.Helper()
|
||||||
|
_, rows, err := st.ReadOnlyQuery(ctx, fmt.Sprintf("select count(*) from %s", table))
|
||||||
|
require.NoError(t, err)
|
||||||
|
require.Len(t, rows, 1)
|
||||||
|
require.Len(t, rows[0], 1)
|
||||||
|
require.Equal(t, fmt.Sprint(expected), rows[0][0])
|
||||||
|
}
|
||||||
113
internal/discorddesktop/import_run.go
Normal file
113
internal/discorddesktop/import_run.go
Normal file
@ -0,0 +1,113 @@
|
|||||||
|
package discorddesktop
|
||||||
|
|
||||||
|
import (
|
||||||
|
"context"
|
||||||
|
"os"
|
||||||
|
|
||||||
|
"github.com/steipete/discrawl/internal/store"
|
||||||
|
)
|
||||||
|
|
||||||
|
type importRun struct {
|
||||||
|
ctx context.Context
|
||||||
|
st *store.Store
|
||||||
|
opts Options
|
||||||
|
state scanState
|
||||||
|
rootFS *os.Root
|
||||||
|
channelLookup map[string]store.ChannelRecord
|
||||||
|
totals scanTotals
|
||||||
|
stats *Stats
|
||||||
|
base snapshot
|
||||||
|
pending []fileCandidate
|
||||||
|
pendingUnresolved unresolvedMessages
|
||||||
|
pendingLookupSize int
|
||||||
|
pendingRouteSize int
|
||||||
|
}
|
||||||
|
|
||||||
|
func newImportRun(ctx context.Context, st *store.Store, opts Options, state scanState, rootFS *os.Root, stats *Stats) *importRun {
|
||||||
|
return &importRun{
|
||||||
|
ctx: ctx,
|
||||||
|
st: st,
|
||||||
|
opts: opts,
|
||||||
|
state: state,
|
||||||
|
rootFS: rootFS,
|
||||||
|
channelLookup: copyChannelLookup(state.channels),
|
||||||
|
totals: newScanTotals(),
|
||||||
|
stats: stats,
|
||||||
|
base: newSnapshot(),
|
||||||
|
pendingUnresolved: unresolvedMessages{},
|
||||||
|
pendingLookupSize: -1,
|
||||||
|
pendingRouteSize: -1,
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *importRun) scanContext(candidates []fileCandidate) error {
|
||||||
|
if err := scanCandidates(r.ctx, r.rootFS, r.opts, candidates, r.base, r.channelLookup, r.stats); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
return r.finalizeAndCommit(candidates, r.base, false)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *importRun) scanCacheBatches(candidates []fileCandidate) error {
|
||||||
|
for start := 0; start < len(candidates); start += checkpointEveryFiles {
|
||||||
|
end := start + checkpointEveryFiles
|
||||||
|
if end > len(candidates) {
|
||||||
|
end = len(candidates)
|
||||||
|
}
|
||||||
|
batchCandidates := candidates[start:end]
|
||||||
|
batch := newSnapshotWithContext(r.base)
|
||||||
|
if err := scanCandidates(r.ctx, r.rootFS, r.opts, batchCandidates, batch, r.channelLookup, r.stats); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
if err := r.finalizeAndCommit(batchCandidates, batch, false); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
mergeSnapshotContext(r.base, batch)
|
||||||
|
}
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *importRun) finalizeAndCommit(candidates []fileCandidate, snap snapshot, recordSkipped bool) error {
|
||||||
|
unresolved := finalizeSnapshot(snap, r.channelLookup, r.totals, r.stats, recordSkipped)
|
||||||
|
checkpoint := len(unresolved) == 0
|
||||||
|
if !checkpoint {
|
||||||
|
r.deferCandidates(candidates, unresolved)
|
||||||
|
}
|
||||||
|
if len(candidates) == 0 && !snapshotHasChanges(snap) {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
return commitSnapshot(r.ctx, r.st, r.opts, r.state, candidates, snap, checkpoint, r.stats)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *importRun) deferCandidates(candidates []fileCandidate, unresolved unresolvedMessages) {
|
||||||
|
r.pending = append(r.pending, candidates...)
|
||||||
|
mergeUnresolved(r.pendingUnresolved, unresolved)
|
||||||
|
if r.pendingLookupSize >= 0 {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
r.pendingLookupSize = len(r.channelLookup)
|
||||||
|
r.pendingRouteSize = len(r.base.routes)
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *importRun) retryPending() error {
|
||||||
|
if len(r.pending) == 0 {
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
if !r.pendingCanResolve() {
|
||||||
|
recordUnresolved(r.pendingUnresolved, r.totals, r.stats)
|
||||||
|
return checkpointScannedCandidates(r.ctx, r.st, r.opts, r.state, r.pending, r.stats)
|
||||||
|
}
|
||||||
|
retry := newSnapshotWithContext(r.base)
|
||||||
|
if err := scanCandidates(r.ctx, r.rootFS, r.opts, r.pending, retry, r.channelLookup, r.stats); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
finalizeSnapshot(retry, r.channelLookup, r.totals, r.stats, true)
|
||||||
|
if err := commitSnapshot(r.ctx, r.st, r.opts, r.state, r.pending, retry, true, r.stats); err != nil {
|
||||||
|
return err
|
||||||
|
}
|
||||||
|
mergeSnapshotContext(r.base, retry)
|
||||||
|
return nil
|
||||||
|
}
|
||||||
|
|
||||||
|
func (r *importRun) pendingCanResolve() bool {
|
||||||
|
return len(r.channelLookup) > r.pendingLookupSize || len(r.base.routes) > r.pendingRouteSize
|
||||||
|
}
|
||||||
Loading…
Reference in New Issue
Block a user