Compare commits
2 Commits
main
...
feat/discr
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
a473225867 | ||
|
|
afd9b5b33e |
@ -6,6 +6,8 @@ All notable changes to `discrawl` will be documented in this file.
|
||||
|
||||
### Changes
|
||||
|
||||
- `sync --source both|discord|wiretap` controls bot-token sync versus local Discord Desktop cache import; the default is `both`.
|
||||
- `wiretap` imports classifiable cached Discord Desktop message payloads into the local archive, including proven DMs under synthetic guild id `@me`, without using user tokens.
|
||||
- `sync` now defaults to the fast latest-message refresh path for untargeted runs; use `--all-channels` for the broad stored-channel repair sweep or `--full` for historical backfill.
|
||||
|
||||
## 0.4.1 - 2026-04-22
|
||||
|
||||
33
README.md
33
README.md
@ -1,8 +1,8 @@
|
||||
# discrawl 🛰️ — Mirror Discord into SQLite; search server history locally
|
||||
|
||||
`discrawl` mirrors Discord guild data into local SQLite so you can search, inspect, and query server history without depending on Discord search. Teams can also publish that archive as a private Git snapshot repo, so readers get fresh org memory without Discord bot credentials.
|
||||
`discrawl` mirrors Discord guild data into local SQLite so you can search, inspect, and query server history without depending on Discord search. It can also import classifiable Discord Desktop cache messages for DM recovery/search without using a user token. Teams can publish that archive as a private Git snapshot repo, so readers get fresh org memory without Discord bot credentials.
|
||||
|
||||
Live sync uses real bot tokens. No user-token hacks. Data stays local unless you explicitly publish a Git-backed snapshot.
|
||||
Live guild sync uses real bot tokens. Desktop wiretap mode reads local cache artifacts only; it does not extract credentials or run a selfbot. Data stays local unless you explicitly publish a Git-backed snapshot.
|
||||
|
||||
## What It Does
|
||||
|
||||
@ -13,6 +13,7 @@ Live sync uses real bot tokens. No user-token hacks. Data stays local unless you
|
||||
- extracts small text-like attachments into the local search index
|
||||
- records structured user and role mentions for direct querying
|
||||
- tails Gateway events for live updates, with periodic repair syncs
|
||||
- imports classifiable Discord Desktop cache messages with `wiretap`, including proven DMs under `@me`
|
||||
- publishes and imports private Git-backed archive snapshots for org-wide read access
|
||||
- supports Git-only read mode with no Discord credentials on reader machines
|
||||
- generates backup README activity reports, with optional AI-written field notes
|
||||
@ -24,7 +25,8 @@ Search defaults to all guilds. `sync` and `tail` default to the configured defau
|
||||
## Requirements
|
||||
|
||||
- Go `1.26+`
|
||||
- for publishing/syncing: a Discord bot token the bot can use to read the target guilds
|
||||
- for publishing/syncing guilds: a Discord bot token the bot can use to read the target guilds
|
||||
- for DM wiretap import: local Discord Desktop cache files on the same machine
|
||||
- for read-only Git-backed access: access to a private snapshot repo, no Discord credentials required
|
||||
- bot permissions for the channels you want archived when running `sync` or `tail`
|
||||
|
||||
@ -111,6 +113,7 @@ discrawl sync --full
|
||||
discrawl sync
|
||||
discrawl search "panic: nil pointer"
|
||||
discrawl tail
|
||||
discrawl wiretap
|
||||
```
|
||||
|
||||
Multi-account OpenClaw setup:
|
||||
@ -174,6 +177,9 @@ discrawl sync --full
|
||||
discrawl sync --full --all
|
||||
discrawl sync --guild 123456789012345678
|
||||
discrawl sync --guilds 123,456 --concurrency 8
|
||||
discrawl sync --source both
|
||||
discrawl sync --source discord
|
||||
discrawl sync --source wiretap
|
||||
discrawl sync --guild 123456789012345678 --all-channels
|
||||
discrawl sync --channels 111,222 --since 2026-03-01T00:00:00Z
|
||||
```
|
||||
@ -187,6 +193,7 @@ Sync modes:
|
||||
| `discrawl sync --full` | historical backfill | crawls older history until channels are complete; can take a long time on large servers |
|
||||
|
||||
`sync` already uses parallel channel workers. `--concurrency` overrides the default, and the default is auto-sized from `GOMAXPROCS` with a floor of `8` and a cap of `32`.
|
||||
`--source` selects what gets refreshed: `both` (default), `discord`/`key` for bot-token API sync only, or `wiretap` for local Discord Desktop cache import only.
|
||||
`--all` ignores `default_guild_id` and fans out across every discovered guild the bot can access.
|
||||
`--skip-members` refreshes guild/channel/message data without crawling the full member list, which is useful for frequent Git snapshot publishers that only need latest messages.
|
||||
`--latest-only` is still accepted for explicit latest-only runs; it is now the default for untargeted `sync`. Use `--all-channels` to opt out of the fast default without doing a full historical crawl.
|
||||
@ -209,6 +216,26 @@ discrawl tail --guild 123456789012345678
|
||||
discrawl tail --repair-every 30m
|
||||
```
|
||||
|
||||
### `wiretap`
|
||||
|
||||
Imports classifiable Discord Desktop message payloads into the same local SQLite archive. This is the path for searchable DMs because bot tokens cannot read personal direct messages.
|
||||
|
||||
```bash
|
||||
discrawl wiretap
|
||||
discrawl wiretap --path "$HOME/Library/Application Support/discord"
|
||||
discrawl wiretap --dry-run
|
||||
discrawl wiretap --watch-every 2m
|
||||
```
|
||||
|
||||
Notes:
|
||||
|
||||
- stores only classifiable cache messages in the normal `guilds` / `channels` / `messages` tables
|
||||
- stores proven DMs under the synthetic guild id `@me`
|
||||
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs
|
||||
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON
|
||||
- does not extract, store, or print Discord auth tokens
|
||||
- `--max-file-bytes` skips unusually large files; default is 64 MiB
|
||||
|
||||
### `search`
|
||||
|
||||
Searches archived messages. FTS is the default mode and works without embeddings.
|
||||
|
||||
29
SPEC.md
29
SPEC.md
@ -6,6 +6,7 @@ Goal:
|
||||
|
||||
- build a local-first Discord guild crawler
|
||||
- mirror all guild data the configured bot can access
|
||||
- import classifiable Discord Desktop cache messages without user tokens, including DMs
|
||||
- store it in SQLite
|
||||
- support fast text search, semantic search, and raw SQL
|
||||
- support one-shot backfill and long-running live sync
|
||||
@ -26,6 +27,7 @@ V1 scope:
|
||||
- all accessible private threads
|
||||
- archived thread coverage
|
||||
- full message history
|
||||
- desktop-local import from cached Discord Desktop artifacts, with proven DMs stored under `@me`
|
||||
- current member snapshot
|
||||
- FTS5 search
|
||||
- optional OpenAI embeddings with local vector search
|
||||
@ -33,7 +35,8 @@ V1 scope:
|
||||
|
||||
Out of scope for V1:
|
||||
|
||||
- personal-account DMs
|
||||
- remote/API personal-account DM crawling
|
||||
- Discord user-token automation/selfbot flows
|
||||
- reactions as primary indexed entities
|
||||
- attachment blob downloads by default
|
||||
- cross-guild unified sync UX
|
||||
@ -118,6 +121,8 @@ Important Discord facts that drive the schema:
|
||||
- forum posts are threads under a forum parent
|
||||
- message history is paginated and must be backfilled incrementally
|
||||
- live updates come from Gateway events, not from polling alone
|
||||
- personal DMs are only supported through desktop-local cache import
|
||||
- desktop cache messages without a provable channel/guild route are skipped rather than stored as unknown data
|
||||
- archived public and private threads must be enumerated explicitly
|
||||
- private archived thread access may require elevated bot perms like `Manage Threads`
|
||||
|
||||
@ -410,6 +415,7 @@ discrawl [global flags] <command> [args]
|
||||
- `init`
|
||||
- `sync`
|
||||
- `tail`
|
||||
- `wiretap`
|
||||
- `search`
|
||||
- `sql`
|
||||
- `members`
|
||||
@ -468,6 +474,27 @@ Requirements:
|
||||
- write checkpoints
|
||||
- periodic repair sync
|
||||
|
||||
### `wiretap`
|
||||
|
||||
Purpose:
|
||||
|
||||
- import Discord Desktop cache artifacts into the local archive
|
||||
- make cached personal DMs searchable under synthetic guild id `@me`
|
||||
|
||||
Expected flags:
|
||||
|
||||
- `--path <dir>`
|
||||
- `--dry-run`
|
||||
- `--watch-every <duration>`
|
||||
- `--max-file-bytes <bytes>`
|
||||
|
||||
Requirements:
|
||||
|
||||
- never use Discord user tokens
|
||||
- never extract or persist auth tokens from desktop cache
|
||||
- scan bounded local files only
|
||||
- store sanitized raw metadata, not full arbitrary cache blobs
|
||||
|
||||
### `search`
|
||||
|
||||
Purpose:
|
||||
|
||||
@ -1,22 +1,37 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"context"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"os"
|
||||
"os/signal"
|
||||
"strings"
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/discord"
|
||||
"github.com/steipete/discrawl/internal/discorddesktop"
|
||||
"github.com/steipete/discrawl/internal/embed"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/syncer"
|
||||
)
|
||||
|
||||
type syncSources struct {
|
||||
name string
|
||||
discord bool
|
||||
wiretap bool
|
||||
}
|
||||
|
||||
type syncRunStats struct {
|
||||
Source string `json:"source"`
|
||||
Discord *syncer.SyncStats `json:"discord,omitempty"`
|
||||
Wiretap *discorddesktop.Stats `json:"wiretap,omitempty"`
|
||||
}
|
||||
|
||||
func (r *runtime) runInit(args []string) error {
|
||||
fs := flag.NewFlagSet("init", flag.ContinueOnError)
|
||||
fs.SetOutput(io.Discard)
|
||||
@ -103,6 +118,7 @@ func (r *runtime) runSync(args []string) error {
|
||||
since := fs.String("since", "", "")
|
||||
channels := fs.String("channels", "", "")
|
||||
concurrency := fs.Int("concurrency", r.cfg.Sync.Concurrency, "")
|
||||
source := fs.String("source", r.cfg.Sync.Source, "")
|
||||
withEmbeddings := fs.Bool("with-embeddings", false, "")
|
||||
skipMembers := fs.Bool("skip-members", false, "")
|
||||
latestOnly := fs.Bool("latest-only", false, "")
|
||||
@ -111,6 +127,10 @@ func (r *runtime) runSync(args []string) error {
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
sources, err := parseSyncSources(*source)
|
||||
if err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
var sinceTime time.Time
|
||||
if *since != "" {
|
||||
parsed, err := time.Parse(time.RFC3339, *since)
|
||||
@ -135,11 +155,74 @@ func (r *runtime) runSync(args []string) error {
|
||||
SkipMembers: *skipMembers || defaultLatest,
|
||||
LatestOnly: latestMode,
|
||||
}
|
||||
stats, err := r.syncer.Sync(r.ctx, opts)
|
||||
if err != nil {
|
||||
return err
|
||||
var apiStats *syncer.SyncStats
|
||||
if sources.discord {
|
||||
shouldClose := r.client == nil
|
||||
if err := r.ensureDiscordServices(); err != nil {
|
||||
return err
|
||||
}
|
||||
if shouldClose && r.client != nil {
|
||||
defer func() { _ = r.client.Close() }()
|
||||
}
|
||||
stats, err := r.syncer.Sync(r.ctx, opts)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
apiStats = &stats
|
||||
}
|
||||
return r.print(stats)
|
||||
var wiretapStats *discorddesktop.Stats
|
||||
if sources.wiretap {
|
||||
stats, err := discorddesktop.Import(r.ctx, r.store, discorddesktop.Options{
|
||||
Path: r.cfg.Desktop.Path,
|
||||
MaxFileBytes: r.cfg.Desktop.MaxFileBytes,
|
||||
Now: r.now,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
wiretapStats = &stats
|
||||
}
|
||||
if sources.discord && !sources.wiretap {
|
||||
return r.print(*apiStats)
|
||||
}
|
||||
if sources.wiretap && !sources.discord {
|
||||
return r.print(*wiretapStats)
|
||||
}
|
||||
return r.print(syncRunStats{Source: sources.name, Discord: apiStats, Wiretap: wiretapStats})
|
||||
}
|
||||
|
||||
func parseSyncSources(raw string) (syncSources, error) {
|
||||
normalized := strings.ToLower(strings.TrimSpace(raw))
|
||||
if normalized == "" {
|
||||
normalized = "both"
|
||||
}
|
||||
normalized = strings.ReplaceAll(normalized, "+", ",")
|
||||
parts := strings.Split(normalized, ",")
|
||||
out := syncSources{name: normalized}
|
||||
for _, part := range parts {
|
||||
switch strings.TrimSpace(part) {
|
||||
case "", "both", "all":
|
||||
out.discord = true
|
||||
out.wiretap = true
|
||||
case "discord", "api", "bot", "key":
|
||||
out.discord = true
|
||||
case "wiretap", "desktop", "cache":
|
||||
out.wiretap = true
|
||||
default:
|
||||
return syncSources{}, fmt.Errorf("invalid --source %q; use both, discord, or wiretap", raw)
|
||||
}
|
||||
}
|
||||
switch {
|
||||
case out.discord && out.wiretap:
|
||||
out.name = "both"
|
||||
case out.discord:
|
||||
out.name = "discord"
|
||||
case out.wiretap:
|
||||
out.name = "wiretap"
|
||||
default:
|
||||
return syncSources{}, fmt.Errorf("invalid --source %q; use both, discord, or wiretap", raw)
|
||||
}
|
||||
return out, nil
|
||||
}
|
||||
|
||||
func (r *runtime) runTail(args []string) error {
|
||||
@ -156,6 +239,59 @@ func (r *runtime) runTail(args []string) error {
|
||||
return r.syncer.RunTail(ctx, r.resolveSyncGuilds(*guildFlag, *guildsFlag), *repairEvery)
|
||||
}
|
||||
|
||||
func (r *runtime) runWiretap(args []string) error {
|
||||
fs := flag.NewFlagSet("wiretap", flag.ContinueOnError)
|
||||
fs.SetOutput(io.Discard)
|
||||
path := fs.String("path", r.cfg.Desktop.Path, "")
|
||||
maxFileBytes := fs.Int64("max-file-bytes", r.cfg.Desktop.MaxFileBytes, "")
|
||||
dryRun := fs.Bool("dry-run", false, "")
|
||||
watchEvery := fs.Duration("watch-every", 0, "")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
return usageErr(fmt.Errorf("wiretap takes flags only"))
|
||||
}
|
||||
if *maxFileBytes <= 0 {
|
||||
return usageErr(fmt.Errorf("--max-file-bytes must be positive"))
|
||||
}
|
||||
runOnce := func(ctx context.Context) error {
|
||||
stats, err := discorddesktop.Import(ctx, r.store, discorddesktop.Options{
|
||||
Path: *path,
|
||||
MaxFileBytes: *maxFileBytes,
|
||||
DryRun: *dryRun,
|
||||
Now: r.now,
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return r.print(stats)
|
||||
}
|
||||
if *watchEvery <= 0 {
|
||||
return runOnce(r.ctx)
|
||||
}
|
||||
if *watchEvery < time.Second {
|
||||
return usageErr(fmt.Errorf("--watch-every must be at least 1s"))
|
||||
}
|
||||
ctx, stop := signal.NotifyContext(r.ctx, os.Interrupt, syscall.SIGTERM)
|
||||
defer stop()
|
||||
if err := runOnce(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
ticker := time.NewTicker(*watchEvery)
|
||||
defer ticker.Stop()
|
||||
for {
|
||||
select {
|
||||
case <-ctx.Done():
|
||||
return nil
|
||||
case <-ticker.C:
|
||||
if err := runOnce(ctx); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (r *runtime) runStatus(args []string) error {
|
||||
if len(args) != 0 {
|
||||
return usageErr(fmt.Errorf("status takes no arguments"))
|
||||
|
||||
@ -123,27 +123,32 @@ func (r *runtime) dispatch(rest []string) error {
|
||||
case "init":
|
||||
return r.runInit(rest[1:])
|
||||
case "sync":
|
||||
return r.withServicesAuto(true, true, func() error { return r.runSync(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runSync(rest[1:]) })
|
||||
case "tail":
|
||||
return r.withServices(true, func() error { return r.runTail(rest[1:]) })
|
||||
case "wiretap":
|
||||
return r.withLocalStoreDefault(false, func() error { return r.runWiretap(rest[1:]) })
|
||||
case "search":
|
||||
return r.withServices(false, func() error { return r.runSearch(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runSearch(rest[1:]) })
|
||||
case "messages":
|
||||
return r.withServicesAuto(hasBoolFlag(rest[1:], "--sync"), true, func() error { return r.runMessages(rest[1:]) })
|
||||
if hasBoolFlag(rest[1:], "--sync") {
|
||||
return r.withServicesAuto(true, true, func() error { return r.runMessages(rest[1:]) })
|
||||
}
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runMessages(rest[1:]) })
|
||||
case "mentions":
|
||||
return r.withServices(false, func() error { return r.runMentions(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runMentions(rest[1:]) })
|
||||
case "embed":
|
||||
return r.withServices(false, func() error { return r.runEmbed(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runEmbed(rest[1:]) })
|
||||
case "sql":
|
||||
return r.withServices(false, func() error { return r.runSQL(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runSQL(rest[1:]) })
|
||||
case "members":
|
||||
return r.withServices(false, func() error { return r.runMembers(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runMembers(rest[1:]) })
|
||||
case "channels":
|
||||
return r.withServices(false, func() error { return r.runChannels(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runChannels(rest[1:]) })
|
||||
case "status":
|
||||
return r.withServices(false, func() error { return r.runStatus(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runStatus(rest[1:]) })
|
||||
case "report":
|
||||
return r.withServices(false, func() error { return r.runReport(rest[1:]) })
|
||||
return r.withLocalStoreDefault(true, func() error { return r.runReport(rest[1:]) })
|
||||
case "publish":
|
||||
return r.withServicesAuto(false, false, func() error { return r.runPublish(rest[1:]) })
|
||||
case "subscribe":
|
||||
@ -161,6 +166,42 @@ func (r *runtime) withServices(withDiscord bool, fn func() error) error {
|
||||
return r.withServicesAuto(withDiscord, !withDiscord, fn)
|
||||
}
|
||||
|
||||
func (r *runtime) withLocalStoreDefault(autoShareUpdate bool, fn func() error) error {
|
||||
cfg, err := config.Load(r.configPath)
|
||||
if err != nil {
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
return configErr(err)
|
||||
}
|
||||
cfg = config.Default()
|
||||
if err := cfg.Normalize(); err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
}
|
||||
if err := config.EnsureRuntimeDirs(cfg); err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
dbPath, err := config.ExpandPath(cfg.DBPath)
|
||||
if err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
r.cfg = cfg
|
||||
storeFactory := r.openStore
|
||||
if storeFactory == nil {
|
||||
storeFactory = store.Open
|
||||
}
|
||||
r.store, err = storeFactory(r.ctx, dbPath)
|
||||
if err != nil {
|
||||
return dbErr(err)
|
||||
}
|
||||
defer func() { _ = r.store.Close() }()
|
||||
if autoShareUpdate && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
|
||||
if err := r.autoUpdateShare(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return fn()
|
||||
}
|
||||
|
||||
func (r *runtime) withServicesAuto(withDiscord, autoShareUpdate bool, fn func() error) error {
|
||||
cfg, err := config.Load(r.configPath)
|
||||
if err != nil {
|
||||
@ -189,35 +230,45 @@ func (r *runtime) withServicesAuto(withDiscord, autoShareUpdate bool, fn func()
|
||||
}
|
||||
}
|
||||
if withDiscord {
|
||||
discordFactory := r.newDiscord
|
||||
if discordFactory == nil {
|
||||
discordFactory = func(cfg config.Config) (discordClient, error) {
|
||||
token, err := config.ResolveDiscordToken(cfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return discord.New(token.Token)
|
||||
}
|
||||
if err := r.ensureDiscordServices(); err != nil {
|
||||
return err
|
||||
}
|
||||
r.client, err = discordFactory(cfg)
|
||||
if err != nil {
|
||||
return authErr(err)
|
||||
}
|
||||
defer func() { _ = r.client.Close() }()
|
||||
syncerFactory := r.newSyncer
|
||||
if syncerFactory == nil {
|
||||
syncerFactory = func(client syncer.Client, s *store.Store, logger *slog.Logger) syncService {
|
||||
return syncer.New(client, s, logger)
|
||||
}
|
||||
}
|
||||
r.syncer = syncerFactory(r.client, r.store, r.logger)
|
||||
if configurable, ok := r.syncer.(attachmentTextConfigurer); ok {
|
||||
configurable.SetAttachmentTextEnabled(cfg.AttachmentTextEnabled())
|
||||
if r.client != nil {
|
||||
defer func() { _ = r.client.Close() }()
|
||||
}
|
||||
}
|
||||
return fn()
|
||||
}
|
||||
|
||||
func (r *runtime) ensureDiscordServices() error {
|
||||
discordFactory := r.newDiscord
|
||||
if discordFactory == nil {
|
||||
discordFactory = func(cfg config.Config) (discordClient, error) {
|
||||
token, err := config.ResolveDiscordToken(cfg)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return discord.New(token.Token)
|
||||
}
|
||||
}
|
||||
client, err := discordFactory(r.cfg)
|
||||
if err != nil {
|
||||
return authErr(err)
|
||||
}
|
||||
r.client = client
|
||||
syncerFactory := r.newSyncer
|
||||
if syncerFactory == nil {
|
||||
syncerFactory = func(client syncer.Client, s *store.Store, logger *slog.Logger) syncService {
|
||||
return syncer.New(client, s, logger)
|
||||
}
|
||||
}
|
||||
r.syncer = syncerFactory(r.client, r.store, r.logger)
|
||||
if configurable, ok := r.syncer.(attachmentTextConfigurer); ok {
|
||||
configurable.SetAttachmentTextEnabled(r.cfg.AttachmentTextEnabled())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *runtime) autoUpdateShare() error {
|
||||
if !r.cfg.ShareEnabled() || !r.cfg.Share.AutoUpdate {
|
||||
return nil
|
||||
|
||||
@ -138,6 +138,105 @@ func TestStatusSearchSQLAndListings(t *testing.T) {
|
||||
}
|
||||
}
|
||||
|
||||
func TestWiretapImportsDesktopDirectMessages(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cfgPath := filepath.Join(dir, "config.toml")
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
desktopPath := filepath.Join(dir, "discord")
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(desktopPath, "IndexedDB"), 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(desktopPath, "IndexedDB", "000001.log"), []byte(`{"id":"111111111111111111","type":1,"recipients":[{"id":"222222222222222222","username":"alice","global_name":"Alice"}]}
|
||||
{"id":"333333333333333333","channel_id":"111111111111111111","content":"secret DM launch plan","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222222","username":"alice","global_name":"Alice"}}`), 0o600))
|
||||
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = dbPath
|
||||
cfg.Desktop.Path = desktopPath
|
||||
cfg.Discord.TokenSource = "none"
|
||||
require.NoError(t, config.Write(cfgPath, cfg))
|
||||
|
||||
var out bytes.Buffer
|
||||
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "wiretap"}, &out, &bytes.Buffer{}))
|
||||
require.Contains(t, out.String(), "messages=1")
|
||||
|
||||
out.Reset()
|
||||
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "search", "launch"}, &out, &bytes.Buffer{}))
|
||||
require.Contains(t, out.String(), "secret DM launch plan")
|
||||
require.Contains(t, out.String(), "@me")
|
||||
}
|
||||
|
||||
func TestWiretapAndSearchWorkWithoutConfig(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
home := filepath.Join(dir, "home")
|
||||
desktopPath := filepath.Join(dir, "discord")
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(desktopPath, "IndexedDB"), 0o755))
|
||||
require.NoError(t, os.MkdirAll(home, 0o755))
|
||||
t.Setenv("HOME", home)
|
||||
t.Setenv("USERPROFILE", home)
|
||||
require.NoError(t, os.WriteFile(filepath.Join(desktopPath, "IndexedDB", "000001.log"), []byte(`{"id":"111111111111111112","type":1,"recipients":[{"id":"222222222222222223","username":"alice","global_name":"Alice"}]}
|
||||
{"id":"333333333333333334","channel_id":"111111111111111112","content":"local-only DM import","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222223","username":"alice","global_name":"Alice"}}`), 0o600))
|
||||
|
||||
cfgPath := filepath.Join(dir, "missing.toml")
|
||||
var out bytes.Buffer
|
||||
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "wiretap", "--path", desktopPath}, &out, &bytes.Buffer{}))
|
||||
require.Contains(t, out.String(), "messages=1")
|
||||
|
||||
out.Reset()
|
||||
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "search", "local-only"}, &out, &bytes.Buffer{}))
|
||||
require.Contains(t, out.String(), "local-only DM import")
|
||||
require.Contains(t, out.String(), "@me")
|
||||
}
|
||||
|
||||
func TestSyncWiretapSourceImportsDesktopMessages(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cfgPath := filepath.Join(dir, "config.toml")
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
desktopPath := filepath.Join(dir, "discord")
|
||||
require.NoError(t, os.MkdirAll(filepath.Join(desktopPath, "IndexedDB"), 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(desktopPath, "IndexedDB", "000001.log"), []byte(`{"id":"111111111111111117","type":1,"recipients":[{"id":"222222222222222228","username":"alice","global_name":"Alice"}]}
|
||||
{"id":"333333333333333339","channel_id":"111111111111111117","content":"sync wiretap import","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222228","username":"alice","global_name":"Alice"}}`), 0o600))
|
||||
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = dbPath
|
||||
cfg.Desktop.Path = desktopPath
|
||||
cfg.Discord.TokenSource = "none"
|
||||
require.NoError(t, config.Write(cfgPath, cfg))
|
||||
|
||||
var out bytes.Buffer
|
||||
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "sync", "--source", "wiretap"}, &out, &bytes.Buffer{}))
|
||||
require.Contains(t, out.String(), "dm_messages=1")
|
||||
|
||||
out.Reset()
|
||||
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "search", "sync wiretap"}, &out, &bytes.Buffer{}))
|
||||
require.Contains(t, out.String(), "sync wiretap import")
|
||||
require.Contains(t, out.String(), "@me")
|
||||
}
|
||||
|
||||
func TestParseSyncSources(t *testing.T) {
|
||||
for _, tc := range []struct {
|
||||
raw string
|
||||
name string
|
||||
discord bool
|
||||
wiretap bool
|
||||
}{
|
||||
{"", "both", true, true},
|
||||
{"both", "both", true, true},
|
||||
{"key", "discord", true, false},
|
||||
{"discord", "discord", true, false},
|
||||
{"wiretap", "wiretap", false, true},
|
||||
{"key+wiretap", "both", true, true},
|
||||
} {
|
||||
sources, err := parseSyncSources(tc.raw)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, tc.name, sources.name)
|
||||
require.Equal(t, tc.discord, sources.discord)
|
||||
require.Equal(t, tc.wiretap, sources.wiretap)
|
||||
}
|
||||
_, err := parseSyncSources("nope")
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestReadCommandsAutoImportStaleShare(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
|
||||
@ -9,6 +9,7 @@ import (
|
||||
"text/tabwriter"
|
||||
"time"
|
||||
|
||||
"github.com/steipete/discrawl/internal/discorddesktop"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/syncer"
|
||||
)
|
||||
@ -77,6 +78,7 @@ Commands:
|
||||
init
|
||||
sync
|
||||
tail
|
||||
wiretap
|
||||
search
|
||||
messages
|
||||
mentions
|
||||
@ -101,9 +103,30 @@ func printRows(w io.Writer, cols []string, rows [][]string) error {
|
||||
|
||||
func printHuman(w io.Writer, value any) error {
|
||||
switch v := value.(type) {
|
||||
case syncRunStats:
|
||||
if _, err := fmt.Fprintf(w, "source=%s\n", v.Source); err != nil {
|
||||
return err
|
||||
}
|
||||
if v.Discord != nil {
|
||||
if _, err := fmt.Fprintf(w, "discord_guilds=%d\ndiscord_channels=%d\ndiscord_threads=%d\ndiscord_members=%d\ndiscord_messages=%d\n",
|
||||
v.Discord.Guilds, v.Discord.Channels, v.Discord.Threads, v.Discord.Members, v.Discord.Messages); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if v.Wiretap != nil {
|
||||
if _, err := fmt.Fprintf(w, "wiretap_messages=%d\nwiretap_dm_messages=%d\nwiretap_dm_channels=%d\nwiretap_guild_messages=%d\nwiretap_skipped_messages=%d\nwiretap_skipped_channels=%d\n",
|
||||
v.Wiretap.Messages, v.Wiretap.DMMessages, v.Wiretap.DMChannels, v.Wiretap.GuildMessages, v.Wiretap.SkippedMessages, v.Wiretap.SkippedChannels); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
case syncer.SyncStats:
|
||||
_, err := fmt.Fprintf(w, "guilds=%d channels=%d threads=%d members=%d messages=%d\n", v.Guilds, v.Channels, v.Threads, v.Members, v.Messages)
|
||||
return err
|
||||
case discorddesktop.Stats:
|
||||
_, err := fmt.Fprintf(w, "path=%s\nfiles=%d\nskipped=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ndry_run=%t\n",
|
||||
v.Path, v.FilesScanned, v.FilesSkipped, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.DryRun)
|
||||
return err
|
||||
case store.Status:
|
||||
_, err := fmt.Fprintf(w, "db=%s\nguilds=%d\nchannels=%d\nthreads=%d\nmessages=%d\nmembers=%d\nembedding_backlog=%d\nlast_sync=%s\nlast_tail_event=%s\n",
|
||||
v.DBPath, v.GuildCount, v.ChannelCount, v.ThreadCount, v.MessageCount, v.MemberCount, v.EmbeddingBacklog,
|
||||
|
||||
@ -28,6 +28,7 @@ type Config struct {
|
||||
CacheDir string `toml:"cache_dir"`
|
||||
LogDir string `toml:"log_dir"`
|
||||
Discord DiscordConfig `toml:"discord"`
|
||||
Desktop DesktopConfig `toml:"desktop"`
|
||||
Sync SyncConfig `toml:"sync"`
|
||||
Search SearchConfig `toml:"search"`
|
||||
Share ShareConfig `toml:"share"`
|
||||
@ -40,7 +41,13 @@ type DiscordConfig struct {
|
||||
TokenEnv string `toml:"token_env"`
|
||||
}
|
||||
|
||||
type DesktopConfig struct {
|
||||
Path string `toml:"path"`
|
||||
MaxFileBytes int64 `toml:"max_file_bytes"`
|
||||
}
|
||||
|
||||
type SyncConfig struct {
|
||||
Source string `toml:"source"`
|
||||
Concurrency int `toml:"concurrency"`
|
||||
RepairEvery string `toml:"repair_every"`
|
||||
FullHistory bool `toml:"full_history"`
|
||||
@ -115,7 +122,12 @@ func Default() Config {
|
||||
Account: "default",
|
||||
TokenEnv: DefaultTokenEnv,
|
||||
},
|
||||
Desktop: DesktopConfig{
|
||||
Path: defaultDiscordDesktopPath(home),
|
||||
MaxFileBytes: 64 << 20,
|
||||
},
|
||||
Sync: SyncConfig{
|
||||
Source: "both",
|
||||
Concurrency: defaultSyncConcurrency(),
|
||||
RepairEvery: "6h",
|
||||
FullHistory: true,
|
||||
@ -233,9 +245,19 @@ func (c *Config) Normalize() error {
|
||||
if c.Discord.TokenEnv == "" {
|
||||
c.Discord.TokenEnv = DefaultTokenEnv
|
||||
}
|
||||
if c.Desktop.Path == "" {
|
||||
c.Desktop.Path = defaultDiscordDesktopPath(homeDir())
|
||||
}
|
||||
if c.Desktop.MaxFileBytes <= 0 {
|
||||
c.Desktop.MaxFileBytes = 64 << 20
|
||||
}
|
||||
if c.Sync.Concurrency <= 0 {
|
||||
c.Sync.Concurrency = defaultSyncConcurrency()
|
||||
}
|
||||
c.Sync.Source = strings.ToLower(strings.TrimSpace(c.Sync.Source))
|
||||
if c.Sync.Source == "" {
|
||||
c.Sync.Source = "both"
|
||||
}
|
||||
if c.Sync.RepairEvery == "" {
|
||||
c.Sync.RepairEvery = "6h"
|
||||
}
|
||||
@ -294,6 +316,28 @@ func (c *Config) Normalize() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func defaultDiscordDesktopPath(home string) string {
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
return filepath.Join(home, "Library", "Application Support", "discord")
|
||||
case "windows":
|
||||
if appData := strings.TrimSpace(os.Getenv("APPDATA")); appData != "" {
|
||||
return filepath.Join(appData, "discord")
|
||||
}
|
||||
return filepath.Join(home, "AppData", "Roaming", "discord")
|
||||
default:
|
||||
if configHome := strings.TrimSpace(os.Getenv("XDG_CONFIG_HOME")); configHome != "" {
|
||||
return filepath.Join(configHome, "discord")
|
||||
}
|
||||
return filepath.Join(home, ".config", "discord")
|
||||
}
|
||||
}
|
||||
|
||||
func homeDir() string {
|
||||
home, _ := os.UserHomeDir()
|
||||
return home
|
||||
}
|
||||
|
||||
func (c Config) EffectiveDefaultGuildID() string {
|
||||
if c.DefaultGuildID != "" {
|
||||
return c.DefaultGuildID
|
||||
|
||||
926
internal/discorddesktop/import.go
Normal file
926
internal/discorddesktop/import.go
Normal file
@ -0,0 +1,926 @@
|
||||
package discorddesktop
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
"runtime"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
"time"
|
||||
"unicode"
|
||||
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
const (
|
||||
DirectMessageGuildID = "@me"
|
||||
DirectMessageGuildName = "Discord Direct Messages"
|
||||
defaultMaxFileBytes = 64 << 20
|
||||
maxObjectBytes = 4 << 20
|
||||
)
|
||||
|
||||
var channelRouteRE = regexp.MustCompile(`/channels/(@me|[0-9]{12,24})/([0-9]{12,24})`)
|
||||
|
||||
type Options struct {
|
||||
Path string
|
||||
MaxFileBytes int64
|
||||
DryRun bool
|
||||
Now func() time.Time
|
||||
}
|
||||
|
||||
type Stats struct {
|
||||
Path string `json:"path"`
|
||||
FilesScanned int `json:"files_scanned"`
|
||||
FilesSkipped int `json:"files_skipped"`
|
||||
BytesScanned int64 `json:"bytes_scanned"`
|
||||
JSONObjects int `json:"json_objects"`
|
||||
Guilds int `json:"guilds"`
|
||||
Channels int `json:"channels"`
|
||||
Messages int `json:"messages"`
|
||||
DMMessages int `json:"dm_messages"`
|
||||
DMChannels int `json:"dm_channels"`
|
||||
GuildMessages int `json:"guild_messages"`
|
||||
SkippedMessages int `json:"skipped_messages"`
|
||||
SkippedChannels int `json:"skipped_channels"`
|
||||
DryRun bool `json:"dry_run,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
FinishedAt time.Time `json:"finished_at"`
|
||||
}
|
||||
|
||||
type snapshot struct {
|
||||
guilds map[string]store.GuildRecord
|
||||
channels map[string]store.ChannelRecord
|
||||
messages map[string]store.MessageMutation
|
||||
routes map[string]string
|
||||
}
|
||||
|
||||
func DefaultPath() string {
|
||||
home, _ := os.UserHomeDir()
|
||||
switch runtime.GOOS {
|
||||
case "darwin":
|
||||
return filepath.Join(home, "Library", "Application Support", "discord")
|
||||
case "windows":
|
||||
if appData := strings.TrimSpace(os.Getenv("APPDATA")); appData != "" {
|
||||
return filepath.Join(appData, "discord")
|
||||
}
|
||||
return filepath.Join(home, "AppData", "Roaming", "discord")
|
||||
default:
|
||||
if configHome := strings.TrimSpace(os.Getenv("XDG_CONFIG_HOME")); configHome != "" {
|
||||
return filepath.Join(configHome, "discord")
|
||||
}
|
||||
return filepath.Join(home, ".config", "discord")
|
||||
}
|
||||
}
|
||||
|
||||
func Import(ctx context.Context, st *store.Store, opts Options) (Stats, error) {
|
||||
if st == nil && !opts.DryRun {
|
||||
return Stats{}, errors.New("store is required")
|
||||
}
|
||||
stats, snap, err := scan(ctx, opts)
|
||||
if err != nil {
|
||||
return stats, err
|
||||
}
|
||||
stats.DryRun = opts.DryRun
|
||||
if opts.DryRun {
|
||||
return stats, nil
|
||||
}
|
||||
if err := writeSnapshot(ctx, st, snap); err != nil {
|
||||
return stats, err
|
||||
}
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func scan(ctx context.Context, opts Options) (Stats, snapshot, error) {
|
||||
now := opts.Now
|
||||
if now == nil {
|
||||
now = time.Now
|
||||
}
|
||||
root := strings.TrimSpace(opts.Path)
|
||||
if root == "" {
|
||||
root = DefaultPath()
|
||||
}
|
||||
maxBytes := opts.MaxFileBytes
|
||||
if maxBytes <= 0 {
|
||||
maxBytes = defaultMaxFileBytes
|
||||
}
|
||||
stats := Stats{Path: root, StartedAt: now().UTC()}
|
||||
snap := snapshot{
|
||||
guilds: map[string]store.GuildRecord{},
|
||||
channels: map[string]store.ChannelRecord{},
|
||||
messages: map[string]store.MessageMutation{},
|
||||
routes: map[string]string{},
|
||||
}
|
||||
if err := filepath.WalkDir(root, func(path string, entry fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return ignoreCacheFileError(err)
|
||||
}
|
||||
if ctx.Err() != nil {
|
||||
return ctx.Err()
|
||||
}
|
||||
if entry.IsDir() {
|
||||
if shouldSkipDir(entry.Name()) && path != root {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
stats.FilesSkipped++
|
||||
return ignoreCacheFileError(err)
|
||||
}
|
||||
if !isCandidateFile(path) || info.Size() <= 0 || info.Size() > maxBytes {
|
||||
stats.FilesSkipped++
|
||||
return nil
|
||||
}
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
stats.FilesSkipped++
|
||||
return ignoreCacheFileError(err)
|
||||
}
|
||||
stats.FilesScanned++
|
||||
stats.BytesScanned += int64(len(data))
|
||||
collectChannelRoutes(snap, bytes.ToValidUTF8(data, nil))
|
||||
objects := extractJSONValues(bytes.ToValidUTF8(data, nil))
|
||||
for _, payload := range extractGzipPayloads(data, maxBytes) {
|
||||
collectChannelRoutes(snap, bytes.ToValidUTF8(payload, nil))
|
||||
objects = append(objects, extractJSONValues(bytes.ToValidUTF8(payload, nil))...)
|
||||
}
|
||||
stats.JSONObjects += len(objects)
|
||||
for _, raw := range objects {
|
||||
var value any
|
||||
if err := json.Unmarshal(raw, &value); err != nil {
|
||||
continue
|
||||
}
|
||||
collectValue(snap, value, info.ModTime().UTC())
|
||||
}
|
||||
return nil
|
||||
}); err != nil {
|
||||
return stats, snap, err
|
||||
}
|
||||
reconcileMessages(snap)
|
||||
skippedChannels := map[string]struct{}{}
|
||||
for id, msg := range snap.messages {
|
||||
guildID := msg.Record.GuildID
|
||||
if guildID == "" {
|
||||
stats.SkippedMessages++
|
||||
skippedChannels[msg.Record.ChannelID] = struct{}{}
|
||||
delete(snap.messages, id)
|
||||
continue
|
||||
}
|
||||
if guildID == DirectMessageGuildID {
|
||||
if _, ok := snap.guilds[guildID]; !ok {
|
||||
snap.guilds[guildID] = syntheticGuild(guildID, guildName(guildID))
|
||||
}
|
||||
}
|
||||
if _, ok := snap.channels[msg.Record.ChannelID]; !ok && guildID == DirectMessageGuildID {
|
||||
snap.channels[msg.Record.ChannelID] = syntheticChannel(msg.Record.ChannelID, guildID, msg.Record.ChannelName)
|
||||
}
|
||||
snap.messages[id] = msg
|
||||
}
|
||||
dmChannels := map[string]struct{}{}
|
||||
for _, msg := range snap.messages {
|
||||
switch msg.Record.GuildID {
|
||||
case DirectMessageGuildID:
|
||||
stats.DMMessages++
|
||||
dmChannels[msg.Record.ChannelID] = struct{}{}
|
||||
default:
|
||||
stats.GuildMessages++
|
||||
}
|
||||
}
|
||||
for id, channel := range snap.channels {
|
||||
if channel.GuildID != DirectMessageGuildID {
|
||||
delete(snap.channels, id)
|
||||
continue
|
||||
}
|
||||
if _, ok := dmChannels[id]; !ok {
|
||||
delete(snap.channels, id)
|
||||
}
|
||||
}
|
||||
stats.DMChannels = len(dmChannels)
|
||||
stats.SkippedChannels = len(skippedChannels)
|
||||
stats.Guilds = len(snap.guilds)
|
||||
stats.Channels = len(snap.channels)
|
||||
stats.Messages = len(snap.messages)
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, snap, nil
|
||||
}
|
||||
|
||||
func ignoreCacheFileError(error) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func writeSnapshot(ctx context.Context, st *store.Store, snap snapshot) error {
|
||||
if err := st.DeleteGuildData(ctx, "@unknown"); err != nil {
|
||||
return err
|
||||
}
|
||||
guilds := mapValues(snap.guilds)
|
||||
sort.Slice(guilds, func(i, j int) bool { return guilds[i].ID < guilds[j].ID })
|
||||
for _, guild := range guilds {
|
||||
if err := st.UpsertGuild(ctx, guild); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
channels := mapValues(snap.channels)
|
||||
sort.Slice(channels, func(i, j int) bool { return channels[i].ID < channels[j].ID })
|
||||
for _, channel := range channels {
|
||||
if err := st.UpsertChannel(ctx, channel); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
messages := mapValues(snap.messages)
|
||||
sort.Slice(messages, func(i, j int) bool { return messages[i].Record.ID < messages[j].Record.ID })
|
||||
if err := st.UpsertMessages(ctx, messages); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := st.DeleteOrphanChannels(ctx, DirectMessageGuildID); err != nil {
|
||||
return err
|
||||
}
|
||||
return st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano))
|
||||
}
|
||||
|
||||
func collectValue(snap snapshot, value any, fallbackTime time.Time) {
|
||||
switch typed := value.(type) {
|
||||
case map[string]any:
|
||||
if channel, ok := parseChannel(typed); ok {
|
||||
snap.channels[channel.ID] = channel
|
||||
if channel.GuildID == DirectMessageGuildID {
|
||||
if _, ok := snap.guilds[channel.GuildID]; !ok {
|
||||
snap.guilds[channel.GuildID] = syntheticGuild(channel.GuildID, guildName(channel.GuildID))
|
||||
}
|
||||
}
|
||||
}
|
||||
if message, ok := parseMessage(typed, fallbackTime, snap.channels); ok {
|
||||
snap.messages[message.Record.ID] = message
|
||||
}
|
||||
for _, child := range typed {
|
||||
collectValue(snap, child, fallbackTime)
|
||||
}
|
||||
case []any:
|
||||
for _, child := range typed {
|
||||
collectValue(snap, child, fallbackTime)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func collectChannelRoutes(snap snapshot, data []byte) {
|
||||
for _, match := range channelRouteRE.FindAllSubmatch(data, -1) {
|
||||
if len(match) != 3 {
|
||||
continue
|
||||
}
|
||||
guildID := string(match[1])
|
||||
channelID := string(match[2])
|
||||
if !looksSnowflake(channelID) {
|
||||
continue
|
||||
}
|
||||
if existing, ok := snap.routes[channelID]; ok && existing != guildID {
|
||||
snap.routes[channelID] = ""
|
||||
continue
|
||||
}
|
||||
snap.routes[channelID] = guildID
|
||||
}
|
||||
}
|
||||
|
||||
func parseChannel(raw map[string]any) (store.ChannelRecord, bool) {
|
||||
id := stringField(raw, "id")
|
||||
if !looksSnowflake(id) {
|
||||
return store.ChannelRecord{}, false
|
||||
}
|
||||
if _, hasChannelID := raw["channel_id"]; hasChannelID {
|
||||
return store.ChannelRecord{}, false
|
||||
}
|
||||
typeValue, hasType := intField(raw, "type")
|
||||
name := strings.TrimSpace(stringField(raw, "name"))
|
||||
recipients, hasRecipients := raw["recipients"].([]any)
|
||||
guildID := stringField(raw, "guild_id")
|
||||
isDM := guildID == "" && (typeValue == 1 || typeValue == 3 || hasRecipients)
|
||||
if !hasType && !hasRecipients && name == "" {
|
||||
return store.ChannelRecord{}, false
|
||||
}
|
||||
if isDM {
|
||||
guildID = DirectMessageGuildID
|
||||
}
|
||||
if guildID == "" {
|
||||
return store.ChannelRecord{}, false
|
||||
}
|
||||
if name == "" {
|
||||
name = recipientLabel(recipients)
|
||||
}
|
||||
if name == "" {
|
||||
if isDM {
|
||||
name = "dm-" + shortID(id)
|
||||
} else {
|
||||
name = "channel-" + shortID(id)
|
||||
}
|
||||
}
|
||||
rawJSON := channelRawJSON(raw, id, guildID, name, kindForChannelType(typeValue, isDM))
|
||||
return store.ChannelRecord{
|
||||
ID: id,
|
||||
GuildID: guildID,
|
||||
Kind: kindForChannelType(typeValue, isDM),
|
||||
Name: name,
|
||||
RawJSON: rawJSON,
|
||||
}, true
|
||||
}
|
||||
|
||||
func parseMessage(raw map[string]any, fallbackTime time.Time, channels map[string]store.ChannelRecord) (store.MessageMutation, bool) {
|
||||
id := stringField(raw, "id")
|
||||
channelID := stringField(raw, "channel_id")
|
||||
if !looksSnowflake(id) || !looksSnowflake(channelID) {
|
||||
return store.MessageMutation{}, false
|
||||
}
|
||||
author, _ := raw["author"].(map[string]any)
|
||||
content, hasContent := raw["content"].(string)
|
||||
if !hasContent && len(author) == 0 {
|
||||
return store.MessageMutation{}, false
|
||||
}
|
||||
createdAt := parseDiscordTime(stringField(raw, "timestamp"))
|
||||
if createdAt.IsZero() {
|
||||
createdAt = snowflakeTime(id)
|
||||
}
|
||||
if createdAt.IsZero() {
|
||||
createdAt = fallbackTime
|
||||
}
|
||||
if createdAt.IsZero() {
|
||||
return store.MessageMutation{}, false
|
||||
}
|
||||
guildID := stringField(raw, "guild_id")
|
||||
if guildID == "" {
|
||||
if channel, ok := channels[channelID]; ok && channel.GuildID != "" {
|
||||
guildID = channel.GuildID
|
||||
}
|
||||
}
|
||||
channelName := "channel-" + shortID(channelID)
|
||||
if channel, ok := channels[channelID]; ok && channel.Name != "" {
|
||||
channelName = channel.Name
|
||||
}
|
||||
authorID := stringField(author, "id")
|
||||
authorName := firstNonEmpty(
|
||||
stringField(author, "global_name"),
|
||||
stringField(author, "display_name"),
|
||||
stringField(author, "username"),
|
||||
)
|
||||
msgType, _ := intField(raw, "type")
|
||||
editedAt := parseDiscordTime(stringField(raw, "edited_timestamp"))
|
||||
attachments := parseAttachments(raw, id, guildID, channelID, authorID)
|
||||
mentions := parseMentions(raw, id, guildID, channelID, authorID, createdAt)
|
||||
normalized := normalizeText(content, attachmentText(attachments), embedText(raw))
|
||||
return store.MessageMutation{
|
||||
Record: store.MessageRecord{
|
||||
ID: id,
|
||||
GuildID: guildID,
|
||||
ChannelID: channelID,
|
||||
ChannelName: channelName,
|
||||
AuthorID: authorID,
|
||||
AuthorName: authorName,
|
||||
MessageType: msgType,
|
||||
CreatedAt: createdAt.UTC().Format(time.RFC3339Nano),
|
||||
EditedAt: formatOptionalTime(editedAt),
|
||||
Content: content,
|
||||
NormalizedContent: normalized,
|
||||
ReplyToMessageID: messageReferenceID(raw),
|
||||
Pinned: boolField(raw, "pinned"),
|
||||
HasAttachments: len(attachments) > 0,
|
||||
RawJSON: messageRawJSON(raw, id, guildID, channelID, authorID),
|
||||
},
|
||||
EventType: "wiretap",
|
||||
PayloadJSON: messageRawJSON(raw, id, guildID, channelID, authorID),
|
||||
Options: store.WriteOptions{
|
||||
AppendEvent: true,
|
||||
EnqueueEmbedding: false,
|
||||
},
|
||||
Attachments: attachments,
|
||||
Mentions: mentions,
|
||||
}, true
|
||||
}
|
||||
|
||||
func reconcileMessages(snap snapshot) {
|
||||
for id, msg := range snap.messages {
|
||||
channel, ok := snap.channels[msg.Record.ChannelID]
|
||||
if !ok {
|
||||
if guildID := snap.routes[msg.Record.ChannelID]; guildID != "" {
|
||||
msg.Record.GuildID = guildID
|
||||
if guildID == DirectMessageGuildID {
|
||||
channel = syntheticChannel(msg.Record.ChannelID, guildID, "")
|
||||
snap.channels[msg.Record.ChannelID] = channel
|
||||
ok = true
|
||||
}
|
||||
}
|
||||
}
|
||||
if !ok {
|
||||
if msg.Record.GuildID != "" {
|
||||
for i := range msg.Attachments {
|
||||
msg.Attachments[i].GuildID = msg.Record.GuildID
|
||||
}
|
||||
for i := range msg.Mentions {
|
||||
msg.Mentions[i].GuildID = msg.Record.GuildID
|
||||
}
|
||||
msg.Record.RawJSON = withRawGuildID(msg.Record.RawJSON, msg.Record.GuildID)
|
||||
msg.PayloadJSON = withRawGuildID(msg.PayloadJSON, msg.Record.GuildID)
|
||||
}
|
||||
snap.messages[id] = msg
|
||||
continue
|
||||
}
|
||||
if channel.GuildID != "" {
|
||||
msg.Record.GuildID = channel.GuildID
|
||||
for i := range msg.Attachments {
|
||||
msg.Attachments[i].GuildID = channel.GuildID
|
||||
}
|
||||
for i := range msg.Mentions {
|
||||
msg.Mentions[i].GuildID = channel.GuildID
|
||||
}
|
||||
}
|
||||
if channel.Name != "" {
|
||||
msg.Record.ChannelName = channel.Name
|
||||
}
|
||||
msg.Record.RawJSON = withRawGuildID(msg.Record.RawJSON, msg.Record.GuildID)
|
||||
msg.PayloadJSON = withRawGuildID(msg.PayloadJSON, msg.Record.GuildID)
|
||||
snap.messages[id] = msg
|
||||
}
|
||||
}
|
||||
|
||||
func withRawGuildID(rawJSON, guildID string) string {
|
||||
if rawJSON == "" || guildID == "" {
|
||||
return rawJSON
|
||||
}
|
||||
var raw map[string]any
|
||||
if err := json.Unmarshal([]byte(rawJSON), &raw); err != nil {
|
||||
return rawJSON
|
||||
}
|
||||
raw["guild_id"] = guildID
|
||||
body, err := json.Marshal(raw)
|
||||
if err != nil {
|
||||
return rawJSON
|
||||
}
|
||||
return string(body)
|
||||
}
|
||||
|
||||
func extractGzipPayloads(data []byte, maxBytes int64) [][]byte {
|
||||
var out [][]byte
|
||||
for offset := 0; offset < len(data)-1; offset++ {
|
||||
if data[offset] != 0x1f || data[offset+1] != 0x8b {
|
||||
continue
|
||||
}
|
||||
reader, err := gzip.NewReader(bytes.NewReader(data[offset:]))
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
reader.Multistream(false)
|
||||
payload, readErr := io.ReadAll(io.LimitReader(reader, maxBytes+1))
|
||||
closeErr := reader.Close()
|
||||
if readErr != nil || closeErr != nil || int64(len(payload)) > maxBytes {
|
||||
continue
|
||||
}
|
||||
out = append(out, payload)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func extractJSONValues(data []byte) [][]byte {
|
||||
candidate := bytes.TrimSpace(data)
|
||||
if len(candidate) <= maxObjectBytes && len(candidate) > 0 && json.Valid(candidate) {
|
||||
switch candidate[0] {
|
||||
case '{', '[':
|
||||
return [][]byte{append([]byte(nil), candidate...)}
|
||||
}
|
||||
}
|
||||
return extractJSONObjects(data)
|
||||
}
|
||||
|
||||
func extractJSONObjects(data []byte) [][]byte {
|
||||
var out [][]byte
|
||||
depth := 0
|
||||
start := -1
|
||||
inString := false
|
||||
escaped := false
|
||||
for i, b := range data {
|
||||
if inString {
|
||||
if escaped {
|
||||
escaped = false
|
||||
continue
|
||||
}
|
||||
switch b {
|
||||
case '\\':
|
||||
escaped = true
|
||||
case '"':
|
||||
inString = false
|
||||
}
|
||||
continue
|
||||
}
|
||||
switch b {
|
||||
case '"':
|
||||
if depth > 0 {
|
||||
inString = true
|
||||
}
|
||||
case '{':
|
||||
if depth == 0 {
|
||||
start = i
|
||||
}
|
||||
depth++
|
||||
case '}':
|
||||
if depth == 0 {
|
||||
continue
|
||||
}
|
||||
depth--
|
||||
if depth == 0 && start >= 0 {
|
||||
if i-start+1 <= maxObjectBytes {
|
||||
candidate := bytes.TrimSpace(data[start : i+1])
|
||||
if json.Valid(candidate) {
|
||||
out = append(out, append([]byte(nil), candidate...))
|
||||
}
|
||||
}
|
||||
start = -1
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func shouldSkipDir(name string) bool {
|
||||
switch strings.ToLower(name) {
|
||||
case "crashpad", "gpu-cache", "shadercache", "spellcheck", "dawncache":
|
||||
return true
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
func isCandidateFile(path string) bool {
|
||||
switch strings.ToLower(filepath.Ext(path)) {
|
||||
case ".ldb", ".log", ".json", ".txt":
|
||||
return true
|
||||
default:
|
||||
clean := filepath.ToSlash(path)
|
||||
return strings.Contains(clean, "/Cache/Cache_Data/") ||
|
||||
strings.Contains(clean, "/Service Worker/CacheStorage/") ||
|
||||
strings.Contains(clean, "/WebStorage/")
|
||||
}
|
||||
}
|
||||
|
||||
func parseAttachments(raw map[string]any, messageID, guildID, channelID, authorID string) []store.AttachmentRecord {
|
||||
items, _ := raw["attachments"].([]any)
|
||||
out := make([]store.AttachmentRecord, 0, len(items))
|
||||
for i, item := range items {
|
||||
attachment, _ := item.(map[string]any)
|
||||
if len(attachment) == 0 {
|
||||
continue
|
||||
}
|
||||
id := stringField(attachment, "id")
|
||||
if id == "" {
|
||||
id = fmt.Sprintf("%s:%d", messageID, i)
|
||||
}
|
||||
out = append(out, store.AttachmentRecord{
|
||||
AttachmentID: id,
|
||||
MessageID: messageID,
|
||||
GuildID: guildID,
|
||||
ChannelID: channelID,
|
||||
AuthorID: authorID,
|
||||
Filename: firstNonEmpty(stringField(attachment, "filename"), id),
|
||||
ContentType: stringField(attachment, "content_type"),
|
||||
Size: int64Field(attachment, "size"),
|
||||
URL: stringField(attachment, "url"),
|
||||
ProxyURL: stringField(attachment, "proxy_url"),
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func parseMentions(raw map[string]any, messageID, guildID, channelID, authorID string, eventAt time.Time) []store.MentionEventRecord {
|
||||
items, _ := raw["mentions"].([]any)
|
||||
out := make([]store.MentionEventRecord, 0, len(items))
|
||||
for _, item := range items {
|
||||
mention, _ := item.(map[string]any)
|
||||
id := stringField(mention, "id")
|
||||
if id == "" {
|
||||
continue
|
||||
}
|
||||
out = append(out, store.MentionEventRecord{
|
||||
MessageID: messageID,
|
||||
GuildID: guildID,
|
||||
ChannelID: channelID,
|
||||
AuthorID: authorID,
|
||||
TargetType: "user",
|
||||
TargetID: id,
|
||||
TargetName: firstNonEmpty(stringField(mention, "global_name"), stringField(mention, "username")),
|
||||
EventAt: eventAt.UTC().Format(time.RFC3339Nano),
|
||||
})
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func attachmentText(attachments []store.AttachmentRecord) []string {
|
||||
out := make([]string, 0, len(attachments))
|
||||
for _, attachment := range attachments {
|
||||
out = append(out, attachment.Filename)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func embedText(raw map[string]any) []string {
|
||||
items, _ := raw["embeds"].([]any)
|
||||
out := []string{}
|
||||
for _, item := range items {
|
||||
embed, _ := item.(map[string]any)
|
||||
for _, key := range []string{"title", "description"} {
|
||||
if value := strings.TrimSpace(stringField(embed, key)); value != "" {
|
||||
out = append(out, value)
|
||||
}
|
||||
}
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func normalizeText(parts ...any) string {
|
||||
flat := []string{}
|
||||
for _, part := range parts {
|
||||
switch typed := part.(type) {
|
||||
case string:
|
||||
if text := cleanText(typed); text != "" {
|
||||
flat = append(flat, text)
|
||||
}
|
||||
case []string:
|
||||
for _, item := range typed {
|
||||
if text := cleanText(item); text != "" {
|
||||
flat = append(flat, text)
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return strings.Join(flat, "\n")
|
||||
}
|
||||
|
||||
func cleanText(raw string) string {
|
||||
raw = strings.ToValidUTF8(raw, "")
|
||||
var b strings.Builder
|
||||
spacePending := false
|
||||
for _, r := range raw {
|
||||
switch {
|
||||
case r == '\u200b' || r == '\u200c' || r == '\u200d' || r == '\ufeff':
|
||||
continue
|
||||
case unicode.IsControl(r):
|
||||
continue
|
||||
case unicode.IsSpace(r):
|
||||
spacePending = b.Len() > 0
|
||||
default:
|
||||
if spacePending {
|
||||
b.WriteByte(' ')
|
||||
spacePending = false
|
||||
}
|
||||
b.WriteRune(r)
|
||||
}
|
||||
}
|
||||
return strings.TrimSpace(b.String())
|
||||
}
|
||||
|
||||
func messageReferenceID(raw map[string]any) string {
|
||||
ref, _ := raw["message_reference"].(map[string]any)
|
||||
return stringField(ref, "message_id")
|
||||
}
|
||||
|
||||
func syntheticGuild(id, name string) store.GuildRecord {
|
||||
raw, _ := json.Marshal(map[string]any{
|
||||
"id": id,
|
||||
"name": name,
|
||||
"source": "discord_desktop",
|
||||
})
|
||||
return store.GuildRecord{ID: id, Name: name, RawJSON: string(raw)}
|
||||
}
|
||||
|
||||
func syntheticChannel(id, guildID, name string) store.ChannelRecord {
|
||||
if name == "" {
|
||||
name = "channel-" + shortID(id)
|
||||
}
|
||||
raw, _ := json.Marshal(map[string]any{
|
||||
"id": id,
|
||||
"guild_id": guildID,
|
||||
"name": name,
|
||||
"source": "discord_desktop",
|
||||
})
|
||||
kind := "text"
|
||||
if guildID == DirectMessageGuildID {
|
||||
kind = "dm"
|
||||
if strings.Contains(name, ", ") {
|
||||
kind = "group_dm"
|
||||
}
|
||||
}
|
||||
return store.ChannelRecord{ID: id, GuildID: guildID, Kind: kind, Name: name, RawJSON: string(raw)}
|
||||
}
|
||||
|
||||
func guildName(id string) string {
|
||||
switch id {
|
||||
case DirectMessageGuildID:
|
||||
return DirectMessageGuildName
|
||||
default:
|
||||
return "Discord Desktop Guild " + id
|
||||
}
|
||||
}
|
||||
|
||||
func kindForChannelType(typeValue int, dm bool) string {
|
||||
if dm {
|
||||
if typeValue == 3 {
|
||||
return "group_dm"
|
||||
}
|
||||
return "dm"
|
||||
}
|
||||
switch typeValue {
|
||||
case 0:
|
||||
return "text"
|
||||
case 5:
|
||||
return "announcement"
|
||||
case 10:
|
||||
return "thread_announcement"
|
||||
case 11:
|
||||
return "thread_public"
|
||||
case 12:
|
||||
return "thread_private"
|
||||
case 15:
|
||||
return "forum"
|
||||
default:
|
||||
return "desktop"
|
||||
}
|
||||
}
|
||||
|
||||
func channelRawJSON(raw map[string]any, id, guildID, name, kind string) string {
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"id": id,
|
||||
"guild_id": guildID,
|
||||
"name": name,
|
||||
"kind": kind,
|
||||
"source": "discord_desktop",
|
||||
"type": raw["type"],
|
||||
})
|
||||
return string(body)
|
||||
}
|
||||
|
||||
func messageRawJSON(raw map[string]any, id, guildID, channelID, authorID string) string {
|
||||
body, _ := json.Marshal(map[string]any{
|
||||
"id": id,
|
||||
"guild_id": guildID,
|
||||
"channel_id": channelID,
|
||||
"author_id": authorID,
|
||||
"source": "discord_desktop",
|
||||
"type": raw["type"],
|
||||
"timestamp": raw["timestamp"],
|
||||
"edited_timestamp": raw["edited_timestamp"],
|
||||
"message_reference": raw["message_reference"],
|
||||
"attachment_count": lenArray(raw["attachments"]),
|
||||
"mention_count": lenArray(raw["mentions"]),
|
||||
"desktop_cache_note": "raw desktop cache payload intentionally not stored",
|
||||
})
|
||||
return string(body)
|
||||
}
|
||||
|
||||
func recipientLabel(items []any) string {
|
||||
names := []string{}
|
||||
for _, item := range items {
|
||||
recipient, _ := item.(map[string]any)
|
||||
name := firstNonEmpty(
|
||||
stringField(recipient, "global_name"),
|
||||
stringField(recipient, "display_name"),
|
||||
stringField(recipient, "username"),
|
||||
)
|
||||
if name != "" {
|
||||
names = append(names, name)
|
||||
}
|
||||
}
|
||||
sort.Strings(names)
|
||||
return strings.Join(names, ", ")
|
||||
}
|
||||
|
||||
func parseDiscordTime(raw string) time.Time {
|
||||
raw = strings.TrimSpace(raw)
|
||||
if raw == "" || raw == "null" {
|
||||
return time.Time{}
|
||||
}
|
||||
if t, err := time.Parse(time.RFC3339Nano, raw); err == nil {
|
||||
return t.UTC()
|
||||
}
|
||||
if t, err := time.Parse(time.RFC3339, raw); err == nil {
|
||||
return t.UTC()
|
||||
}
|
||||
return time.Time{}
|
||||
}
|
||||
|
||||
func snowflakeTime(id string) time.Time {
|
||||
value, err := strconv.ParseUint(id, 10, 64)
|
||||
if err != nil {
|
||||
return time.Time{}
|
||||
}
|
||||
ms := int64((value >> 22) + 1420070400000)
|
||||
return time.UnixMilli(ms).UTC()
|
||||
}
|
||||
|
||||
func formatOptionalTime(t time.Time) string {
|
||||
if t.IsZero() {
|
||||
return ""
|
||||
}
|
||||
return t.UTC().Format(time.RFC3339Nano)
|
||||
}
|
||||
|
||||
func looksSnowflake(value string) bool {
|
||||
if len(value) < 12 || len(value) > 24 {
|
||||
return false
|
||||
}
|
||||
for _, r := range value {
|
||||
if r < '0' || r > '9' {
|
||||
return false
|
||||
}
|
||||
}
|
||||
return true
|
||||
}
|
||||
|
||||
func shortID(id string) string {
|
||||
if len(id) <= 6 {
|
||||
return id
|
||||
}
|
||||
return id[len(id)-6:]
|
||||
}
|
||||
|
||||
func stringField(raw map[string]any, key string) string {
|
||||
value, ok := raw[key]
|
||||
if !ok || value == nil {
|
||||
return ""
|
||||
}
|
||||
switch typed := value.(type) {
|
||||
case string:
|
||||
return strings.TrimSpace(typed)
|
||||
case json.Number:
|
||||
return typed.String()
|
||||
default:
|
||||
return ""
|
||||
}
|
||||
}
|
||||
|
||||
func intField(raw map[string]any, key string) (int, bool) {
|
||||
value, ok := raw[key]
|
||||
if !ok || value == nil {
|
||||
return 0, false
|
||||
}
|
||||
switch typed := value.(type) {
|
||||
case float64:
|
||||
return int(typed), true
|
||||
case int:
|
||||
return typed, true
|
||||
case json.Number:
|
||||
i, err := typed.Int64()
|
||||
return int(i), err == nil
|
||||
default:
|
||||
return 0, false
|
||||
}
|
||||
}
|
||||
|
||||
func int64Field(raw map[string]any, key string) int64 {
|
||||
value, ok := raw[key]
|
||||
if !ok || value == nil {
|
||||
return 0
|
||||
}
|
||||
switch typed := value.(type) {
|
||||
case float64:
|
||||
return int64(typed)
|
||||
case int64:
|
||||
return typed
|
||||
case int:
|
||||
return int64(typed)
|
||||
case json.Number:
|
||||
i, _ := typed.Int64()
|
||||
return i
|
||||
default:
|
||||
return 0
|
||||
}
|
||||
}
|
||||
|
||||
func boolField(raw map[string]any, key string) bool {
|
||||
value, _ := raw[key].(bool)
|
||||
return value
|
||||
}
|
||||
|
||||
func lenArray(value any) int {
|
||||
items, _ := value.([]any)
|
||||
return len(items)
|
||||
}
|
||||
|
||||
func firstNonEmpty(items ...string) string {
|
||||
for _, item := range items {
|
||||
if strings.TrimSpace(item) != "" {
|
||||
return strings.TrimSpace(item)
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func mapValues[M ~map[string]T, T any](m M) []T {
|
||||
out := make([]T, 0, len(m))
|
||||
for _, value := range m {
|
||||
out = append(out, value)
|
||||
}
|
||||
return out
|
||||
}
|
||||
218
internal/discorddesktop/import_test.go
Normal file
218
internal/discorddesktop/import_test.go
Normal file
@ -0,0 +1,218 @@
|
||||
package discorddesktop
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"compress/gzip"
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestImportExtractsDirectMessageFromDesktopCache(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Local Storage", "leveldb")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "000001.log"), []byte(`noise
|
||||
{"id":"111111111111111111","type":1,"recipients":[{"id":"222222222222222222","username":"alice","global_name":"Alice"}]}
|
||||
binary-ish {"t":"MESSAGE_CREATE","token":"do-not-store","d":{"id":"333333333333333333","channel_id":"111111111111111111","content":"launch checklist in a DM","timestamp":"2026-04-23T18:20:43.123Z","author":{"id":"222222222222222222","username":"alice","global_name":"Alice"},"attachments":[{"id":"444444444444444444","filename":"plan.txt","size":10}],"mentions":[{"id":"555555555555555555","username":"bob"}]}} tail
|
||||
`), 0o600))
|
||||
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
st, err := store.Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{
|
||||
Path: dir,
|
||||
Now: func() time.Time { return time.Date(2026, 4, 23, 18, 30, 0, 0, time.UTC) },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
require.Equal(t, 1, stats.Channels)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "launch", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, results, 1)
|
||||
require.Equal(t, DirectMessageGuildID, results[0].GuildID)
|
||||
require.Equal(t, "Alice", results[0].ChannelName)
|
||||
require.Equal(t, "Alice", results[0].AuthorName)
|
||||
|
||||
mentions, err := st.ListMentions(ctx, store.MentionListOptions{
|
||||
GuildIDs: []string{DirectMessageGuildID},
|
||||
Target: "bob",
|
||||
Limit: 10,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, mentions, 1)
|
||||
require.Equal(t, "555555555555555555", mentions[0].TargetID)
|
||||
|
||||
_, rows, err := st.ReadOnlyQuery(ctx, "select raw_json from messages where id = '333333333333333333'")
|
||||
require.NoError(t, err)
|
||||
require.Len(t, rows, 1)
|
||||
require.NotContains(t, rows[0][0], "do-not-store")
|
||||
}
|
||||
|
||||
func TestImportDryRunDoesNotWrite(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, "000001.log"), []byte(`{"id":"333333333333333333","channel_id":"111111111111111111","content":"dry run only","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222222","username":"alice"}}`), 0o600))
|
||||
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
st, err := store.Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir, DryRun: true})
|
||||
require.NoError(t, err)
|
||||
require.True(t, stats.DryRun)
|
||||
require.Equal(t, 0, stats.Messages)
|
||||
require.Equal(t, 1, stats.SkippedMessages)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "dry", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, results)
|
||||
}
|
||||
|
||||
func TestImportExtractsCompressedUnknownMessageArrayFromChromiumCache(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
|
||||
var compressed bytes.Buffer
|
||||
zw := gzip.NewWriter(&compressed)
|
||||
_, err := zw.Write([]byte(`[{"id":"333333333333333334","channel_id":"111111111111111112","content":"compressed cache history","timestamp":"2026-04-23T18:20:43.123Z","author":{"id":"222222222222222223","username":"alice"}}]`))
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, zw.Close())
|
||||
|
||||
cacheBlob := append([]byte("https://discord.com/api/v9/channels/111111111111111112/messages?limit=50\x00"), compressed.Bytes()...)
|
||||
cacheBlob = append(cacheBlob, []byte("chromium trailing metadata")...)
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_0"), cacheBlob, 0o600))
|
||||
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
st, err := store.Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.FilesScanned)
|
||||
require.Equal(t, 0, stats.Messages)
|
||||
require.Equal(t, 0, stats.DMMessages)
|
||||
require.Equal(t, 1, stats.SkippedMessages)
|
||||
require.Equal(t, 1, stats.SkippedChannels)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "compressed", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, results)
|
||||
}
|
||||
|
||||
func TestImportReconcilesMessagesWithLaterGuildChannelMetadata(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Local Storage", "leveldb")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "000001.log"), []byte(`{"id":"333333333333333335","channel_id":"111111111111111113","content":"guild cache message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222224","username":"alice"},"mentions":[{"id":"555555555555555556","username":"bob"}],"attachments":[{"id":"444444444444444445","filename":"trace.txt"}]}`), 0o600))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "000002.log"), []byte(`{"id":"111111111111111113","guild_id":"999999999999999999","type":0,"name":"backend"}`), 0o600))
|
||||
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
st, err := store.Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "guild cache", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, results, 1)
|
||||
require.Equal(t, "999999999999999999", results[0].GuildID)
|
||||
require.Equal(t, "backend", results[0].ChannelName)
|
||||
|
||||
mentions, err := st.ListMentions(ctx, store.MentionListOptions{
|
||||
GuildIDs: []string{"999999999999999999"},
|
||||
Target: "bob",
|
||||
Limit: 10,
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, mentions, 1)
|
||||
|
||||
_, rows, err := st.ReadOnlyQuery(ctx, "select guild_id from message_attachments where message_id = '333333333333333335'")
|
||||
require.NoError(t, err)
|
||||
require.Len(t, rows, 1)
|
||||
require.Equal(t, "999999999999999999", rows[0][0])
|
||||
}
|
||||
|
||||
func TestImportClassifiesMessagesFromCachedChannelRoutes(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "dm_0"), []byte(`https://discord.com/channels/@me/111111111111111114
|
||||
{"id":"333333333333333336","channel_id":"111111111111111114","content":"route dm message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222225","username":"alice"}}`), 0o600))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "guild_0"), []byte(`https://discord.com/channels/999999999999999998/111111111111111115
|
||||
{"id":"333333333333333337","channel_id":"111111111111111115","content":"route guild message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222226","username":"bob"}}`), 0o600))
|
||||
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
st, err := store.Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 2, stats.Messages)
|
||||
require.Equal(t, 1, stats.DMMessages)
|
||||
require.Equal(t, 1, stats.GuildMessages)
|
||||
require.Equal(t, 0, stats.SkippedMessages)
|
||||
|
||||
dmResults, err := st.SearchMessages(ctx, store.SearchOptions{Query: "route dm", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, dmResults, 1)
|
||||
require.Equal(t, DirectMessageGuildID, dmResults[0].GuildID)
|
||||
|
||||
guildResults, err := st.SearchMessages(ctx, store.SearchOptions{Query: "route guild", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, guildResults, 1)
|
||||
require.Equal(t, "999999999999999998", guildResults[0].GuildID)
|
||||
}
|
||||
|
||||
func TestImportDropsPreviousUnknownWiretapRows(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
st, err := store.Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
require.NoError(t, st.UpsertGuild(ctx, store.GuildRecord{ID: "@unknown", Name: "Unknown", RawJSON: `{}`}))
|
||||
require.NoError(t, st.UpsertChannel(ctx, store.ChannelRecord{ID: "111111111111111116", GuildID: "@unknown", Kind: "unknown", Name: "unknown", RawJSON: `{}`}))
|
||||
require.NoError(t, st.UpsertMessage(ctx, store.MessageRecord{
|
||||
ID: "333333333333333338",
|
||||
GuildID: "@unknown",
|
||||
ChannelID: "111111111111111116",
|
||||
AuthorID: "222222222222222227",
|
||||
AuthorName: "alice",
|
||||
MessageType: 0,
|
||||
CreatedAt: "2026-04-23T18:20:43Z",
|
||||
Content: "stale unknown message",
|
||||
NormalizedContent: "stale unknown message",
|
||||
RawJSON: `{}`,
|
||||
}))
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.Messages)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "stale unknown", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, results)
|
||||
}
|
||||
@ -218,6 +218,45 @@ func (s *Store) UpsertMember(ctx context.Context, member MemberRecord) error {
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
func (s *Store) DeleteGuildData(ctx context.Context, guildID string) error {
|
||||
tx, err := s.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer rollback(tx)
|
||||
for _, stmt := range []string{
|
||||
`delete from embedding_jobs where message_id in (select id from messages where guild_id = ?)`,
|
||||
`delete from message_embeddings where message_id in (select id from messages where guild_id = ?)`,
|
||||
`delete from message_fts where guild_id = ?`,
|
||||
`delete from message_events where guild_id = ?`,
|
||||
`delete from message_attachments where guild_id = ?`,
|
||||
`delete from mention_events where guild_id = ?`,
|
||||
`delete from messages where guild_id = ?`,
|
||||
`delete from member_fts where guild_id = ?`,
|
||||
`delete from members where guild_id = ?`,
|
||||
`delete from channels where guild_id = ?`,
|
||||
`delete from guilds where id = ?`,
|
||||
} {
|
||||
if _, err := tx.ExecContext(ctx, stmt, guildID); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return tx.Commit()
|
||||
}
|
||||
|
||||
func (s *Store) DeleteOrphanChannels(ctx context.Context, guildID string) error {
|
||||
_, err := s.db.ExecContext(ctx, `
|
||||
delete from channels
|
||||
where guild_id = ?
|
||||
and not exists (
|
||||
select 1
|
||||
from messages
|
||||
where messages.channel_id = channels.id
|
||||
)
|
||||
`, guildID)
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *Store) DeleteMember(ctx context.Context, guildID, userID string) error {
|
||||
tx, err := s.db.BeginTx(ctx, nil)
|
||||
if err != nil {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user