From c8986cbcf9c4a9f1bcbc4416df2b7af809e96ac0 Mon Sep 17 00:00:00 2001 From: Peter Steinberger Date: Sun, 26 Apr 2026 01:24:21 +0100 Subject: [PATCH] feat: import twitter archives --- CHANGELOG.md | 6 + README.md | 18 ++ internal/cli/cli.go | 2 + internal/cli/cli_test.go | 44 +++ internal/cli/output.go | 7 + internal/cli/twitter_commands.go | 44 +++ internal/twitterarchive/helpers.go | 235 +++++++++++++++ internal/twitterarchive/import.go | 396 +++++++++++++++++++++++++ internal/twitterarchive/import_test.go | 121 ++++++++ 9 files changed, 873 insertions(+) create mode 100644 internal/cli/twitter_commands.go create mode 100644 internal/twitterarchive/helpers.go create mode 100644 internal/twitterarchive/import.go create mode 100644 internal/twitterarchive/import_test.go diff --git a/CHANGELOG.md b/CHANGELOG.md index 7537f03..03e1a1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,12 @@ All notable changes to `discrawl` will be documented in this file. +## Unreleased + +### Changes + +- Added `twitter import` / `x import` for local X/Twitter archive `.zip` files, storing tweets, likes, and direct messages under synthetic guild id `x` so existing Discrawl search and SQL commands can query them. + ## 0.6.1 - 2026-04-25 ### Maintenance diff --git a/README.md b/README.md index 8481e6f..d2ba00b 100644 --- a/README.md +++ b/README.md @@ -268,6 +268,24 @@ Notes: - does not extract, store, or print Discord auth tokens - `--max-file-bytes` skips unusually large files; default is 64 MiB +### `twitter import` + +Imports a local X/Twitter archive `.zip` into the same SQLite message/search tables. + +```bash +discrawl twitter import --archive ~/Downloads/twitter-2025-08-05.zip +discrawl x import --archive ~/Downloads/twitter-2025-08-05.zip --dry-run +discrawl search --guild x "launch checklist" +``` + +Notes: + +- stores imported tweets, likes, and direct messages under synthetic guild id `x` +- maps tweets to `x:tweets`, liked tweet text to `x:likes`, and each DM conversation to an `x:dm:*` channel +- uses the archive's JS assignment files such as `data/account.js`, `data/tweets*.js`, `data/like.js`, `data/direct-messages.js`, and `data/direct-messages-group.js` +- prefixes imported message ids with `x:tweet:`, `x:like:`, or `x:dm:` so they cannot collide with Discord snowflakes +- stamps `twitter:last_import` and `twitter:last_archive` in `sync_state` + ### `search` Searches archived messages. FTS is the default mode and works without embeddings. diff --git a/internal/cli/cli.go b/internal/cli/cli.go index 2ec8263..95e57ff 100644 --- a/internal/cli/cli.go +++ b/internal/cli/cli.go @@ -128,6 +128,8 @@ func (r *runtime) dispatch(rest []string) error { return r.withServices(true, func() error { return r.runTail(rest[1:]) }) case "wiretap": return r.withLocalStoreDefault(false, func() error { return r.runWiretap(rest[1:]) }) + case "twitter", "x": + return r.withLocalStoreDefault(false, func() error { return r.runTwitter(rest[1:]) }) case "search": autoShareUpdate := !hasBoolFlag(rest[1:], "--dm") return r.withLocalStoreDefault(autoShareUpdate, func() error { return r.runSearch(rest[1:]) }) diff --git a/internal/cli/cli_test.go b/internal/cli/cli_test.go index 0ec4de9..cb27117 100644 --- a/internal/cli/cli_test.go +++ b/internal/cli/cli_test.go @@ -1,6 +1,7 @@ package cli import ( + "archive/zip" "bytes" "context" "encoding/json" @@ -182,6 +183,49 @@ func TestWiretapImportsDesktopDirectMessages(t *testing.T) { require.Contains(t, out.String(), "secret DM launch plan") } +func TestTwitterImportCommand(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + cfgPath := filepath.Join(dir, "config.toml") + dbPath := filepath.Join(dir, "discrawl.db") + archivePath := filepath.Join(dir, "twitter.zip") + writeTwitterArchiveFixture(t, archivePath) + + cfg := config.Default() + cfg.DBPath = dbPath + cfg.Discord.TokenSource = "none" + require.NoError(t, config.Write(cfgPath, cfg)) + + var out bytes.Buffer + require.NoError(t, Run(ctx, []string{"--config", cfgPath, "twitter", "import", "--archive", archivePath}, &out, &bytes.Buffer{})) + require.Contains(t, out.String(), "tweets=1") + require.Contains(t, out.String(), "dm_messages=1") + + out.Reset() + require.NoError(t, Run(ctx, []string{"--config", cfgPath, "search", "--guild", "x", "secret roadmap"}, &out, &bytes.Buffer{})) + require.Contains(t, out.String(), "secret roadmap") +} + +func writeTwitterArchiveFixture(t *testing.T, path string) { + t.Helper() + file, err := os.Create(path) + require.NoError(t, err) + defer func() { _ = file.Close() }() + zw := zip.NewWriter(file) + defer func() { require.NoError(t, zw.Close()) }() + writeTwitterZipEntry(t, zw, "data/account.js", `window.YTD.account.part0 = [{"account":{"username":"steipete","accountId":"25401953","accountDisplayName":"Peter Steinberger"}}]`) + writeTwitterZipEntry(t, zw, "data/tweets.js", `window.YTD.tweets.part0 = [{"tweet":{"id_str":"1952542067017584782","created_at":"Tue Aug 05 01:27:59 +0000 2025","full_text":"archive tweet search text","entities":{"user_mentions":[]}}}]`) + writeTwitterZipEntry(t, zw, "data/direct-messages.js", `window.YTD.direct_messages.part0 = [{"dmConversation":{"conversationId":"929-25401953","messages":[{"messageCreate":{"recipientId":"929","senderId":"25401953","id":"1052590933307461636","createdAt":"2018-10-17T16:03:29.391Z","text":"secret roadmap","mediaUrls":[]}}]}}]`) +} + +func writeTwitterZipEntry(t *testing.T, zw *zip.Writer, name, body string) { + t.Helper() + w, err := zw.Create(name) + require.NoError(t, err) + _, err = w.Write([]byte(body)) + require.NoError(t, err) +} + func TestParseMessageWindow(t *testing.T) { rt := &runtime{now: func() time.Time { return time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) diff --git a/internal/cli/output.go b/internal/cli/output.go index 92d6e0c..21b7c83 100644 --- a/internal/cli/output.go +++ b/internal/cli/output.go @@ -13,6 +13,7 @@ import ( "github.com/steipete/discrawl/internal/discorddesktop" "github.com/steipete/discrawl/internal/store" "github.com/steipete/discrawl/internal/syncer" + "github.com/steipete/discrawl/internal/twitterarchive" ) func (r *runtime) print(value any) error { @@ -85,6 +86,8 @@ Commands: sync tail wiretap + twitter + x search messages dms @@ -134,6 +137,10 @@ func printHuman(w io.Writer, value any) error { _, err := fmt.Fprintf(w, "path=%s\nfiles=%d\nskipped=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ndry_run=%t\n", v.Path, v.FilesScanned, v.FilesSkipped, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.DryRun) return err + case twitterarchive.Stats: + _, err := fmt.Fprintf(w, "path=%s\nfiles=%d\naccounts=%d\ntweets=%d\nlikes=%d\ndm_conversations=%d\ndm_messages=%d\nskipped=%d\ndry_run=%t\n", + v.Path, v.FilesScanned, v.Accounts, v.Tweets, v.Likes, v.DMConversations, v.DMMessages, v.Skipped, v.DryRun) + return err case store.Status: _, err := fmt.Fprintf(w, "db=%s\nguilds=%d\nchannels=%d\nthreads=%d\nmessages=%d\nmembers=%d\nembedding_backlog=%d\nlast_sync=%s\nlast_tail_event=%s\n", v.DBPath, v.GuildCount, v.ChannelCount, v.ThreadCount, v.MessageCount, v.MemberCount, v.EmbeddingBacklog, diff --git a/internal/cli/twitter_commands.go b/internal/cli/twitter_commands.go new file mode 100644 index 0000000..e176586 --- /dev/null +++ b/internal/cli/twitter_commands.go @@ -0,0 +1,44 @@ +package cli + +import ( + "flag" + "fmt" + "io" + + "github.com/steipete/discrawl/internal/twitterarchive" +) + +func (r *runtime) runTwitter(args []string) error { + if len(args) == 0 { + return usageErr(fmt.Errorf("twitter requires subcommand: import")) + } + switch args[0] { + case "import": + return r.runTwitterImport(args[1:]) + default: + return usageErr(fmt.Errorf("unknown twitter subcommand %q", args[0])) + } +} + +func (r *runtime) runTwitterImport(args []string) error { + fs := flag.NewFlagSet("twitter import", flag.ContinueOnError) + fs.SetOutput(io.Discard) + archivePath := fs.String("archive", "", "") + fs.StringVar(archivePath, "path", "", "") + dryRun := fs.Bool("dry-run", false, "") + if err := fs.Parse(args); err != nil { + return usageErr(err) + } + if fs.NArg() > 0 { + return usageErr(fmt.Errorf("twitter import takes flags only")) + } + stats, err := twitterarchive.Import(r.ctx, r.store, twitterarchive.Options{ + Path: *archivePath, + DryRun: *dryRun, + Now: r.now, + }) + if err != nil { + return err + } + return r.print(stats) +} diff --git a/internal/twitterarchive/helpers.go b/internal/twitterarchive/helpers.go new file mode 100644 index 0000000..f3dd16b --- /dev/null +++ b/internal/twitterarchive/helpers.go @@ -0,0 +1,235 @@ +package twitterarchive + +import ( + "archive/zip" + "encoding/json" + "errors" + "fmt" + "io" + "path/filepath" + "sort" + "strconv" + "strings" + "time" + + "github.com/steipete/discrawl/internal/store" +) + +func classify(path string) string { + normalized := filepath.ToSlash(path) + base := filepath.Base(normalized) + switch { + case strings.HasSuffix(normalized, "/data/account.js") || normalized == "data/account.js": + return "account" + case (base == "tweets.js" || strings.HasPrefix(base, "tweets-part")) && strings.HasSuffix(base, ".js"): + return "tweets" + case base == "like.js" || base == "likes.js": + return "likes" + case (base == "direct-messages.js" || base == "direct-messages-group.js") && strings.HasSuffix(base, ".js"): + return "dms" + default: + return "" + } +} + +func readZipText(file *zip.File) (string, error) { + rc, err := file.Open() + if err != nil { + return "", err + } + defer func() { _ = rc.Close() }() + var b strings.Builder + if _, err := io.Copy(&b, rc); err != nil { + return "", err + } + return b.String(), nil +} + +func parseArchiveArray(content string) ([]map[string]any, error) { + idx := strings.Index(content, "=") + if idx < 0 { + return nil, errors.New("missing assignment") + } + payload := strings.TrimSuffix(strings.TrimSpace(content[idx+1:]), ";") + dec := json.NewDecoder(strings.NewReader(payload)) + dec.UseNumber() + var records []map[string]any + if err := dec.Decode(&records); err != nil { + return nil, err + } + return records, nil +} + +func dmAttachments(messageID, channelID, authorID string, raw map[string]any) []store.AttachmentRecord { + urls := stringSlice(raw["mediaUrls"]) + attachments := make([]store.AttachmentRecord, 0, len(urls)) + for i, url := range urls { + attachments = append(attachments, store.AttachmentRecord{ + AttachmentID: fmt.Sprintf("x:dm:%s:media:%d", messageID, i), + MessageID: "x:dm:" + messageID, + GuildID: GuildID, + ChannelID: channelID, + AuthorID: authorID, + Filename: filepath.Base(url), + URL: url, + }) + } + return attachments +} + +func participantsForMessage(conversationID, senderID, recipientID string) []string { + set := map[string]struct{}{} + for _, value := range []string{senderID, recipientID} { + if value != "" { + set[value] = struct{}{} + } + } + for _, part := range strings.Split(conversationID, "-") { + if part != "" { + set[part] = struct{}{} + } + } + out := make([]string, 0, len(set)) + for id := range set { + out = append(out, id) + } + sort.Strings(out) + return out +} + +func conversationName(conversationID, selfID string) string { + parts := strings.Split(conversationID, "-") + if len(parts) == 2 { + for _, part := range parts { + if part != "" && part != selfID { + return "dm-" + shortID(part) + } + } + } + return "group-" + shortID(conversationID) +} + +func authorLabel(id string, acc account) string { + if id == acc.ID && acc.Username != "" { + return acc.Username + } + return "user-" + shortID(id) +} + +func parseTime(value string) time.Time { + value = strings.TrimSpace(value) + for _, layout := range []string{time.RFC3339Nano, time.RFC3339, "Mon Jan 02 15:04:05 -0700 2006"} { + if t, err := time.Parse(layout, value); err == nil { + return t + } + } + return time.Time{} +} + +func twitterSnowflakeTime(id string) time.Time { + n, err := strconv.ParseInt(id, 10, 64) + if err != nil || n <= 0 { + return time.Time{} + } + const twitterEpochMs = int64(1288834974657) + ms := (n >> 22) + twitterEpochMs + return time.UnixMilli(ms).UTC() +} + +func sortedFiles(files []*zip.File) []*zip.File { + out := append([]*zip.File(nil), files...) + sort.Slice(out, func(i, j int) bool { return out[i].Name < out[j].Name }) + return out +} + +func sortedChannels(channels map[string]store.ChannelRecord) []store.ChannelRecord { + out := make([]store.ChannelRecord, 0, len(channels)) + for _, ch := range channels { + out = append(out, ch) + } + sort.Slice(out, func(i, j int) bool { return out[i].ID < out[j].ID }) + return out +} + +func sortedMembers(members map[string]store.MemberRecord) []store.MemberRecord { + out := make([]store.MemberRecord, 0, len(members)) + for _, member := range members { + out = append(out, member) + } + sort.Slice(out, func(i, j int) bool { return out[i].UserID < out[j].UserID }) + return out +} + +func memberKey(guildID, userID string) string { return guildID + "\x00" + userID } + +func prefixedTweetID(id string) string { + if id == "" { + return "" + } + return "x:tweet:" + id +} + +func shortID(id string) string { + if len(id) <= 8 { + return id + } + return id[len(id)-8:] +} + +func rawJSON(value any) string { + data, err := json.Marshal(value) + if err != nil { + return `{}` + } + return string(data) +} + +func stringValue(value any) string { + switch typed := value.(type) { + case string: + return strings.TrimSpace(typed) + case json.Number: + return typed.String() + case float64: + return strconv.FormatInt(int64(typed), 10) + default: + return "" + } +} + +func firstString(values ...any) string { + for _, value := range values { + if s := stringValue(value); s != "" { + return s + } + } + return "" +} + +func firstAny(values ...any) any { + for _, value := range values { + if stringValue(value) != "" { + return value + } + } + return nil +} + +func nestedString(raw map[string]any, key, nested string) string { + child, _ := raw[key].(map[string]any) + return stringValue(child[nested]) +} + +func stringSlice(value any) []string { + raw, ok := value.([]any) + if !ok { + return nil + } + out := make([]string, 0, len(raw)) + for _, item := range raw { + if s := stringValue(item); s != "" { + out = append(out, s) + } + } + return out +} diff --git a/internal/twitterarchive/import.go b/internal/twitterarchive/import.go new file mode 100644 index 0000000..996af23 --- /dev/null +++ b/internal/twitterarchive/import.go @@ -0,0 +1,396 @@ +package twitterarchive + +import ( + "archive/zip" + "context" + "errors" + "fmt" + "html" + "strings" + "time" + + "github.com/steipete/discrawl/internal/store" +) + +const ( + GuildID = "x" + GuildName = "X / Twitter Archive" + + tweetsChannelID = "x:tweets" + likesChannelID = "x:likes" + syncScope = "twitter:last_import" + archiveScope = "twitter:last_archive" +) + +type Options struct { + Path string + DryRun bool + Now func() time.Time +} + +type Stats struct { + Path string `json:"path"` + FilesScanned int `json:"files_scanned"` + Accounts int `json:"accounts"` + Tweets int `json:"tweets"` + Likes int `json:"likes"` + DMConversations int `json:"dm_conversations"` + DMMessages int `json:"dm_messages"` + Skipped int `json:"skipped"` + DryRun bool `json:"dry_run,omitempty"` + StartedAt time.Time `json:"started_at"` + FinishedAt time.Time `json:"finished_at"` +} + +type account struct { + ID string + Username string + DisplayName string + Email string + CreatedAt string + Raw map[string]any +} + +type importSnapshot struct { + account account + channels map[string]store.ChannelRecord + members map[string]store.MemberRecord + messages []store.MessageMutation + conversations map[string]struct{} +} + +func Import(ctx context.Context, st *store.Store, opts Options) (Stats, error) { + if st == nil && !opts.DryRun { + return Stats{}, errors.New("store is required") + } + if strings.TrimSpace(opts.Path) == "" { + return Stats{}, errors.New("archive path is required") + } + now := opts.Now + if now == nil { + now = time.Now + } + stats := Stats{Path: opts.Path, DryRun: opts.DryRun, StartedAt: now().UTC()} + reader, err := zip.OpenReader(opts.Path) + if err != nil { + stats.FinishedAt = now().UTC() + return stats, fmt.Errorf("open twitter archive: %w", err) + } + defer func() { _ = reader.Close() }() + + snap := importSnapshot{ + channels: map[string]store.ChannelRecord{}, + members: map[string]store.MemberRecord{}, + conversations: map[string]struct{}{}, + } + snap.channels[tweetsChannelID] = channel(tweetsChannelID, "tweets", "tweet") + snap.channels[likesChannelID] = channel(likesChannelID, "likes", "like") + + for _, file := range sortedFiles(reader.File) { + if ctx.Err() != nil { + return stats, ctx.Err() + } + kind := classify(file.Name) + if kind == "" { + continue + } + content, err := readZipText(file) + if err != nil { + stats.Skipped++ + continue + } + records, err := parseArchiveArray(content) + if err != nil { + stats.Skipped++ + continue + } + stats.FilesScanned++ + switch kind { + case "account": + if parseAccount(records, &snap) { + stats.Accounts = 1 + } + case "tweets": + stats.Tweets += parseTweets(records, &snap) + case "likes": + stats.Likes += parseLikes(records, &snap, now().UTC()) + case "dms": + conversations, messages := parseDMs(records, &snap) + stats.DMConversations += conversations + stats.DMMessages += messages + } + } + if snap.account.ID != "" { + snap.members[memberKey(GuildID, snap.account.ID)] = memberFromAccount(snap.account) + } + for i := range snap.messages { + if snap.messages[i].Record.AuthorID == "" && snap.account.ID != "" { + snap.messages[i].Record.AuthorID = snap.account.ID + snap.messages[i].Record.AuthorName = snap.account.Username + } + } + if !opts.DryRun { + if err := writeSnapshot(ctx, st, snap, opts.Path); err != nil { + return stats, err + } + } + stats.FinishedAt = now().UTC() + return stats, nil +} + +func writeSnapshot(ctx context.Context, st *store.Store, snap importSnapshot, path string) error { + if err := st.UpsertGuild(ctx, store.GuildRecord{ID: GuildID, Name: GuildName, RawJSON: rawJSON(map[string]any{"platform": "x"})}); err != nil { + return err + } + for _, member := range sortedMembers(snap.members) { + if err := st.UpsertMember(ctx, member); err != nil { + return err + } + } + for _, ch := range sortedChannels(snap.channels) { + if err := st.UpsertChannel(ctx, ch); err != nil { + return err + } + } + const chunkSize = 1000 + for start := 0; start < len(snap.messages); start += chunkSize { + end := start + chunkSize + if end > len(snap.messages) { + end = len(snap.messages) + } + if err := st.UpsertMessages(ctx, snap.messages[start:end]); err != nil { + return err + } + } + if err := st.SetSyncState(ctx, syncScope, time.Now().UTC().Format(time.RFC3339Nano)); err != nil { + return err + } + return st.SetSyncState(ctx, archiveScope, path) +} + +func parseAccount(records []map[string]any, snap *importSnapshot) bool { + for _, record := range records { + raw, ok := record["account"].(map[string]any) + if !ok { + continue + } + acc := account{ + ID: stringValue(raw["accountId"]), + Username: stringValue(raw["username"]), + DisplayName: firstString(raw["accountDisplayName"], raw["name"]), + Email: stringValue(raw["email"]), + CreatedAt: stringValue(raw["createdAt"]), + Raw: raw, + } + if acc.ID == "" || acc.Username == "" { + continue + } + snap.account = acc + snap.members[memberKey(GuildID, acc.ID)] = memberFromAccount(acc) + return true + } + return false +} + +func parseTweets(records []map[string]any, snap *importSnapshot) int { + count := 0 + for _, record := range records { + raw, ok := record["tweet"].(map[string]any) + if !ok { + continue + } + id := firstString(raw["id_str"], raw["id"]) + text := html.UnescapeString(stringValue(raw["full_text"])) + createdAt := parseTime(firstString(raw["created_at"], raw["createdAt"])) + if id == "" || text == "" || createdAt.IsZero() { + continue + } + authorID := firstString(raw["author_id"], raw["user_id_str"], nestedString(raw, "user", "id_str"), snap.account.ID) + authorName := snap.account.Username + msg := store.MessageMutation{ + Record: store.MessageRecord{ + ID: "x:tweet:" + id, + GuildID: GuildID, + ChannelID: tweetsChannelID, + ChannelName: "tweets", + AuthorID: authorID, + AuthorName: authorName, + MessageType: 0, + CreatedAt: createdAt.UTC().Format(time.RFC3339Nano), + Content: text, + NormalizedContent: text, + ReplyToMessageID: prefixedTweetID(firstString(raw["in_reply_to_status_id_str"], raw["in_reply_to_status_id"])), + HasAttachments: hasMedia(raw), + RawJSON: rawJSON(map[string]any{"platform": "x", "type": "tweet", "tweet": raw}), + }, + Mentions: tweetMentions(id, authorID, createdAt, raw), + } + snap.messages = append(snap.messages, msg) + count++ + } + return count +} + +func parseLikes(records []map[string]any, snap *importSnapshot, fallback time.Time) int { + count := 0 + for _, record := range records { + raw, ok := record["like"].(map[string]any) + if !ok { + continue + } + tweetID := stringValue(raw["tweetId"]) + text := html.UnescapeString(stringValue(raw["fullText"])) + if tweetID == "" || text == "" { + continue + } + createdAt := twitterSnowflakeTime(tweetID) + if createdAt.IsZero() { + createdAt = fallback + } + msg := store.MessageMutation{Record: store.MessageRecord{ + ID: "x:like:" + tweetID, + GuildID: GuildID, + ChannelID: likesChannelID, + ChannelName: "likes", + AuthorID: "x:liked", + AuthorName: "liked", + MessageType: 0, + CreatedAt: createdAt.UTC().Format(time.RFC3339Nano), + Content: text, + NormalizedContent: text, + RawJSON: rawJSON(map[string]any{"platform": "x", "type": "like", "like": raw}), + }} + snap.members[memberKey(GuildID, "x:liked")] = store.MemberRecord{GuildID: GuildID, UserID: "x:liked", Username: "liked", DisplayName: "Liked Tweets", RoleIDsJSON: `[]`, RawJSON: `{}`} + snap.messages = append(snap.messages, msg) + count++ + } + return count +} + +func parseDMs(records []map[string]any, snap *importSnapshot) (int, int) { + conversations := 0 + messages := 0 + for _, record := range records { + rawConv, ok := record["dmConversation"].(map[string]any) + if !ok { + continue + } + conversationID := stringValue(rawConv["conversationId"]) + rawMessages, ok := rawConv["messages"].([]any) + if conversationID == "" || !ok { + continue + } + channelID := "x:dm:" + conversationID + if _, seen := snap.conversations[conversationID]; !seen { + snap.conversations[conversationID] = struct{}{} + snap.channels[channelID] = channel(channelID, conversationName(conversationID, snap.account.ID), "dm") + conversations++ + } + for _, item := range rawMessages { + rawMessage, ok := item.(map[string]any) + if !ok { + continue + } + messageCreate, ok := rawMessage["messageCreate"].(map[string]any) + if !ok { + continue + } + id := stringValue(messageCreate["id"]) + text := html.UnescapeString(stringValue(messageCreate["text"])) + createdAt := parseTime(stringValue(messageCreate["createdAt"])) + senderID := stringValue(messageCreate["senderId"]) + if id == "" || createdAt.IsZero() || (text == "" && len(stringSlice(messageCreate["mediaUrls"])) == 0) { + continue + } + participants := participantsForMessage(conversationID, senderID, stringValue(messageCreate["recipientId"])) + for _, participant := range participants { + upsertGenericMember(snap, participant) + } + attachments := dmAttachments(id, channelID, senderID, messageCreate) + snap.messages = append(snap.messages, store.MessageMutation{ + Record: store.MessageRecord{ + ID: "x:dm:" + id, + GuildID: GuildID, + ChannelID: channelID, + ChannelName: snap.channels[channelID].Name, + AuthorID: senderID, + AuthorName: authorLabel(senderID, snap.account), + MessageType: 0, + CreatedAt: createdAt.UTC().Format(time.RFC3339Nano), + Content: text, + NormalizedContent: text, + HasAttachments: len(attachments) > 0, + RawJSON: rawJSON(map[string]any{"platform": "x", "type": "dm", "conversation_id": conversationID, "message": messageCreate}), + }, + Attachments: attachments, + }) + messages++ + } + } + return conversations, messages +} + +func channel(id, name, kind string) store.ChannelRecord { + return store.ChannelRecord{ID: id, GuildID: GuildID, Kind: kind, Name: name, RawJSON: rawJSON(map[string]any{"platform": "x", "kind": kind})} +} + +func memberFromAccount(acc account) store.MemberRecord { + return store.MemberRecord{ + GuildID: GuildID, + UserID: acc.ID, + Username: acc.Username, + DisplayName: acc.DisplayName, + JoinedAt: acc.CreatedAt, + RoleIDsJSON: `[]`, + RawJSON: rawJSON(map[string]any{"platform": "x", "account": acc.Raw, "email": acc.Email}), + } +} + +func upsertGenericMember(snap *importSnapshot, id string) { + if id == "" { + return + } + if snap.account.ID == id { + snap.members[memberKey(GuildID, id)] = memberFromAccount(snap.account) + return + } + key := memberKey(GuildID, id) + if _, ok := snap.members[key]; ok { + return + } + snap.members[key] = store.MemberRecord{GuildID: GuildID, UserID: id, Username: "user-" + shortID(id), RoleIDsJSON: `[]`, RawJSON: rawJSON(map[string]any{"platform": "x"})} +} + +func tweetMentions(tweetID, authorID string, createdAt time.Time, raw map[string]any) []store.MentionEventRecord { + entities, _ := raw["entities"].(map[string]any) + rawMentions, _ := entities["user_mentions"].([]any) + var mentions []store.MentionEventRecord + for _, item := range rawMentions { + mention, ok := item.(map[string]any) + if !ok { + continue + } + targetID := stringValue(firstAny(mention["id_str"], mention["id"])) + targetName := firstString(mention["screen_name"], mention["name"]) + if targetID == "" && targetName == "" { + continue + } + mentions = append(mentions, store.MentionEventRecord{ + MessageID: "x:tweet:" + tweetID, + GuildID: GuildID, + ChannelID: tweetsChannelID, + AuthorID: authorID, + TargetType: "user", + TargetID: targetID, + TargetName: targetName, + EventAt: createdAt.UTC().Format(time.RFC3339Nano), + }) + } + return mentions +} + +func hasMedia(raw map[string]any) bool { + extended, _ := raw["extended_entities"].(map[string]any) + media, _ := extended["media"].([]any) + return len(media) > 0 +} diff --git a/internal/twitterarchive/import_test.go b/internal/twitterarchive/import_test.go new file mode 100644 index 0000000..7bdef78 --- /dev/null +++ b/internal/twitterarchive/import_test.go @@ -0,0 +1,121 @@ +package twitterarchive + +import ( + "archive/zip" + "context" + "os" + "path/filepath" + "testing" + "time" + + "github.com/stretchr/testify/require" + + "github.com/steipete/discrawl/internal/store" +) + +func TestImportTwitterArchiveWritesSearchableMessages(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + archivePath := filepath.Join(dir, "twitter.zip") + writeTestArchive(t, archivePath) + + st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db")) + require.NoError(t, err) + defer func() { _ = st.Close() }() + + stats, err := Import(ctx, st, Options{ + Path: archivePath, + Now: func() time.Time { return time.Date(2026, 4, 26, 12, 0, 0, 0, time.UTC) }, + }) + require.NoError(t, err) + require.Equal(t, 1, stats.Accounts) + require.Equal(t, 1, stats.Tweets) + require.Equal(t, 1, stats.Likes) + require.Equal(t, 1, stats.DMConversations) + require.Equal(t, 1, stats.DMMessages) + + results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "pull requests", GuildIDs: []string{GuildID}, Limit: 10}) + require.NoError(t, err) + require.Len(t, results, 1) + require.Equal(t, "x:tweet:1952542067017584782", results[0].MessageID) + require.Equal(t, "steipete", results[0].AuthorName) + + results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "secret roadmap", GuildIDs: []string{GuildID}, Limit: 10}) + require.NoError(t, err) + require.Len(t, results, 1) + require.Equal(t, "x:dm:1052590933307461636", results[0].MessageID) + + cursor, err := st.GetSyncState(ctx, archiveScope) + require.NoError(t, err) + require.Equal(t, archivePath, cursor) +} + +func TestImportTwitterArchiveDryRunDoesNotWrite(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + archivePath := filepath.Join(dir, "twitter.zip") + writeTestArchive(t, archivePath) + + stats, err := Import(ctx, nil, Options{Path: archivePath, DryRun: true}) + require.NoError(t, err) + require.Equal(t, 1, stats.Tweets) + require.True(t, stats.DryRun) +} + +func writeTestArchive(t *testing.T, path string) { + t.Helper() + file, err := os.Create(path) + require.NoError(t, err) + defer func() { _ = file.Close() }() + + zw := zip.NewWriter(file) + defer func() { require.NoError(t, zw.Close()) }() + + writeZipEntry(t, zw, "data/account.js", `window.YTD.account.part0 = [{ + "account": { + "email": "steipete@gmail.com", + "username": "steipete", + "accountId": "25401953", + "createdAt": "2009-03-19T22:54:05.000Z", + "accountDisplayName": "Peter Steinberger" + } +}]`) + writeZipEntry(t, zw, "data/tweets.js", `window.YTD.tweets.part0 = [{ + "tweet": { + "id_str": "1952542067017584782", + "created_at": "Tue Aug 05 01:27:59 +0000 2025", + "full_text": "Getting pull requests with zero user testing aint it.", + "entities": {"user_mentions": [{"screen_name": "alice", "id_str": "42"}]} + } +}]`) + writeZipEntry(t, zw, "data/like.js", `window.YTD.like.part0 = [{ + "like": { + "tweetId": "1952539858771275983", + "fullText": "Liked archive import smoke" + } +}]`) + writeZipEntry(t, zw, "data/direct-messages.js", `window.YTD.direct_messages.part0 = [{ + "dmConversation": { + "conversationId": "929-25401953", + "messages": [{ + "messageCreate": { + "recipientId": "929", + "senderId": "25401953", + "id": "1052590933307461636", + "createdAt": "2018-10-17T16:03:29.391Z", + "text": "secret roadmap", + "mediaUrls": [], + "urls": [] + } + }] + } +}]`) +} + +func writeZipEntry(t *testing.T, zw *zip.Writer, name, body string) { + t.Helper() + w, err := zw.Create(name) + require.NoError(t, err) + _, err = w.Write([]byte(body)) + require.NoError(t, err) +}