diff --git a/CHANGELOG.md b/CHANGELOG.md index 8307841..a848f4e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,16 +4,6 @@ All notable changes to `discrawl` will be documented in this file. ## Unreleased -### Changes - -- Added `twitter import` / `x import` for local X/Twitter archive `.zip` files, storing tweets, likes, and direct messages under synthetic guild id `x` so existing Discrawl search and SQL commands can query them. - -### Fixes - -- Git snapshot imports now preserve local X/Twitter archive rows and `twitter:*` sync state, matching the local-only handling already used for wiretap DMs. - -## 0.6.1 - 2026-04-25 - ### Maintenance - Refreshed Go module dependencies and CI tool/action pins, including staticcheck, gofumpt, gosec, govulncheck, gitleaks, setup-node, and GoReleaser. diff --git a/README.md b/README.md index d2ba00b..8481e6f 100644 --- a/README.md +++ b/README.md @@ -268,24 +268,6 @@ Notes: - does not extract, store, or print Discord auth tokens - `--max-file-bytes` skips unusually large files; default is 64 MiB -### `twitter import` - -Imports a local X/Twitter archive `.zip` into the same SQLite message/search tables. - -```bash -discrawl twitter import --archive ~/Downloads/twitter-2025-08-05.zip -discrawl x import --archive ~/Downloads/twitter-2025-08-05.zip --dry-run -discrawl search --guild x "launch checklist" -``` - -Notes: - -- stores imported tweets, likes, and direct messages under synthetic guild id `x` -- maps tweets to `x:tweets`, liked tweet text to `x:likes`, and each DM conversation to an `x:dm:*` channel -- uses the archive's JS assignment files such as `data/account.js`, `data/tweets*.js`, `data/like.js`, `data/direct-messages.js`, and `data/direct-messages-group.js` -- prefixes imported message ids with `x:tweet:`, `x:like:`, or `x:dm:` so they cannot collide with Discord snowflakes -- stamps `twitter:last_import` and `twitter:last_archive` in `sync_state` - ### `search` Searches archived messages. FTS is the default mode and works without embeddings. diff --git a/internal/cli/cli.go b/internal/cli/cli.go index 95e57ff..2ec8263 100644 --- a/internal/cli/cli.go +++ b/internal/cli/cli.go @@ -128,8 +128,6 @@ func (r *runtime) dispatch(rest []string) error { return r.withServices(true, func() error { return r.runTail(rest[1:]) }) case "wiretap": return r.withLocalStoreDefault(false, func() error { return r.runWiretap(rest[1:]) }) - case "twitter", "x": - return r.withLocalStoreDefault(false, func() error { return r.runTwitter(rest[1:]) }) case "search": autoShareUpdate := !hasBoolFlag(rest[1:], "--dm") return r.withLocalStoreDefault(autoShareUpdate, func() error { return r.runSearch(rest[1:]) }) diff --git a/internal/cli/cli_test.go b/internal/cli/cli_test.go index cb27117..0ec4de9 100644 --- a/internal/cli/cli_test.go +++ b/internal/cli/cli_test.go @@ -1,7 +1,6 @@ package cli import ( - "archive/zip" "bytes" "context" "encoding/json" @@ -183,49 +182,6 @@ func TestWiretapImportsDesktopDirectMessages(t *testing.T) { require.Contains(t, out.String(), "secret DM launch plan") } -func TestTwitterImportCommand(t *testing.T) { - ctx := context.Background() - dir := t.TempDir() - cfgPath := filepath.Join(dir, "config.toml") - dbPath := filepath.Join(dir, "discrawl.db") - archivePath := filepath.Join(dir, "twitter.zip") - writeTwitterArchiveFixture(t, archivePath) - - cfg := config.Default() - cfg.DBPath = dbPath - cfg.Discord.TokenSource = "none" - require.NoError(t, config.Write(cfgPath, cfg)) - - var out bytes.Buffer - require.NoError(t, Run(ctx, []string{"--config", cfgPath, "twitter", "import", "--archive", archivePath}, &out, &bytes.Buffer{})) - require.Contains(t, out.String(), "tweets=1") - require.Contains(t, out.String(), "dm_messages=1") - - out.Reset() - require.NoError(t, Run(ctx, []string{"--config", cfgPath, "search", "--guild", "x", "secret roadmap"}, &out, &bytes.Buffer{})) - require.Contains(t, out.String(), "secret roadmap") -} - -func writeTwitterArchiveFixture(t *testing.T, path string) { - t.Helper() - file, err := os.Create(path) - require.NoError(t, err) - defer func() { _ = file.Close() }() - zw := zip.NewWriter(file) - defer func() { require.NoError(t, zw.Close()) }() - writeTwitterZipEntry(t, zw, "data/account.js", `window.YTD.account.part0 = [{"account":{"username":"steipete","accountId":"25401953","accountDisplayName":"Peter Steinberger"}}]`) - writeTwitterZipEntry(t, zw, "data/tweets.js", `window.YTD.tweets.part0 = [{"tweet":{"id_str":"1952542067017584782","created_at":"Tue Aug 05 01:27:59 +0000 2025","full_text":"archive tweet search text","entities":{"user_mentions":[]}}}]`) - writeTwitterZipEntry(t, zw, "data/direct-messages.js", `window.YTD.direct_messages.part0 = [{"dmConversation":{"conversationId":"929-25401953","messages":[{"messageCreate":{"recipientId":"929","senderId":"25401953","id":"1052590933307461636","createdAt":"2018-10-17T16:03:29.391Z","text":"secret roadmap","mediaUrls":[]}}]}}]`) -} - -func writeTwitterZipEntry(t *testing.T, zw *zip.Writer, name, body string) { - t.Helper() - w, err := zw.Create(name) - require.NoError(t, err) - _, err = w.Write([]byte(body)) - require.NoError(t, err) -} - func TestParseMessageWindow(t *testing.T) { rt := &runtime{now: func() time.Time { return time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC) diff --git a/internal/cli/output.go b/internal/cli/output.go index 21b7c83..92d6e0c 100644 --- a/internal/cli/output.go +++ b/internal/cli/output.go @@ -13,7 +13,6 @@ import ( "github.com/steipete/discrawl/internal/discorddesktop" "github.com/steipete/discrawl/internal/store" "github.com/steipete/discrawl/internal/syncer" - "github.com/steipete/discrawl/internal/twitterarchive" ) func (r *runtime) print(value any) error { @@ -86,8 +85,6 @@ Commands: sync tail wiretap - twitter - x search messages dms @@ -137,10 +134,6 @@ func printHuman(w io.Writer, value any) error { _, err := fmt.Fprintf(w, "path=%s\nfiles=%d\nskipped=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ndry_run=%t\n", v.Path, v.FilesScanned, v.FilesSkipped, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.DryRun) return err - case twitterarchive.Stats: - _, err := fmt.Fprintf(w, "path=%s\nfiles=%d\naccounts=%d\ntweets=%d\nlikes=%d\ndm_conversations=%d\ndm_messages=%d\nskipped=%d\ndry_run=%t\n", - v.Path, v.FilesScanned, v.Accounts, v.Tweets, v.Likes, v.DMConversations, v.DMMessages, v.Skipped, v.DryRun) - return err case store.Status: _, err := fmt.Fprintf(w, "db=%s\nguilds=%d\nchannels=%d\nthreads=%d\nmessages=%d\nmembers=%d\nembedding_backlog=%d\nlast_sync=%s\nlast_tail_event=%s\n", v.DBPath, v.GuildCount, v.ChannelCount, v.ThreadCount, v.MessageCount, v.MemberCount, v.EmbeddingBacklog, diff --git a/internal/cli/twitter_commands.go b/internal/cli/twitter_commands.go deleted file mode 100644 index e176586..0000000 --- a/internal/cli/twitter_commands.go +++ /dev/null @@ -1,44 +0,0 @@ -package cli - -import ( - "flag" - "fmt" - "io" - - "github.com/steipete/discrawl/internal/twitterarchive" -) - -func (r *runtime) runTwitter(args []string) error { - if len(args) == 0 { - return usageErr(fmt.Errorf("twitter requires subcommand: import")) - } - switch args[0] { - case "import": - return r.runTwitterImport(args[1:]) - default: - return usageErr(fmt.Errorf("unknown twitter subcommand %q", args[0])) - } -} - -func (r *runtime) runTwitterImport(args []string) error { - fs := flag.NewFlagSet("twitter import", flag.ContinueOnError) - fs.SetOutput(io.Discard) - archivePath := fs.String("archive", "", "") - fs.StringVar(archivePath, "path", "", "") - dryRun := fs.Bool("dry-run", false, "") - if err := fs.Parse(args); err != nil { - return usageErr(err) - } - if fs.NArg() > 0 { - return usageErr(fmt.Errorf("twitter import takes flags only")) - } - stats, err := twitterarchive.Import(r.ctx, r.store, twitterarchive.Options{ - Path: *archivePath, - DryRun: *dryRun, - Now: r.now, - }) - if err != nil { - return err - } - return r.print(stats) -} diff --git a/internal/share/share.go b/internal/share/share.go index d7a268c..24fefd4 100644 --- a/internal/share/share.go +++ b/internal/share/share.go @@ -24,7 +24,6 @@ const ( LastImportSyncScope = "share:last_import_at" LastImportManifestSyncScope = "share:last_import_manifest_generated_at" directMessageGuildID = "@me" - twitterArchiveGuildID = "x" ) var ErrNoManifest = errors.New("share manifest not found") @@ -635,11 +634,11 @@ func importColumns(table TableManifest) []string { func snapshotExportQuery(table string) (string, []any) { switch table { case "guilds": - return "select * from guilds where id not in (?, ?)", []any{directMessageGuildID, twitterArchiveGuildID} + return "select * from guilds where id != ?", []any{directMessageGuildID} case "channels", "members", "messages", "message_events", "message_attachments", "mention_events": - return "select * from " + table + " where guild_id not in (?, ?)", []any{directMessageGuildID, twitterArchiveGuildID} + return "select * from " + table + " where guild_id != ?", []any{directMessageGuildID} case "sync_state": - return "select * from sync_state where scope not like 'wiretap:%' and scope not like 'twitter:%'", nil + return "select * from sync_state where scope not like 'wiretap:%'", nil default: return "select * from " + table, nil } @@ -648,13 +647,13 @@ func snapshotExportQuery(table string) (string, []any) { func snapshotDeleteQuery(table string) (string, []any) { switch table { case "guilds": - return "delete from guilds where id not in (?, ?)", []any{directMessageGuildID, twitterArchiveGuildID} + return "delete from guilds where id != ?", []any{directMessageGuildID} case "message_events", "mention_events": - return "delete from " + table + " where guild_id not in (?, ?)", []any{directMessageGuildID, twitterArchiveGuildID} + return "delete from " + table + " where guild_id != ?", []any{directMessageGuildID} case "channels", "members", "messages", "message_attachments": - return "delete from " + table + " where guild_id not in (?, ?)", []any{directMessageGuildID, twitterArchiveGuildID} + return "delete from " + table + " where guild_id != ?", []any{directMessageGuildID} case "sync_state": - return "delete from sync_state where scope not like 'wiretap:%' and scope not like 'twitter:%'", nil + return "delete from sync_state where scope not like 'wiretap:%'", nil default: return "delete from " + table, nil } @@ -668,14 +667,14 @@ func isDirectMessageSnapshotRow(table string, row map[string]any) bool { return isLocalOnlyGuildID(stringValue(row["guild_id"])) case "sync_state": scope := stringValue(row["scope"]) - return strings.HasPrefix(scope, "wiretap:") || strings.HasPrefix(scope, "twitter:") + return strings.HasPrefix(scope, "wiretap:") default: return false } } func isLocalOnlyGuildID(guildID string) bool { - return guildID == directMessageGuildID || guildID == twitterArchiveGuildID + return guildID == directMessageGuildID } func importEmbeddings(ctx context.Context, tx *sql.Tx, opts Options, manifests []EmbeddingManifest) error { diff --git a/internal/share/share_test.go b/internal/share/share_test.go index 43eaf62..4f3c0ce 100644 --- a/internal/share/share_test.go +++ b/internal/share/share_test.go @@ -110,7 +110,6 @@ func TestSnapshotExcludesAndPreservesDirectMessages(t *testing.T) { src := seedStore(t, filepath.Join(t.TempDir(), "src.db")) defer func() { _ = src.Close() }() seedDirectMessageData(t, ctx, src) - seedTwitterArchiveData(t, ctx, src) repo := filepath.Join(t.TempDir(), "share") manifest, err := Export(ctx, src, Options{RepoPath: repo, Branch: "main"}) @@ -119,19 +118,14 @@ func TestSnapshotExcludesAndPreservesDirectMessages(t *testing.T) { require.Equal(t, 1, tableEntry(t, manifest, "channels").Rows) require.Equal(t, 1, tableEntry(t, manifest, "messages").Rows) require.NotContains(t, snapshotTableText(t, repo, tableEntry(t, manifest, "guilds")), directMessageGuildID) - require.NotContains(t, snapshotTableText(t, repo, tableEntry(t, manifest, "guilds")), twitterArchiveGuildID) require.NotContains(t, snapshotTableText(t, repo, tableEntry(t, manifest, "channels")), directMessageGuildID) - require.NotContains(t, snapshotTableText(t, repo, tableEntry(t, manifest, "channels")), "x:tweets") require.NotContains(t, snapshotTableText(t, repo, tableEntry(t, manifest, "messages")), "private dm content") - require.NotContains(t, snapshotTableText(t, repo, tableEntry(t, manifest, "messages")), "local tweet content") require.NotContains(t, snapshotTableText(t, repo, tableEntry(t, manifest, "sync_state")), "wiretap:last_import") - require.NotContains(t, snapshotTableText(t, repo, tableEntry(t, manifest, "sync_state")), "twitter:last_import") dst, err := store.Open(ctx, filepath.Join(t.TempDir(), "dst.db")) require.NoError(t, err) defer func() { _ = dst.Close() }() seedDirectMessageData(t, ctx, dst) - seedTwitterArchiveData(t, ctx, dst) _, err = Import(ctx, dst, Options{RepoPath: repo, Branch: "main"}) require.NoError(t, err) @@ -142,16 +136,9 @@ func TestSnapshotExcludesAndPreservesDirectMessages(t *testing.T) { guildResults, err := dst.SearchMessages(ctx, store.SearchOptions{Query: "launch checklist", Limit: 10}) require.NoError(t, err) require.Len(t, guildResults, 1) - twitterResults, err := dst.SearchMessages(ctx, store.SearchOptions{Query: "local tweet content", Limit: 10}) - require.NoError(t, err) - require.Len(t, twitterResults, 1) - require.Equal(t, twitterArchiveGuildID, twitterResults[0].GuildID) wiretapState, err := dst.GetSyncState(ctx, "wiretap:last_import") require.NoError(t, err) require.Equal(t, "2026-04-24T15:33:17Z", wiretapState) - twitterState, err := dst.GetSyncState(ctx, "twitter:last_import") - require.NoError(t, err) - require.Equal(t, "2026-04-24T16:00:00Z", twitterState) } func TestExportImportEmbeddingsOptIn(t *testing.T) { @@ -587,31 +574,28 @@ func TestShareSmallHelpersAndValidation(t *testing.T) { require.False(t, isNonFastForwardPush("everything up-to-date")) query, args := snapshotExportQuery("messages") - require.Equal(t, "select * from messages where guild_id not in (?, ?)", query) - require.Equal(t, []any{directMessageGuildID, twitterArchiveGuildID}, args) + require.Equal(t, "select * from messages where guild_id != ?", query) + require.Equal(t, []any{directMessageGuildID}, args) query, args = snapshotExportQuery("sync_state") - require.Equal(t, "select * from sync_state where scope not like 'wiretap:%' and scope not like 'twitter:%'", query) + require.Equal(t, "select * from sync_state where scope not like 'wiretap:%'", query) require.Nil(t, args) query, args = snapshotExportQuery("custom") require.Equal(t, "select * from custom", query) require.Nil(t, args) query, args = snapshotDeleteQuery("channels") - require.Equal(t, "delete from channels where guild_id not in (?, ?)", query) - require.Equal(t, []any{directMessageGuildID, twitterArchiveGuildID}, args) + require.Equal(t, "delete from channels where guild_id != ?", query) + require.Equal(t, []any{directMessageGuildID}, args) query, args = snapshotDeleteQuery("message_events") - require.Equal(t, "delete from message_events where guild_id not in (?, ?)", query) - require.Equal(t, []any{directMessageGuildID, twitterArchiveGuildID}, args) + require.Equal(t, "delete from message_events where guild_id != ?", query) + require.Equal(t, []any{directMessageGuildID}, args) query, args = snapshotDeleteQuery("custom") require.Equal(t, "delete from custom", query) require.Nil(t, args) require.True(t, isDirectMessageSnapshotRow("guilds", map[string]any{"id": directMessageGuildID})) - require.True(t, isDirectMessageSnapshotRow("guilds", map[string]any{"id": twitterArchiveGuildID})) require.True(t, isDirectMessageSnapshotRow("channels", map[string]any{"guild_id": directMessageGuildID})) - require.True(t, isDirectMessageSnapshotRow("channels", map[string]any{"guild_id": twitterArchiveGuildID})) require.True(t, isDirectMessageSnapshotRow("sync_state", map[string]any{"scope": "wiretap:last_import"})) - require.True(t, isDirectMessageSnapshotRow("sync_state", map[string]any{"scope": "twitter:last_import"})) require.False(t, isDirectMessageSnapshotRow("sync_state", map[string]any{"scope": "share:last_import"})) require.False(t, isDirectMessageSnapshotRow("custom", map[string]any{"guild_id": directMessageGuildID})) @@ -868,50 +852,6 @@ func seedDirectMessageData(t *testing.T, ctx context.Context, s *store.Store) { require.NoError(t, s.SetSyncState(ctx, "wiretap:last_import", now.Format(time.RFC3339))) } -func seedTwitterArchiveData(t *testing.T, ctx context.Context, s *store.Store) { - t.Helper() - now := time.Date(2026, 4, 24, 16, 0, 0, 0, time.UTC) - require.NoError(t, s.UpsertGuild(ctx, store.GuildRecord{ID: twitterArchiveGuildID, Name: "X / Twitter Archive", RawJSON: `{}`})) - require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "x:tweets", GuildID: twitterArchiveGuildID, Kind: "tweet", Name: "tweets", RawJSON: `{}`})) - require.NoError(t, s.UpsertMember(ctx, store.MemberRecord{ - GuildID: twitterArchiveGuildID, - UserID: "25401953", - Username: "steipete", - DisplayName: "Peter", - RoleIDsJSON: `[]`, - RawJSON: `{}`, - })) - require.NoError(t, s.UpsertMessages(ctx, []store.MessageMutation{{ - Record: store.MessageRecord{ - ID: "x:tweet:1", - GuildID: twitterArchiveGuildID, - ChannelID: "x:tweets", - ChannelName: "tweets", - AuthorID: "25401953", - AuthorName: "steipete", - MessageType: 0, - CreatedAt: now.Format(time.RFC3339Nano), - Content: "local tweet content", - NormalizedContent: "local tweet content", - RawJSON: `{}`, - }, - EventType: "twitter", - PayloadJSON: `{"id":"x:tweet:1"}`, - Options: store.WriteOptions{AppendEvent: true}, - Mentions: []store.MentionEventRecord{{ - MessageID: "x:tweet:1", - GuildID: twitterArchiveGuildID, - ChannelID: "x:tweets", - AuthorID: "25401953", - TargetType: "user", - TargetID: "42", - TargetName: "alice", - EventAt: now.Format(time.RFC3339Nano), - }}, - }})) - require.NoError(t, s.SetSyncState(ctx, "twitter:last_import", now.Format(time.RFC3339))) -} - func configureGitUser(t *testing.T, repo string) { t.Helper() // #nosec G204 -- fixed git argv in test setup. diff --git a/internal/twitterarchive/helpers.go b/internal/twitterarchive/helpers.go deleted file mode 100644 index f3dd16b..0000000 --- a/internal/twitterarchive/helpers.go +++ /dev/null @@ -1,235 +0,0 @@ -package twitterarchive - -import ( - "archive/zip" - "encoding/json" - "errors" - "fmt" - "io" - "path/filepath" - "sort" - "strconv" - "strings" - "time" - - "github.com/steipete/discrawl/internal/store" -) - -func classify(path string) string { - normalized := filepath.ToSlash(path) - base := filepath.Base(normalized) - switch { - case strings.HasSuffix(normalized, "/data/account.js") || normalized == "data/account.js": - return "account" - case (base == "tweets.js" || strings.HasPrefix(base, "tweets-part")) && strings.HasSuffix(base, ".js"): - return "tweets" - case base == "like.js" || base == "likes.js": - return "likes" - case (base == "direct-messages.js" || base == "direct-messages-group.js") && strings.HasSuffix(base, ".js"): - return "dms" - default: - return "" - } -} - -func readZipText(file *zip.File) (string, error) { - rc, err := file.Open() - if err != nil { - return "", err - } - defer func() { _ = rc.Close() }() - var b strings.Builder - if _, err := io.Copy(&b, rc); err != nil { - return "", err - } - return b.String(), nil -} - -func parseArchiveArray(content string) ([]map[string]any, error) { - idx := strings.Index(content, "=") - if idx < 0 { - return nil, errors.New("missing assignment") - } - payload := strings.TrimSuffix(strings.TrimSpace(content[idx+1:]), ";") - dec := json.NewDecoder(strings.NewReader(payload)) - dec.UseNumber() - var records []map[string]any - if err := dec.Decode(&records); err != nil { - return nil, err - } - return records, nil -} - -func dmAttachments(messageID, channelID, authorID string, raw map[string]any) []store.AttachmentRecord { - urls := stringSlice(raw["mediaUrls"]) - attachments := make([]store.AttachmentRecord, 0, len(urls)) - for i, url := range urls { - attachments = append(attachments, store.AttachmentRecord{ - AttachmentID: fmt.Sprintf("x:dm:%s:media:%d", messageID, i), - MessageID: "x:dm:" + messageID, - GuildID: GuildID, - ChannelID: channelID, - AuthorID: authorID, - Filename: filepath.Base(url), - URL: url, - }) - } - return attachments -} - -func participantsForMessage(conversationID, senderID, recipientID string) []string { - set := map[string]struct{}{} - for _, value := range []string{senderID, recipientID} { - if value != "" { - set[value] = struct{}{} - } - } - for _, part := range strings.Split(conversationID, "-") { - if part != "" { - set[part] = struct{}{} - } - } - out := make([]string, 0, len(set)) - for id := range set { - out = append(out, id) - } - sort.Strings(out) - return out -} - -func conversationName(conversationID, selfID string) string { - parts := strings.Split(conversationID, "-") - if len(parts) == 2 { - for _, part := range parts { - if part != "" && part != selfID { - return "dm-" + shortID(part) - } - } - } - return "group-" + shortID(conversationID) -} - -func authorLabel(id string, acc account) string { - if id == acc.ID && acc.Username != "" { - return acc.Username - } - return "user-" + shortID(id) -} - -func parseTime(value string) time.Time { - value = strings.TrimSpace(value) - for _, layout := range []string{time.RFC3339Nano, time.RFC3339, "Mon Jan 02 15:04:05 -0700 2006"} { - if t, err := time.Parse(layout, value); err == nil { - return t - } - } - return time.Time{} -} - -func twitterSnowflakeTime(id string) time.Time { - n, err := strconv.ParseInt(id, 10, 64) - if err != nil || n <= 0 { - return time.Time{} - } - const twitterEpochMs = int64(1288834974657) - ms := (n >> 22) + twitterEpochMs - return time.UnixMilli(ms).UTC() -} - -func sortedFiles(files []*zip.File) []*zip.File { - out := append([]*zip.File(nil), files...) - sort.Slice(out, func(i, j int) bool { return out[i].Name < out[j].Name }) - return out -} - -func sortedChannels(channels map[string]store.ChannelRecord) []store.ChannelRecord { - out := make([]store.ChannelRecord, 0, len(channels)) - for _, ch := range channels { - out = append(out, ch) - } - sort.Slice(out, func(i, j int) bool { return out[i].ID < out[j].ID }) - return out -} - -func sortedMembers(members map[string]store.MemberRecord) []store.MemberRecord { - out := make([]store.MemberRecord, 0, len(members)) - for _, member := range members { - out = append(out, member) - } - sort.Slice(out, func(i, j int) bool { return out[i].UserID < out[j].UserID }) - return out -} - -func memberKey(guildID, userID string) string { return guildID + "\x00" + userID } - -func prefixedTweetID(id string) string { - if id == "" { - return "" - } - return "x:tweet:" + id -} - -func shortID(id string) string { - if len(id) <= 8 { - return id - } - return id[len(id)-8:] -} - -func rawJSON(value any) string { - data, err := json.Marshal(value) - if err != nil { - return `{}` - } - return string(data) -} - -func stringValue(value any) string { - switch typed := value.(type) { - case string: - return strings.TrimSpace(typed) - case json.Number: - return typed.String() - case float64: - return strconv.FormatInt(int64(typed), 10) - default: - return "" - } -} - -func firstString(values ...any) string { - for _, value := range values { - if s := stringValue(value); s != "" { - return s - } - } - return "" -} - -func firstAny(values ...any) any { - for _, value := range values { - if stringValue(value) != "" { - return value - } - } - return nil -} - -func nestedString(raw map[string]any, key, nested string) string { - child, _ := raw[key].(map[string]any) - return stringValue(child[nested]) -} - -func stringSlice(value any) []string { - raw, ok := value.([]any) - if !ok { - return nil - } - out := make([]string, 0, len(raw)) - for _, item := range raw { - if s := stringValue(item); s != "" { - out = append(out, s) - } - } - return out -} diff --git a/internal/twitterarchive/import.go b/internal/twitterarchive/import.go deleted file mode 100644 index 996af23..0000000 --- a/internal/twitterarchive/import.go +++ /dev/null @@ -1,396 +0,0 @@ -package twitterarchive - -import ( - "archive/zip" - "context" - "errors" - "fmt" - "html" - "strings" - "time" - - "github.com/steipete/discrawl/internal/store" -) - -const ( - GuildID = "x" - GuildName = "X / Twitter Archive" - - tweetsChannelID = "x:tweets" - likesChannelID = "x:likes" - syncScope = "twitter:last_import" - archiveScope = "twitter:last_archive" -) - -type Options struct { - Path string - DryRun bool - Now func() time.Time -} - -type Stats struct { - Path string `json:"path"` - FilesScanned int `json:"files_scanned"` - Accounts int `json:"accounts"` - Tweets int `json:"tweets"` - Likes int `json:"likes"` - DMConversations int `json:"dm_conversations"` - DMMessages int `json:"dm_messages"` - Skipped int `json:"skipped"` - DryRun bool `json:"dry_run,omitempty"` - StartedAt time.Time `json:"started_at"` - FinishedAt time.Time `json:"finished_at"` -} - -type account struct { - ID string - Username string - DisplayName string - Email string - CreatedAt string - Raw map[string]any -} - -type importSnapshot struct { - account account - channels map[string]store.ChannelRecord - members map[string]store.MemberRecord - messages []store.MessageMutation - conversations map[string]struct{} -} - -func Import(ctx context.Context, st *store.Store, opts Options) (Stats, error) { - if st == nil && !opts.DryRun { - return Stats{}, errors.New("store is required") - } - if strings.TrimSpace(opts.Path) == "" { - return Stats{}, errors.New("archive path is required") - } - now := opts.Now - if now == nil { - now = time.Now - } - stats := Stats{Path: opts.Path, DryRun: opts.DryRun, StartedAt: now().UTC()} - reader, err := zip.OpenReader(opts.Path) - if err != nil { - stats.FinishedAt = now().UTC() - return stats, fmt.Errorf("open twitter archive: %w", err) - } - defer func() { _ = reader.Close() }() - - snap := importSnapshot{ - channels: map[string]store.ChannelRecord{}, - members: map[string]store.MemberRecord{}, - conversations: map[string]struct{}{}, - } - snap.channels[tweetsChannelID] = channel(tweetsChannelID, "tweets", "tweet") - snap.channels[likesChannelID] = channel(likesChannelID, "likes", "like") - - for _, file := range sortedFiles(reader.File) { - if ctx.Err() != nil { - return stats, ctx.Err() - } - kind := classify(file.Name) - if kind == "" { - continue - } - content, err := readZipText(file) - if err != nil { - stats.Skipped++ - continue - } - records, err := parseArchiveArray(content) - if err != nil { - stats.Skipped++ - continue - } - stats.FilesScanned++ - switch kind { - case "account": - if parseAccount(records, &snap) { - stats.Accounts = 1 - } - case "tweets": - stats.Tweets += parseTweets(records, &snap) - case "likes": - stats.Likes += parseLikes(records, &snap, now().UTC()) - case "dms": - conversations, messages := parseDMs(records, &snap) - stats.DMConversations += conversations - stats.DMMessages += messages - } - } - if snap.account.ID != "" { - snap.members[memberKey(GuildID, snap.account.ID)] = memberFromAccount(snap.account) - } - for i := range snap.messages { - if snap.messages[i].Record.AuthorID == "" && snap.account.ID != "" { - snap.messages[i].Record.AuthorID = snap.account.ID - snap.messages[i].Record.AuthorName = snap.account.Username - } - } - if !opts.DryRun { - if err := writeSnapshot(ctx, st, snap, opts.Path); err != nil { - return stats, err - } - } - stats.FinishedAt = now().UTC() - return stats, nil -} - -func writeSnapshot(ctx context.Context, st *store.Store, snap importSnapshot, path string) error { - if err := st.UpsertGuild(ctx, store.GuildRecord{ID: GuildID, Name: GuildName, RawJSON: rawJSON(map[string]any{"platform": "x"})}); err != nil { - return err - } - for _, member := range sortedMembers(snap.members) { - if err := st.UpsertMember(ctx, member); err != nil { - return err - } - } - for _, ch := range sortedChannels(snap.channels) { - if err := st.UpsertChannel(ctx, ch); err != nil { - return err - } - } - const chunkSize = 1000 - for start := 0; start < len(snap.messages); start += chunkSize { - end := start + chunkSize - if end > len(snap.messages) { - end = len(snap.messages) - } - if err := st.UpsertMessages(ctx, snap.messages[start:end]); err != nil { - return err - } - } - if err := st.SetSyncState(ctx, syncScope, time.Now().UTC().Format(time.RFC3339Nano)); err != nil { - return err - } - return st.SetSyncState(ctx, archiveScope, path) -} - -func parseAccount(records []map[string]any, snap *importSnapshot) bool { - for _, record := range records { - raw, ok := record["account"].(map[string]any) - if !ok { - continue - } - acc := account{ - ID: stringValue(raw["accountId"]), - Username: stringValue(raw["username"]), - DisplayName: firstString(raw["accountDisplayName"], raw["name"]), - Email: stringValue(raw["email"]), - CreatedAt: stringValue(raw["createdAt"]), - Raw: raw, - } - if acc.ID == "" || acc.Username == "" { - continue - } - snap.account = acc - snap.members[memberKey(GuildID, acc.ID)] = memberFromAccount(acc) - return true - } - return false -} - -func parseTweets(records []map[string]any, snap *importSnapshot) int { - count := 0 - for _, record := range records { - raw, ok := record["tweet"].(map[string]any) - if !ok { - continue - } - id := firstString(raw["id_str"], raw["id"]) - text := html.UnescapeString(stringValue(raw["full_text"])) - createdAt := parseTime(firstString(raw["created_at"], raw["createdAt"])) - if id == "" || text == "" || createdAt.IsZero() { - continue - } - authorID := firstString(raw["author_id"], raw["user_id_str"], nestedString(raw, "user", "id_str"), snap.account.ID) - authorName := snap.account.Username - msg := store.MessageMutation{ - Record: store.MessageRecord{ - ID: "x:tweet:" + id, - GuildID: GuildID, - ChannelID: tweetsChannelID, - ChannelName: "tweets", - AuthorID: authorID, - AuthorName: authorName, - MessageType: 0, - CreatedAt: createdAt.UTC().Format(time.RFC3339Nano), - Content: text, - NormalizedContent: text, - ReplyToMessageID: prefixedTweetID(firstString(raw["in_reply_to_status_id_str"], raw["in_reply_to_status_id"])), - HasAttachments: hasMedia(raw), - RawJSON: rawJSON(map[string]any{"platform": "x", "type": "tweet", "tweet": raw}), - }, - Mentions: tweetMentions(id, authorID, createdAt, raw), - } - snap.messages = append(snap.messages, msg) - count++ - } - return count -} - -func parseLikes(records []map[string]any, snap *importSnapshot, fallback time.Time) int { - count := 0 - for _, record := range records { - raw, ok := record["like"].(map[string]any) - if !ok { - continue - } - tweetID := stringValue(raw["tweetId"]) - text := html.UnescapeString(stringValue(raw["fullText"])) - if tweetID == "" || text == "" { - continue - } - createdAt := twitterSnowflakeTime(tweetID) - if createdAt.IsZero() { - createdAt = fallback - } - msg := store.MessageMutation{Record: store.MessageRecord{ - ID: "x:like:" + tweetID, - GuildID: GuildID, - ChannelID: likesChannelID, - ChannelName: "likes", - AuthorID: "x:liked", - AuthorName: "liked", - MessageType: 0, - CreatedAt: createdAt.UTC().Format(time.RFC3339Nano), - Content: text, - NormalizedContent: text, - RawJSON: rawJSON(map[string]any{"platform": "x", "type": "like", "like": raw}), - }} - snap.members[memberKey(GuildID, "x:liked")] = store.MemberRecord{GuildID: GuildID, UserID: "x:liked", Username: "liked", DisplayName: "Liked Tweets", RoleIDsJSON: `[]`, RawJSON: `{}`} - snap.messages = append(snap.messages, msg) - count++ - } - return count -} - -func parseDMs(records []map[string]any, snap *importSnapshot) (int, int) { - conversations := 0 - messages := 0 - for _, record := range records { - rawConv, ok := record["dmConversation"].(map[string]any) - if !ok { - continue - } - conversationID := stringValue(rawConv["conversationId"]) - rawMessages, ok := rawConv["messages"].([]any) - if conversationID == "" || !ok { - continue - } - channelID := "x:dm:" + conversationID - if _, seen := snap.conversations[conversationID]; !seen { - snap.conversations[conversationID] = struct{}{} - snap.channels[channelID] = channel(channelID, conversationName(conversationID, snap.account.ID), "dm") - conversations++ - } - for _, item := range rawMessages { - rawMessage, ok := item.(map[string]any) - if !ok { - continue - } - messageCreate, ok := rawMessage["messageCreate"].(map[string]any) - if !ok { - continue - } - id := stringValue(messageCreate["id"]) - text := html.UnescapeString(stringValue(messageCreate["text"])) - createdAt := parseTime(stringValue(messageCreate["createdAt"])) - senderID := stringValue(messageCreate["senderId"]) - if id == "" || createdAt.IsZero() || (text == "" && len(stringSlice(messageCreate["mediaUrls"])) == 0) { - continue - } - participants := participantsForMessage(conversationID, senderID, stringValue(messageCreate["recipientId"])) - for _, participant := range participants { - upsertGenericMember(snap, participant) - } - attachments := dmAttachments(id, channelID, senderID, messageCreate) - snap.messages = append(snap.messages, store.MessageMutation{ - Record: store.MessageRecord{ - ID: "x:dm:" + id, - GuildID: GuildID, - ChannelID: channelID, - ChannelName: snap.channels[channelID].Name, - AuthorID: senderID, - AuthorName: authorLabel(senderID, snap.account), - MessageType: 0, - CreatedAt: createdAt.UTC().Format(time.RFC3339Nano), - Content: text, - NormalizedContent: text, - HasAttachments: len(attachments) > 0, - RawJSON: rawJSON(map[string]any{"platform": "x", "type": "dm", "conversation_id": conversationID, "message": messageCreate}), - }, - Attachments: attachments, - }) - messages++ - } - } - return conversations, messages -} - -func channel(id, name, kind string) store.ChannelRecord { - return store.ChannelRecord{ID: id, GuildID: GuildID, Kind: kind, Name: name, RawJSON: rawJSON(map[string]any{"platform": "x", "kind": kind})} -} - -func memberFromAccount(acc account) store.MemberRecord { - return store.MemberRecord{ - GuildID: GuildID, - UserID: acc.ID, - Username: acc.Username, - DisplayName: acc.DisplayName, - JoinedAt: acc.CreatedAt, - RoleIDsJSON: `[]`, - RawJSON: rawJSON(map[string]any{"platform": "x", "account": acc.Raw, "email": acc.Email}), - } -} - -func upsertGenericMember(snap *importSnapshot, id string) { - if id == "" { - return - } - if snap.account.ID == id { - snap.members[memberKey(GuildID, id)] = memberFromAccount(snap.account) - return - } - key := memberKey(GuildID, id) - if _, ok := snap.members[key]; ok { - return - } - snap.members[key] = store.MemberRecord{GuildID: GuildID, UserID: id, Username: "user-" + shortID(id), RoleIDsJSON: `[]`, RawJSON: rawJSON(map[string]any{"platform": "x"})} -} - -func tweetMentions(tweetID, authorID string, createdAt time.Time, raw map[string]any) []store.MentionEventRecord { - entities, _ := raw["entities"].(map[string]any) - rawMentions, _ := entities["user_mentions"].([]any) - var mentions []store.MentionEventRecord - for _, item := range rawMentions { - mention, ok := item.(map[string]any) - if !ok { - continue - } - targetID := stringValue(firstAny(mention["id_str"], mention["id"])) - targetName := firstString(mention["screen_name"], mention["name"]) - if targetID == "" && targetName == "" { - continue - } - mentions = append(mentions, store.MentionEventRecord{ - MessageID: "x:tweet:" + tweetID, - GuildID: GuildID, - ChannelID: tweetsChannelID, - AuthorID: authorID, - TargetType: "user", - TargetID: targetID, - TargetName: targetName, - EventAt: createdAt.UTC().Format(time.RFC3339Nano), - }) - } - return mentions -} - -func hasMedia(raw map[string]any) bool { - extended, _ := raw["extended_entities"].(map[string]any) - media, _ := extended["media"].([]any) - return len(media) > 0 -} diff --git a/internal/twitterarchive/import_test.go b/internal/twitterarchive/import_test.go deleted file mode 100644 index 7bdef78..0000000 --- a/internal/twitterarchive/import_test.go +++ /dev/null @@ -1,121 +0,0 @@ -package twitterarchive - -import ( - "archive/zip" - "context" - "os" - "path/filepath" - "testing" - "time" - - "github.com/stretchr/testify/require" - - "github.com/steipete/discrawl/internal/store" -) - -func TestImportTwitterArchiveWritesSearchableMessages(t *testing.T) { - ctx := context.Background() - dir := t.TempDir() - archivePath := filepath.Join(dir, "twitter.zip") - writeTestArchive(t, archivePath) - - st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db")) - require.NoError(t, err) - defer func() { _ = st.Close() }() - - stats, err := Import(ctx, st, Options{ - Path: archivePath, - Now: func() time.Time { return time.Date(2026, 4, 26, 12, 0, 0, 0, time.UTC) }, - }) - require.NoError(t, err) - require.Equal(t, 1, stats.Accounts) - require.Equal(t, 1, stats.Tweets) - require.Equal(t, 1, stats.Likes) - require.Equal(t, 1, stats.DMConversations) - require.Equal(t, 1, stats.DMMessages) - - results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "pull requests", GuildIDs: []string{GuildID}, Limit: 10}) - require.NoError(t, err) - require.Len(t, results, 1) - require.Equal(t, "x:tweet:1952542067017584782", results[0].MessageID) - require.Equal(t, "steipete", results[0].AuthorName) - - results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "secret roadmap", GuildIDs: []string{GuildID}, Limit: 10}) - require.NoError(t, err) - require.Len(t, results, 1) - require.Equal(t, "x:dm:1052590933307461636", results[0].MessageID) - - cursor, err := st.GetSyncState(ctx, archiveScope) - require.NoError(t, err) - require.Equal(t, archivePath, cursor) -} - -func TestImportTwitterArchiveDryRunDoesNotWrite(t *testing.T) { - ctx := context.Background() - dir := t.TempDir() - archivePath := filepath.Join(dir, "twitter.zip") - writeTestArchive(t, archivePath) - - stats, err := Import(ctx, nil, Options{Path: archivePath, DryRun: true}) - require.NoError(t, err) - require.Equal(t, 1, stats.Tweets) - require.True(t, stats.DryRun) -} - -func writeTestArchive(t *testing.T, path string) { - t.Helper() - file, err := os.Create(path) - require.NoError(t, err) - defer func() { _ = file.Close() }() - - zw := zip.NewWriter(file) - defer func() { require.NoError(t, zw.Close()) }() - - writeZipEntry(t, zw, "data/account.js", `window.YTD.account.part0 = [{ - "account": { - "email": "steipete@gmail.com", - "username": "steipete", - "accountId": "25401953", - "createdAt": "2009-03-19T22:54:05.000Z", - "accountDisplayName": "Peter Steinberger" - } -}]`) - writeZipEntry(t, zw, "data/tweets.js", `window.YTD.tweets.part0 = [{ - "tweet": { - "id_str": "1952542067017584782", - "created_at": "Tue Aug 05 01:27:59 +0000 2025", - "full_text": "Getting pull requests with zero user testing aint it.", - "entities": {"user_mentions": [{"screen_name": "alice", "id_str": "42"}]} - } -}]`) - writeZipEntry(t, zw, "data/like.js", `window.YTD.like.part0 = [{ - "like": { - "tweetId": "1952539858771275983", - "fullText": "Liked archive import smoke" - } -}]`) - writeZipEntry(t, zw, "data/direct-messages.js", `window.YTD.direct_messages.part0 = [{ - "dmConversation": { - "conversationId": "929-25401953", - "messages": [{ - "messageCreate": { - "recipientId": "929", - "senderId": "25401953", - "id": "1052590933307461636", - "createdAt": "2018-10-17T16:03:29.391Z", - "text": "secret roadmap", - "mediaUrls": [], - "urls": [] - } - }] - } -}]`) -} - -func writeZipEntry(t *testing.T, zw *zip.Writer, name, body string) { - t.Helper() - w, err := zw.Create(name) - require.NoError(t, err) - _, err = w.Write([]byte(body)) - require.NoError(t, err) -}