feat: import twitter archives

This commit is contained in:
Peter Steinberger 2026-04-26 01:24:21 +01:00
parent 7f3ed7a8e7
commit c8986cbcf9
No known key found for this signature in database
9 changed files with 873 additions and 0 deletions

View File

@ -2,6 +2,12 @@
All notable changes to `discrawl` will be documented in this file.
## Unreleased
### Changes
- Added `twitter import` / `x import` for local X/Twitter archive `.zip` files, storing tweets, likes, and direct messages under synthetic guild id `x` so existing Discrawl search and SQL commands can query them.
## 0.6.1 - 2026-04-25
### Maintenance

View File

@ -268,6 +268,24 @@ Notes:
- does not extract, store, or print Discord auth tokens
- `--max-file-bytes` skips unusually large files; default is 64 MiB
### `twitter import`
Imports a local X/Twitter archive `.zip` into the same SQLite message/search tables.
```bash
discrawl twitter import --archive ~/Downloads/twitter-2025-08-05.zip
discrawl x import --archive ~/Downloads/twitter-2025-08-05.zip --dry-run
discrawl search --guild x "launch checklist"
```
Notes:
- stores imported tweets, likes, and direct messages under synthetic guild id `x`
- maps tweets to `x:tweets`, liked tweet text to `x:likes`, and each DM conversation to an `x:dm:*` channel
- uses the archive's JS assignment files such as `data/account.js`, `data/tweets*.js`, `data/like.js`, `data/direct-messages.js`, and `data/direct-messages-group.js`
- prefixes imported message ids with `x:tweet:`, `x:like:`, or `x:dm:` so they cannot collide with Discord snowflakes
- stamps `twitter:last_import` and `twitter:last_archive` in `sync_state`
### `search`
Searches archived messages. FTS is the default mode and works without embeddings.

View File

@ -128,6 +128,8 @@ func (r *runtime) dispatch(rest []string) error {
return r.withServices(true, func() error { return r.runTail(rest[1:]) })
case "wiretap":
return r.withLocalStoreDefault(false, func() error { return r.runWiretap(rest[1:]) })
case "twitter", "x":
return r.withLocalStoreDefault(false, func() error { return r.runTwitter(rest[1:]) })
case "search":
autoShareUpdate := !hasBoolFlag(rest[1:], "--dm")
return r.withLocalStoreDefault(autoShareUpdate, func() error { return r.runSearch(rest[1:]) })

View File

@ -1,6 +1,7 @@
package cli
import (
"archive/zip"
"bytes"
"context"
"encoding/json"
@ -182,6 +183,49 @@ func TestWiretapImportsDesktopDirectMessages(t *testing.T) {
require.Contains(t, out.String(), "secret DM launch plan")
}
func TestTwitterImportCommand(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cfgPath := filepath.Join(dir, "config.toml")
dbPath := filepath.Join(dir, "discrawl.db")
archivePath := filepath.Join(dir, "twitter.zip")
writeTwitterArchiveFixture(t, archivePath)
cfg := config.Default()
cfg.DBPath = dbPath
cfg.Discord.TokenSource = "none"
require.NoError(t, config.Write(cfgPath, cfg))
var out bytes.Buffer
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "twitter", "import", "--archive", archivePath}, &out, &bytes.Buffer{}))
require.Contains(t, out.String(), "tweets=1")
require.Contains(t, out.String(), "dm_messages=1")
out.Reset()
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "search", "--guild", "x", "secret roadmap"}, &out, &bytes.Buffer{}))
require.Contains(t, out.String(), "secret roadmap")
}
func writeTwitterArchiveFixture(t *testing.T, path string) {
t.Helper()
file, err := os.Create(path)
require.NoError(t, err)
defer func() { _ = file.Close() }()
zw := zip.NewWriter(file)
defer func() { require.NoError(t, zw.Close()) }()
writeTwitterZipEntry(t, zw, "data/account.js", `window.YTD.account.part0 = [{"account":{"username":"steipete","accountId":"25401953","accountDisplayName":"Peter Steinberger"}}]`)
writeTwitterZipEntry(t, zw, "data/tweets.js", `window.YTD.tweets.part0 = [{"tweet":{"id_str":"1952542067017584782","created_at":"Tue Aug 05 01:27:59 +0000 2025","full_text":"archive tweet search text","entities":{"user_mentions":[]}}}]`)
writeTwitterZipEntry(t, zw, "data/direct-messages.js", `window.YTD.direct_messages.part0 = [{"dmConversation":{"conversationId":"929-25401953","messages":[{"messageCreate":{"recipientId":"929","senderId":"25401953","id":"1052590933307461636","createdAt":"2018-10-17T16:03:29.391Z","text":"secret roadmap","mediaUrls":[]}}]}}]`)
}
func writeTwitterZipEntry(t *testing.T, zw *zip.Writer, name, body string) {
t.Helper()
w, err := zw.Create(name)
require.NoError(t, err)
_, err = w.Write([]byte(body))
require.NoError(t, err)
}
func TestParseMessageWindow(t *testing.T) {
rt := &runtime{now: func() time.Time {
return time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC)

View File

@ -13,6 +13,7 @@ import (
"github.com/steipete/discrawl/internal/discorddesktop"
"github.com/steipete/discrawl/internal/store"
"github.com/steipete/discrawl/internal/syncer"
"github.com/steipete/discrawl/internal/twitterarchive"
)
func (r *runtime) print(value any) error {
@ -85,6 +86,8 @@ Commands:
sync
tail
wiretap
twitter
x
search
messages
dms
@ -134,6 +137,10 @@ func printHuman(w io.Writer, value any) error {
_, err := fmt.Fprintf(w, "path=%s\nfiles=%d\nskipped=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ndry_run=%t\n",
v.Path, v.FilesScanned, v.FilesSkipped, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.DryRun)
return err
case twitterarchive.Stats:
_, err := fmt.Fprintf(w, "path=%s\nfiles=%d\naccounts=%d\ntweets=%d\nlikes=%d\ndm_conversations=%d\ndm_messages=%d\nskipped=%d\ndry_run=%t\n",
v.Path, v.FilesScanned, v.Accounts, v.Tweets, v.Likes, v.DMConversations, v.DMMessages, v.Skipped, v.DryRun)
return err
case store.Status:
_, err := fmt.Fprintf(w, "db=%s\nguilds=%d\nchannels=%d\nthreads=%d\nmessages=%d\nmembers=%d\nembedding_backlog=%d\nlast_sync=%s\nlast_tail_event=%s\n",
v.DBPath, v.GuildCount, v.ChannelCount, v.ThreadCount, v.MessageCount, v.MemberCount, v.EmbeddingBacklog,

View File

@ -0,0 +1,44 @@
package cli
import (
"flag"
"fmt"
"io"
"github.com/steipete/discrawl/internal/twitterarchive"
)
func (r *runtime) runTwitter(args []string) error {
if len(args) == 0 {
return usageErr(fmt.Errorf("twitter requires subcommand: import"))
}
switch args[0] {
case "import":
return r.runTwitterImport(args[1:])
default:
return usageErr(fmt.Errorf("unknown twitter subcommand %q", args[0]))
}
}
func (r *runtime) runTwitterImport(args []string) error {
fs := flag.NewFlagSet("twitter import", flag.ContinueOnError)
fs.SetOutput(io.Discard)
archivePath := fs.String("archive", "", "")
fs.StringVar(archivePath, "path", "", "")
dryRun := fs.Bool("dry-run", false, "")
if err := fs.Parse(args); err != nil {
return usageErr(err)
}
if fs.NArg() > 0 {
return usageErr(fmt.Errorf("twitter import takes flags only"))
}
stats, err := twitterarchive.Import(r.ctx, r.store, twitterarchive.Options{
Path: *archivePath,
DryRun: *dryRun,
Now: r.now,
})
if err != nil {
return err
}
return r.print(stats)
}

View File

@ -0,0 +1,235 @@
package twitterarchive
import (
"archive/zip"
"encoding/json"
"errors"
"fmt"
"io"
"path/filepath"
"sort"
"strconv"
"strings"
"time"
"github.com/steipete/discrawl/internal/store"
)
func classify(path string) string {
normalized := filepath.ToSlash(path)
base := filepath.Base(normalized)
switch {
case strings.HasSuffix(normalized, "/data/account.js") || normalized == "data/account.js":
return "account"
case (base == "tweets.js" || strings.HasPrefix(base, "tweets-part")) && strings.HasSuffix(base, ".js"):
return "tweets"
case base == "like.js" || base == "likes.js":
return "likes"
case (base == "direct-messages.js" || base == "direct-messages-group.js") && strings.HasSuffix(base, ".js"):
return "dms"
default:
return ""
}
}
func readZipText(file *zip.File) (string, error) {
rc, err := file.Open()
if err != nil {
return "", err
}
defer func() { _ = rc.Close() }()
var b strings.Builder
if _, err := io.Copy(&b, rc); err != nil {
return "", err
}
return b.String(), nil
}
func parseArchiveArray(content string) ([]map[string]any, error) {
idx := strings.Index(content, "=")
if idx < 0 {
return nil, errors.New("missing assignment")
}
payload := strings.TrimSuffix(strings.TrimSpace(content[idx+1:]), ";")
dec := json.NewDecoder(strings.NewReader(payload))
dec.UseNumber()
var records []map[string]any
if err := dec.Decode(&records); err != nil {
return nil, err
}
return records, nil
}
func dmAttachments(messageID, channelID, authorID string, raw map[string]any) []store.AttachmentRecord {
urls := stringSlice(raw["mediaUrls"])
attachments := make([]store.AttachmentRecord, 0, len(urls))
for i, url := range urls {
attachments = append(attachments, store.AttachmentRecord{
AttachmentID: fmt.Sprintf("x:dm:%s:media:%d", messageID, i),
MessageID: "x:dm:" + messageID,
GuildID: GuildID,
ChannelID: channelID,
AuthorID: authorID,
Filename: filepath.Base(url),
URL: url,
})
}
return attachments
}
func participantsForMessage(conversationID, senderID, recipientID string) []string {
set := map[string]struct{}{}
for _, value := range []string{senderID, recipientID} {
if value != "" {
set[value] = struct{}{}
}
}
for _, part := range strings.Split(conversationID, "-") {
if part != "" {
set[part] = struct{}{}
}
}
out := make([]string, 0, len(set))
for id := range set {
out = append(out, id)
}
sort.Strings(out)
return out
}
func conversationName(conversationID, selfID string) string {
parts := strings.Split(conversationID, "-")
if len(parts) == 2 {
for _, part := range parts {
if part != "" && part != selfID {
return "dm-" + shortID(part)
}
}
}
return "group-" + shortID(conversationID)
}
func authorLabel(id string, acc account) string {
if id == acc.ID && acc.Username != "" {
return acc.Username
}
return "user-" + shortID(id)
}
func parseTime(value string) time.Time {
value = strings.TrimSpace(value)
for _, layout := range []string{time.RFC3339Nano, time.RFC3339, "Mon Jan 02 15:04:05 -0700 2006"} {
if t, err := time.Parse(layout, value); err == nil {
return t
}
}
return time.Time{}
}
func twitterSnowflakeTime(id string) time.Time {
n, err := strconv.ParseInt(id, 10, 64)
if err != nil || n <= 0 {
return time.Time{}
}
const twitterEpochMs = int64(1288834974657)
ms := (n >> 22) + twitterEpochMs
return time.UnixMilli(ms).UTC()
}
func sortedFiles(files []*zip.File) []*zip.File {
out := append([]*zip.File(nil), files...)
sort.Slice(out, func(i, j int) bool { return out[i].Name < out[j].Name })
return out
}
func sortedChannels(channels map[string]store.ChannelRecord) []store.ChannelRecord {
out := make([]store.ChannelRecord, 0, len(channels))
for _, ch := range channels {
out = append(out, ch)
}
sort.Slice(out, func(i, j int) bool { return out[i].ID < out[j].ID })
return out
}
func sortedMembers(members map[string]store.MemberRecord) []store.MemberRecord {
out := make([]store.MemberRecord, 0, len(members))
for _, member := range members {
out = append(out, member)
}
sort.Slice(out, func(i, j int) bool { return out[i].UserID < out[j].UserID })
return out
}
func memberKey(guildID, userID string) string { return guildID + "\x00" + userID }
func prefixedTweetID(id string) string {
if id == "" {
return ""
}
return "x:tweet:" + id
}
func shortID(id string) string {
if len(id) <= 8 {
return id
}
return id[len(id)-8:]
}
func rawJSON(value any) string {
data, err := json.Marshal(value)
if err != nil {
return `{}`
}
return string(data)
}
func stringValue(value any) string {
switch typed := value.(type) {
case string:
return strings.TrimSpace(typed)
case json.Number:
return typed.String()
case float64:
return strconv.FormatInt(int64(typed), 10)
default:
return ""
}
}
func firstString(values ...any) string {
for _, value := range values {
if s := stringValue(value); s != "" {
return s
}
}
return ""
}
func firstAny(values ...any) any {
for _, value := range values {
if stringValue(value) != "" {
return value
}
}
return nil
}
func nestedString(raw map[string]any, key, nested string) string {
child, _ := raw[key].(map[string]any)
return stringValue(child[nested])
}
func stringSlice(value any) []string {
raw, ok := value.([]any)
if !ok {
return nil
}
out := make([]string, 0, len(raw))
for _, item := range raw {
if s := stringValue(item); s != "" {
out = append(out, s)
}
}
return out
}

View File

@ -0,0 +1,396 @@
package twitterarchive
import (
"archive/zip"
"context"
"errors"
"fmt"
"html"
"strings"
"time"
"github.com/steipete/discrawl/internal/store"
)
const (
GuildID = "x"
GuildName = "X / Twitter Archive"
tweetsChannelID = "x:tweets"
likesChannelID = "x:likes"
syncScope = "twitter:last_import"
archiveScope = "twitter:last_archive"
)
type Options struct {
Path string
DryRun bool
Now func() time.Time
}
type Stats struct {
Path string `json:"path"`
FilesScanned int `json:"files_scanned"`
Accounts int `json:"accounts"`
Tweets int `json:"tweets"`
Likes int `json:"likes"`
DMConversations int `json:"dm_conversations"`
DMMessages int `json:"dm_messages"`
Skipped int `json:"skipped"`
DryRun bool `json:"dry_run,omitempty"`
StartedAt time.Time `json:"started_at"`
FinishedAt time.Time `json:"finished_at"`
}
type account struct {
ID string
Username string
DisplayName string
Email string
CreatedAt string
Raw map[string]any
}
type importSnapshot struct {
account account
channels map[string]store.ChannelRecord
members map[string]store.MemberRecord
messages []store.MessageMutation
conversations map[string]struct{}
}
func Import(ctx context.Context, st *store.Store, opts Options) (Stats, error) {
if st == nil && !opts.DryRun {
return Stats{}, errors.New("store is required")
}
if strings.TrimSpace(opts.Path) == "" {
return Stats{}, errors.New("archive path is required")
}
now := opts.Now
if now == nil {
now = time.Now
}
stats := Stats{Path: opts.Path, DryRun: opts.DryRun, StartedAt: now().UTC()}
reader, err := zip.OpenReader(opts.Path)
if err != nil {
stats.FinishedAt = now().UTC()
return stats, fmt.Errorf("open twitter archive: %w", err)
}
defer func() { _ = reader.Close() }()
snap := importSnapshot{
channels: map[string]store.ChannelRecord{},
members: map[string]store.MemberRecord{},
conversations: map[string]struct{}{},
}
snap.channels[tweetsChannelID] = channel(tweetsChannelID, "tweets", "tweet")
snap.channels[likesChannelID] = channel(likesChannelID, "likes", "like")
for _, file := range sortedFiles(reader.File) {
if ctx.Err() != nil {
return stats, ctx.Err()
}
kind := classify(file.Name)
if kind == "" {
continue
}
content, err := readZipText(file)
if err != nil {
stats.Skipped++
continue
}
records, err := parseArchiveArray(content)
if err != nil {
stats.Skipped++
continue
}
stats.FilesScanned++
switch kind {
case "account":
if parseAccount(records, &snap) {
stats.Accounts = 1
}
case "tweets":
stats.Tweets += parseTweets(records, &snap)
case "likes":
stats.Likes += parseLikes(records, &snap, now().UTC())
case "dms":
conversations, messages := parseDMs(records, &snap)
stats.DMConversations += conversations
stats.DMMessages += messages
}
}
if snap.account.ID != "" {
snap.members[memberKey(GuildID, snap.account.ID)] = memberFromAccount(snap.account)
}
for i := range snap.messages {
if snap.messages[i].Record.AuthorID == "" && snap.account.ID != "" {
snap.messages[i].Record.AuthorID = snap.account.ID
snap.messages[i].Record.AuthorName = snap.account.Username
}
}
if !opts.DryRun {
if err := writeSnapshot(ctx, st, snap, opts.Path); err != nil {
return stats, err
}
}
stats.FinishedAt = now().UTC()
return stats, nil
}
func writeSnapshot(ctx context.Context, st *store.Store, snap importSnapshot, path string) error {
if err := st.UpsertGuild(ctx, store.GuildRecord{ID: GuildID, Name: GuildName, RawJSON: rawJSON(map[string]any{"platform": "x"})}); err != nil {
return err
}
for _, member := range sortedMembers(snap.members) {
if err := st.UpsertMember(ctx, member); err != nil {
return err
}
}
for _, ch := range sortedChannels(snap.channels) {
if err := st.UpsertChannel(ctx, ch); err != nil {
return err
}
}
const chunkSize = 1000
for start := 0; start < len(snap.messages); start += chunkSize {
end := start + chunkSize
if end > len(snap.messages) {
end = len(snap.messages)
}
if err := st.UpsertMessages(ctx, snap.messages[start:end]); err != nil {
return err
}
}
if err := st.SetSyncState(ctx, syncScope, time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
return err
}
return st.SetSyncState(ctx, archiveScope, path)
}
func parseAccount(records []map[string]any, snap *importSnapshot) bool {
for _, record := range records {
raw, ok := record["account"].(map[string]any)
if !ok {
continue
}
acc := account{
ID: stringValue(raw["accountId"]),
Username: stringValue(raw["username"]),
DisplayName: firstString(raw["accountDisplayName"], raw["name"]),
Email: stringValue(raw["email"]),
CreatedAt: stringValue(raw["createdAt"]),
Raw: raw,
}
if acc.ID == "" || acc.Username == "" {
continue
}
snap.account = acc
snap.members[memberKey(GuildID, acc.ID)] = memberFromAccount(acc)
return true
}
return false
}
func parseTweets(records []map[string]any, snap *importSnapshot) int {
count := 0
for _, record := range records {
raw, ok := record["tweet"].(map[string]any)
if !ok {
continue
}
id := firstString(raw["id_str"], raw["id"])
text := html.UnescapeString(stringValue(raw["full_text"]))
createdAt := parseTime(firstString(raw["created_at"], raw["createdAt"]))
if id == "" || text == "" || createdAt.IsZero() {
continue
}
authorID := firstString(raw["author_id"], raw["user_id_str"], nestedString(raw, "user", "id_str"), snap.account.ID)
authorName := snap.account.Username
msg := store.MessageMutation{
Record: store.MessageRecord{
ID: "x:tweet:" + id,
GuildID: GuildID,
ChannelID: tweetsChannelID,
ChannelName: "tweets",
AuthorID: authorID,
AuthorName: authorName,
MessageType: 0,
CreatedAt: createdAt.UTC().Format(time.RFC3339Nano),
Content: text,
NormalizedContent: text,
ReplyToMessageID: prefixedTweetID(firstString(raw["in_reply_to_status_id_str"], raw["in_reply_to_status_id"])),
HasAttachments: hasMedia(raw),
RawJSON: rawJSON(map[string]any{"platform": "x", "type": "tweet", "tweet": raw}),
},
Mentions: tweetMentions(id, authorID, createdAt, raw),
}
snap.messages = append(snap.messages, msg)
count++
}
return count
}
func parseLikes(records []map[string]any, snap *importSnapshot, fallback time.Time) int {
count := 0
for _, record := range records {
raw, ok := record["like"].(map[string]any)
if !ok {
continue
}
tweetID := stringValue(raw["tweetId"])
text := html.UnescapeString(stringValue(raw["fullText"]))
if tweetID == "" || text == "" {
continue
}
createdAt := twitterSnowflakeTime(tweetID)
if createdAt.IsZero() {
createdAt = fallback
}
msg := store.MessageMutation{Record: store.MessageRecord{
ID: "x:like:" + tweetID,
GuildID: GuildID,
ChannelID: likesChannelID,
ChannelName: "likes",
AuthorID: "x:liked",
AuthorName: "liked",
MessageType: 0,
CreatedAt: createdAt.UTC().Format(time.RFC3339Nano),
Content: text,
NormalizedContent: text,
RawJSON: rawJSON(map[string]any{"platform": "x", "type": "like", "like": raw}),
}}
snap.members[memberKey(GuildID, "x:liked")] = store.MemberRecord{GuildID: GuildID, UserID: "x:liked", Username: "liked", DisplayName: "Liked Tweets", RoleIDsJSON: `[]`, RawJSON: `{}`}
snap.messages = append(snap.messages, msg)
count++
}
return count
}
func parseDMs(records []map[string]any, snap *importSnapshot) (int, int) {
conversations := 0
messages := 0
for _, record := range records {
rawConv, ok := record["dmConversation"].(map[string]any)
if !ok {
continue
}
conversationID := stringValue(rawConv["conversationId"])
rawMessages, ok := rawConv["messages"].([]any)
if conversationID == "" || !ok {
continue
}
channelID := "x:dm:" + conversationID
if _, seen := snap.conversations[conversationID]; !seen {
snap.conversations[conversationID] = struct{}{}
snap.channels[channelID] = channel(channelID, conversationName(conversationID, snap.account.ID), "dm")
conversations++
}
for _, item := range rawMessages {
rawMessage, ok := item.(map[string]any)
if !ok {
continue
}
messageCreate, ok := rawMessage["messageCreate"].(map[string]any)
if !ok {
continue
}
id := stringValue(messageCreate["id"])
text := html.UnescapeString(stringValue(messageCreate["text"]))
createdAt := parseTime(stringValue(messageCreate["createdAt"]))
senderID := stringValue(messageCreate["senderId"])
if id == "" || createdAt.IsZero() || (text == "" && len(stringSlice(messageCreate["mediaUrls"])) == 0) {
continue
}
participants := participantsForMessage(conversationID, senderID, stringValue(messageCreate["recipientId"]))
for _, participant := range participants {
upsertGenericMember(snap, participant)
}
attachments := dmAttachments(id, channelID, senderID, messageCreate)
snap.messages = append(snap.messages, store.MessageMutation{
Record: store.MessageRecord{
ID: "x:dm:" + id,
GuildID: GuildID,
ChannelID: channelID,
ChannelName: snap.channels[channelID].Name,
AuthorID: senderID,
AuthorName: authorLabel(senderID, snap.account),
MessageType: 0,
CreatedAt: createdAt.UTC().Format(time.RFC3339Nano),
Content: text,
NormalizedContent: text,
HasAttachments: len(attachments) > 0,
RawJSON: rawJSON(map[string]any{"platform": "x", "type": "dm", "conversation_id": conversationID, "message": messageCreate}),
},
Attachments: attachments,
})
messages++
}
}
return conversations, messages
}
func channel(id, name, kind string) store.ChannelRecord {
return store.ChannelRecord{ID: id, GuildID: GuildID, Kind: kind, Name: name, RawJSON: rawJSON(map[string]any{"platform": "x", "kind": kind})}
}
func memberFromAccount(acc account) store.MemberRecord {
return store.MemberRecord{
GuildID: GuildID,
UserID: acc.ID,
Username: acc.Username,
DisplayName: acc.DisplayName,
JoinedAt: acc.CreatedAt,
RoleIDsJSON: `[]`,
RawJSON: rawJSON(map[string]any{"platform": "x", "account": acc.Raw, "email": acc.Email}),
}
}
func upsertGenericMember(snap *importSnapshot, id string) {
if id == "" {
return
}
if snap.account.ID == id {
snap.members[memberKey(GuildID, id)] = memberFromAccount(snap.account)
return
}
key := memberKey(GuildID, id)
if _, ok := snap.members[key]; ok {
return
}
snap.members[key] = store.MemberRecord{GuildID: GuildID, UserID: id, Username: "user-" + shortID(id), RoleIDsJSON: `[]`, RawJSON: rawJSON(map[string]any{"platform": "x"})}
}
func tweetMentions(tweetID, authorID string, createdAt time.Time, raw map[string]any) []store.MentionEventRecord {
entities, _ := raw["entities"].(map[string]any)
rawMentions, _ := entities["user_mentions"].([]any)
var mentions []store.MentionEventRecord
for _, item := range rawMentions {
mention, ok := item.(map[string]any)
if !ok {
continue
}
targetID := stringValue(firstAny(mention["id_str"], mention["id"]))
targetName := firstString(mention["screen_name"], mention["name"])
if targetID == "" && targetName == "" {
continue
}
mentions = append(mentions, store.MentionEventRecord{
MessageID: "x:tweet:" + tweetID,
GuildID: GuildID,
ChannelID: tweetsChannelID,
AuthorID: authorID,
TargetType: "user",
TargetID: targetID,
TargetName: targetName,
EventAt: createdAt.UTC().Format(time.RFC3339Nano),
})
}
return mentions
}
func hasMedia(raw map[string]any) bool {
extended, _ := raw["extended_entities"].(map[string]any)
media, _ := extended["media"].([]any)
return len(media) > 0
}

View File

@ -0,0 +1,121 @@
package twitterarchive
import (
"archive/zip"
"context"
"os"
"path/filepath"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/store"
)
func TestImportTwitterArchiveWritesSearchableMessages(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
archivePath := filepath.Join(dir, "twitter.zip")
writeTestArchive(t, archivePath)
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{
Path: archivePath,
Now: func() time.Time { return time.Date(2026, 4, 26, 12, 0, 0, 0, time.UTC) },
})
require.NoError(t, err)
require.Equal(t, 1, stats.Accounts)
require.Equal(t, 1, stats.Tweets)
require.Equal(t, 1, stats.Likes)
require.Equal(t, 1, stats.DMConversations)
require.Equal(t, 1, stats.DMMessages)
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "pull requests", GuildIDs: []string{GuildID}, Limit: 10})
require.NoError(t, err)
require.Len(t, results, 1)
require.Equal(t, "x:tweet:1952542067017584782", results[0].MessageID)
require.Equal(t, "steipete", results[0].AuthorName)
results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "secret roadmap", GuildIDs: []string{GuildID}, Limit: 10})
require.NoError(t, err)
require.Len(t, results, 1)
require.Equal(t, "x:dm:1052590933307461636", results[0].MessageID)
cursor, err := st.GetSyncState(ctx, archiveScope)
require.NoError(t, err)
require.Equal(t, archivePath, cursor)
}
func TestImportTwitterArchiveDryRunDoesNotWrite(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
archivePath := filepath.Join(dir, "twitter.zip")
writeTestArchive(t, archivePath)
stats, err := Import(ctx, nil, Options{Path: archivePath, DryRun: true})
require.NoError(t, err)
require.Equal(t, 1, stats.Tweets)
require.True(t, stats.DryRun)
}
func writeTestArchive(t *testing.T, path string) {
t.Helper()
file, err := os.Create(path)
require.NoError(t, err)
defer func() { _ = file.Close() }()
zw := zip.NewWriter(file)
defer func() { require.NoError(t, zw.Close()) }()
writeZipEntry(t, zw, "data/account.js", `window.YTD.account.part0 = [{
"account": {
"email": "steipete@gmail.com",
"username": "steipete",
"accountId": "25401953",
"createdAt": "2009-03-19T22:54:05.000Z",
"accountDisplayName": "Peter Steinberger"
}
}]`)
writeZipEntry(t, zw, "data/tweets.js", `window.YTD.tweets.part0 = [{
"tweet": {
"id_str": "1952542067017584782",
"created_at": "Tue Aug 05 01:27:59 +0000 2025",
"full_text": "Getting pull requests with zero user testing aint it.",
"entities": {"user_mentions": [{"screen_name": "alice", "id_str": "42"}]}
}
}]`)
writeZipEntry(t, zw, "data/like.js", `window.YTD.like.part0 = [{
"like": {
"tweetId": "1952539858771275983",
"fullText": "Liked archive import smoke"
}
}]`)
writeZipEntry(t, zw, "data/direct-messages.js", `window.YTD.direct_messages.part0 = [{
"dmConversation": {
"conversationId": "929-25401953",
"messages": [{
"messageCreate": {
"recipientId": "929",
"senderId": "25401953",
"id": "1052590933307461636",
"createdAt": "2018-10-17T16:03:29.391Z",
"text": "secret roadmap",
"mediaUrls": [],
"urls": []
}
}]
}
}]`)
}
func writeZipEntry(t *testing.T, zw *zip.Writer, name, body string) {
t.Helper()
w, err := zw.Create(name)
require.NoError(t, err)
_, err = w.Write([]byte(body))
require.NoError(t, err)
}