telecrawl/internal/telegramdesktop/scripts/import_tdata.py
2026-05-08 16:45:24 +01:00

140 lines
4.6 KiB
Python

#!/usr/bin/env python3
import argparse
import asyncio
import hashlib
import json
from datetime import datetime, timezone
from opentele2.api import UseCurrentSession
from opentele2.td import TDesktop
def iso(dt):
    """Return *dt* as an ISO-8601 string expressed in UTC.

    A falsy value (such as ``None``) produces the empty string. A naive
    datetime is interpreted as already being in UTC before formatting.
    """
    if dt:
        # Attach UTC to naive values so astimezone() does not assume local time.
        aware = dt if dt.tzinfo is not None else dt.replace(tzinfo=timezone.utc)
        return aware.astimezone(timezone.utc).isoformat()
    return ""
def stable_pk(chat_id, message_id):
    """Derive a deterministic positive integer key for a chat/message pair.

    The pair is rendered as ``"<chat_id>:<message_id>"``, hashed to an
    8-byte BLAKE2b digest, and the top bit is cleared so the result lies in
    ``1 .. 2**63 - 1``. A value of zero is replaced by 1 so the key is
    never 0.
    """
    key = f"{chat_id}:{message_id}".encode()
    raw = hashlib.blake2b(key, digest_size=8).digest()
    masked = int.from_bytes(raw, "big", signed=False) & 0x7FFF_FFFF_FFFF_FFFF
    return masked if masked else 1
def entity_kind(entity):
    """Classify *entity* as "user", "channel" or "group" from its class name.

    The lower-cased class name is searched for the substrings "user",
    "channel" and "chat" (in that order); the first hit decides the kind.
    When none match, the lower-cased class name itself is returned, or
    "unknown" if that name is empty.
    """
    type_name = type(entity).__name__.lower()
    # Order matters: the first matching substring wins.
    kinds_by_substring = (
        ("user", "user"),
        ("channel", "channel"),
        ("chat", "group"),
    )
    for needle, kind in kinds_by_substring:
        if needle in type_name:
            return kind
    return type_name or "unknown"
def display_name(entity, fallback):
    """Pick a human-readable name for *entity*.

    Preference order: ``title``; ``first_name`` (joined with ``last_name``
    when that is set); ``last_name`` alone; ``username``. If none of these
    attributes is present and truthy, *fallback* is returned, or the string
    form of the entity's ``id`` attribute ("" when missing) if *fallback*
    is falsy.
    """
    title = getattr(entity, "title", None)
    if title:
        return title

    first = getattr(entity, "first_name", None)
    if first:
        surname = getattr(entity, "last_name", None)
        if surname:
            return f"{first} {surname}".strip()
        return first

    surname = getattr(entity, "last_name", None)
    if surname:
        return surname

    handle = getattr(entity, "username", None)
    if handle:
        return handle

    return fallback or str(getattr(entity, "id", ""))
def media_type(message):
    """Return a short lower-case label for the media attached to *message*.

    The label is the class name of ``message.media`` with the substring
    "MessageMedia" removed and lower-cased. If removing it leaves nothing,
    the full lower-cased class name is used. A message with no (or falsy)
    ``media`` attribute yields "".
    """
    attachment = getattr(message, "media", None)
    if attachment:
        class_name = type(attachment).__name__
        stripped = class_name.replace("MessageMedia", "").lower()
        return stripped if stripped else class_name.lower()
    return ""
def _sender_fields(msg):
    """Return ``(sender_id, sender_name)`` strings for *msg* ("" when unknown)."""
    sender = getattr(msg, "sender", None)
    if sender is not None:
        sender_id = str(getattr(sender, "id", "") or "")
    elif getattr(msg, "sender_id", None):
        # No resolved sender object; fall back to the raw id on the message.
        sender_id = str(msg.sender_id)
    else:
        sender_id = ""
    sender_name = display_name(sender, "") if sender else ""
    return sender_id, sender_name


def _message_record(msg, chat_id, chat_name):
    """Build the JSON-serialisable dict describing one message."""
    sender_id, sender_name = _sender_fields(msg)
    return {
        "source_pk": stable_pk(chat_id, msg.id),
        "chat_id": chat_id,
        "chat_name": chat_name,
        "message_id": str(msg.id),
        "sender_id": sender_id,
        "sender_name": sender_name,
        "timestamp": iso(getattr(msg, "date", None)),
        "from_me": bool(getattr(msg, "out", False)),
        "text": getattr(msg, "message", "") or "",
        "message_type": type(msg).__name__,
        "media_type": media_type(msg),
        "media_title": "",
    }


async def _export_dialog(client, dialog, messages_limit):
    """Fetch one dialog's messages and return ``(chat_record, message_records)``."""
    entity = dialog.entity
    chat_id = str(dialog.id)
    chat_name = display_name(entity, getattr(dialog, "name", ""))
    messages = await client.get_messages(entity, limit=messages_limit)

    last_message_at = None
    records = []
    for msg in messages:
        # Entries without a usable id are skipped entirely.
        if not getattr(msg, "id", None):
            continue
        stamp = getattr(msg, "date", None)
        if stamp and (last_message_at is None or stamp > last_message_at):
            last_message_at = stamp
        records.append(_message_record(msg, chat_id, chat_name))

    chat_record = {
        "id": chat_id,
        "kind": entity_kind(entity),
        "name": chat_name,
        "username": getattr(entity, "username", "") or "",
        "last_message_at": iso(last_message_at),
        "unread_count": int(getattr(dialog, "unread_count", 0) or 0),
        # Counts every fetched message, including any skipped above.
        "message_count": len(messages),
    }
    return chat_record, records


async def main():
    """Export chats and messages from a Telegram Desktop tdata folder as JSON.

    Command-line arguments:
        --tdata            path handed to ``TDesktop`` (required)
        --session          session argument handed to ``ToTelethon`` (required)
        --dialogs-limit    maximum dialogs to fetch; <= 0 means no limit
        --messages-limit   maximum messages per dialog; <= 0 means no limit

    A single JSON object with the keys ``source_path``, ``started_at``,
    ``finished_at``, ``chats`` and ``messages`` is printed to stdout.

    Raises:
        SystemExit: if the tdata folder does not load or the resulting
            session is not authorized.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--tdata", required=True, help="path to the tdata folder")
    parser.add_argument("--session", required=True, help="Telethon session to use")
    parser.add_argument(
        "--dialogs-limit",
        type=int,
        default=200,
        help="maximum dialogs to fetch (<= 0 for no limit)",
    )
    parser.add_argument(
        "--messages-limit",
        type=int,
        default=500,
        help="maximum messages per dialog (<= 0 for no limit)",
    )
    args = parser.parse_args()

    started = datetime.now(timezone.utc)
    td = TDesktop(args.tdata)
    if not td.isLoaded():
        raise SystemExit("tdata did not load")

    # Non-positive limits are translated to None, i.e. "fetch everything".
    dialogs_limit = args.dialogs_limit if args.dialogs_limit > 0 else None
    messages_limit = args.messages_limit if args.messages_limit > 0 else None

    client = await td.ToTelethon(session=args.session, flag=UseCurrentSession)
    out_chats = []
    out_messages = []
    try:
        await client.connect()
        if not await client.is_user_authorized():
            raise SystemExit("Telegram session is not authorized")
        dialogs = await client.get_dialogs(limit=dialogs_limit)
        for dialog in dialogs:
            chat_record, records = await _export_dialog(client, dialog, messages_limit)
            out_chats.append(chat_record)
            out_messages.extend(records)
    finally:
        # Always release the connection, including on the unauthorized exit
        # and on any error raised while fetching dialogs or messages.
        await client.disconnect()

    # NOTE(review): ensure_ascii=False writes non-ASCII text verbatim, so the
    # bytes emitted depend on stdout's encoding - confirm the caller runs this
    # with a UTF-8 stdout.
    print(
        json.dumps(
            {
                "source_path": args.tdata,
                "started_at": iso(started),
                "finished_at": iso(datetime.now(timezone.utc)),
                "chats": out_chats,
                "messages": out_messages,
            },
            ensure_ascii=False,
        )
    )
# Run the export only when executed as a script, not when imported.
if __name__ == "__main__":
    asyncio.run(main())