discrawl/internal/syncer/enrichment.go
2026-05-05 10:07:56 +01:00

264 lines
6.9 KiB
Go

package syncer
import (
"bufio"
"context"
"errors"
"io"
"mime"
"net/http"
"path/filepath"
"strings"
"time"
"github.com/bwmarrin/discordgo"
"github.com/openclaw/discrawl/internal/store"
)
// Limits applied when attachment text is fetched for indexing.
const (
// attachmentFetchMaxBytes is the largest response body, in bytes, that
// fetchAttachmentText accepts (256 KiB). Larger bodies yield no text.
attachmentFetchMaxBytes = 256 << 10
// attachmentIndexMaxChars is the limit handed to clampText for fetched
// attachment text (8192). clampText compares it against len(text), so the
// limit is measured in bytes rather than in characters.
attachmentIndexMaxChars = 8 << 10
)
// attachmentHTTPClient is the client fetchAttachmentText uses to download
// attachments; every request is bounded by a 5-second timeout.
var attachmentHTTPClient = &http.Client{Timeout: 5 * time.Second}
// buildMessageMutation assembles the store mutation that upserts a Discord
// message together with its attachment and mention records.
//
// The guild ID is resolved with effectiveMessageGuildID, using
// fallbackGuildID as its second argument. The text fragments returned by
// extractAttachments are passed to normalizeMessageParts before the message
// record is built. embeddings is forwarded as WriteOptions.EnqueueEmbedding
// and attachmentText is forwarded to extractAttachments.
//
// If extractAttachments fails, a zero-value mutation and that error are
// returned.
func buildMessageMutation(
	ctx context.Context,
	message *discordgo.Message,
	channelName string,
	fallbackGuildID string,
	embeddings bool,
	attachmentText bool,
) (store.MessageMutation, error) {
	var mutation store.MessageMutation

	guildID := effectiveMessageGuildID(message, fallbackGuildID)
	attachmentRecords, textParts, err := extractAttachments(ctx, message, guildID, attachmentText)
	if err != nil {
		return mutation, err
	}

	record := toMessageRecord(message, channelName, guildID, normalizeMessageParts(message, textParts))

	mutation.Record = record
	mutation.EventType = "upsert"
	// The payload stored with the event is the raw JSON carried by the record.
	mutation.PayloadJSON = record.RawJSON
	mutation.Options = store.WriteOptions{EnqueueEmbedding: embeddings}
	mutation.Attachments = attachmentRecords
	mutation.Mentions = extractMentions(message, guildID)
	return mutation, nil
}
// extractAttachments builds one store.AttachmentRecord per usable attachment
// on message and gathers the text fragments those attachments contribute:
// the trimmed filename and, when attachmentText is set, the fetched body of
// attachments accepted by shouldFetchAttachmentText.
//
// Nil attachments and attachments without an ID are skipped. Fetching is
// best effort: a failed or empty download leaves TextContent unset and is
// not reported to the caller. The returned error is always nil in this
// implementation. A nil message or a message without attachments yields
// nil slices.
func extractAttachments(ctx context.Context, message *discordgo.Message, guildID string, attachmentText bool) ([]store.AttachmentRecord, []string, error) {
	if message == nil {
		return nil, nil, nil
	}
	total := len(message.Attachments)
	if total == 0 {
		return nil, nil, nil
	}

	records := make([]store.AttachmentRecord, 0, total)
	// Each attachment can contribute up to two fragments: name and body.
	parts := make([]string, 0, total*2)

	for _, item := range message.Attachments {
		if item == nil || item.ID == "" {
			continue
		}

		entry := store.AttachmentRecord{
			AttachmentID: item.ID,
			MessageID:    message.ID,
			GuildID:      guildID,
			ChannelID:    message.ChannelID,
			Filename:     item.Filename,
			ContentType:  item.ContentType,
			Size:         int64(item.Size),
			URL:          item.URL,
			ProxyURL:     item.ProxyURL,
		}
		if message.Author != nil {
			entry.AuthorID = message.Author.ID
		}

		if trimmed := strings.TrimSpace(item.Filename); trimmed != "" {
			parts = append(parts, trimmed)
		}

		if attachmentText && shouldFetchAttachmentText(item) && item.URL != "" {
			// A fetch error is treated the same as "no text available".
			if body, fetchErr := fetchAttachmentText(ctx, item.URL); fetchErr == nil && body != "" {
				entry.TextContent = body
				parts = append(parts, body)
			}
		}

		records = append(records, entry)
	}
	return records, parts, nil
}
// extractMentions returns one store.MentionEventRecord for every distinct
// user and role mentioned by message. A nil message yields nil.
//
// Users come first, in message.Mentions order, followed by roles in
// message.MentionRoles order. Nil users, users without an ID and blank role
// IDs are skipped, and repeated targets are recorded only once. A user's
// TargetName is the trimmed GlobalName, falling back to Username when that is
// empty; role records carry no TargetName. EventAt is the message timestamp
// in UTC, formatted as RFC 3339 with nanoseconds.
func extractMentions(message *discordgo.Message, guildID string) []store.MentionEventRecord {
	if message == nil {
		return nil
	}

	authorID := ""
	if message.Author != nil {
		authorID = message.Author.ID
	}

	capacity := len(message.Mentions) + len(message.MentionRoles)
	mentions := make([]store.MentionEventRecord, 0, capacity)
	seen := make(map[string]struct{}, capacity)

	// firstSight reports whether key has not been recorded yet and marks it
	// as recorded.
	firstSight := func(key string) bool {
		if _, dup := seen[key]; dup {
			return false
		}
		seen[key] = struct{}{}
		return true
	}

	// base holds the fields shared by every record produced for this message.
	base := store.MentionEventRecord{
		MessageID: message.ID,
		GuildID:   guildID,
		ChannelID: message.ChannelID,
		AuthorID:  authorID,
		EventAt:   message.Timestamp.UTC().Format(time.RFC3339Nano),
	}

	for _, user := range message.Mentions {
		if user == nil || user.ID == "" || !firstSight("user:"+user.ID) {
			continue
		}
		displayName := strings.TrimSpace(user.GlobalName)
		if displayName == "" {
			displayName = user.Username
		}
		entry := base
		entry.TargetType = "user"
		entry.TargetID = user.ID
		entry.TargetName = displayName
		mentions = append(mentions, entry)
	}

	for _, rawRoleID := range message.MentionRoles {
		roleID := strings.TrimSpace(rawRoleID)
		if roleID == "" || !firstSight("role:"+roleID) {
			continue
		}
		entry := base
		entry.TargetType = "role"
		entry.TargetID = roleID
		mentions = append(mentions, entry)
	}

	return mentions
}
// shouldFetchAttachmentText reports whether attachment looks like plain text
// that is worth downloading for indexing.
//
// A nil attachment, or one rejected by isBlockedAttachment, is never fetched.
// Otherwise the attachment qualifies when its declared content type is
// accepted by isAllowedFetchedContentType, or when its filename carries one of
// the known text extensions.
//
// The declared content type is reduced to its bare media type first, so a
// value with parameters such as "application/json; charset=utf-8" matches the
// same way as "application/json". Using isAllowedFetchedContentType keeps this
// pre-check aligned with the check fetchAttachmentText applies to the response.
func shouldFetchAttachmentText(attachment *discordgo.MessageAttachment) bool {
	if attachment == nil {
		return false
	}
	if isBlockedAttachment(attachment.Filename, attachment.ContentType) {
		return false
	}
	if isAllowedFetchedContentType(normalizedMediaType(attachment.ContentType)) {
		return true
	}
	// Fall back to the filename when the content type is missing or unhelpful.
	switch strings.ToLower(filepath.Ext(strings.TrimSpace(attachment.Filename))) {
	case ".txt", ".md", ".log", ".csv", ".json", ".yaml", ".yml", ".xml":
		return true
	default:
		return false
	}
}
// fetchAttachmentText downloads url with attachmentHTTPClient and returns its
// body as text, trimmed and clamped to attachmentIndexMaxChars by clampText.
//
// An empty string with a nil error means the response was deliberately
// ignored: a non-2xx status, a declared Content-Length above
// attachmentFetchMaxBytes, a Content-Type header or sniffed content type that
// isAllowedFetchedContentType rejects, or a body longer than
// attachmentFetchMaxBytes. Request construction, transport and read failures
// are returned as errors.
func fetchAttachmentText(ctx context.Context, url string) (string, error) {
	request, err := http.NewRequestWithContext(ctx, http.MethodGet, url, nil)
	if err != nil {
		return "", err
	}
	response, err := attachmentHTTPClient.Do(request)
	if err != nil {
		return "", err
	}
	defer func() { _ = response.Body.Close() }()

	switch {
	case response.StatusCode < 200 || response.StatusCode >= 300:
		return "", nil
	case response.ContentLength > attachmentFetchMaxBytes:
		return "", nil
	}

	// A missing Content-Type header is tolerated; the sniffing step below
	// still inspects the body itself.
	if declared := normalizedMediaType(response.Header.Get("Content-Type")); declared != "" && !isAllowedFetchedContentType(declared) {
		return "", nil
	}

	buffered := bufio.NewReader(response.Body)
	// Peek reports io.EOF when the body is shorter than 512 bytes; the bytes
	// it did return are still usable for sniffing.
	head, err := buffered.Peek(512)
	if err != nil && !errors.Is(err, io.EOF) {
		return "", err
	}
	if len(head) > 0 {
		sniffed := normalizedMediaType(http.DetectContentType(head))
		if !isAllowedFetchedContentType(sniffed) {
			return "", nil
		}
	}

	// Read one byte past the limit so an oversized body can be told apart
	// from one that is exactly at the limit.
	payload, err := io.ReadAll(io.LimitReader(buffered, attachmentFetchMaxBytes+1))
	if err != nil {
		return "", err
	}
	if len(payload) > attachmentFetchMaxBytes {
		return "", nil
	}
	return clampText(string(payload), attachmentIndexMaxChars), nil
}
// isBlockedAttachment reports whether an attachment must never be fetched,
// judged by its filename extension (compared case-insensitively after
// trimming the filename) or by its declared content type reduced to a bare
// media type via normalizedMediaType. Either match alone is enough to block.
func isBlockedAttachment(filename, contentType string) bool {
	extension := strings.ToLower(filepath.Ext(strings.TrimSpace(filename)))
	switch extension {
	case ".app",
		".bat",
		".cmd",
		".com",
		".dll",
		".dmg",
		".exe",
		".jar",
		".msi",
		".pkg",
		".ps1",
		".scr",
		".sh":
		return true
	}

	mediaType := normalizedMediaType(contentType)
	switch mediaType {
	case "application/java-archive",
		"application/octet-stream",
		"application/vnd.microsoft.portable-executable",
		"application/x-apple-diskimage",
		"application/x-dosexec",
		"application/x-msdownload",
		"application/x-msi",
		"application/x-sh":
		return true
	}
	return false
}
// isAllowedFetchedContentType reports whether contentType is a media type
// whose body may be indexed as text: any "text/*" type, "application/json"
// or "application/xml".
//
// contentType is compared as given, so callers pass a bare, lower-cased media
// type. The empty string is never allowed. "text/xml" needs no entry of its
// own because the "text/" prefix check already accepts it.
func isAllowedFetchedContentType(contentType string) bool {
	if contentType == "" {
		return false
	}
	if strings.HasPrefix(contentType, "text/") {
		return true
	}
	switch contentType {
	case "application/json", "application/xml":
		return true
	default:
		return false
	}
}
// normalizedMediaType reduces a Content-Type style value to its bare media
// type in lower case, without parameters. Blank input yields "".
//
// The value is parsed with mime.ParseMediaType. When parsing fails, whatever
// precedes the first ';' is trimmed and lower-cased instead, so a malformed
// header still produces a usable answer.
func normalizedMediaType(value string) string {
	trimmed := strings.TrimSpace(value)
	if trimmed == "" {
		return ""
	}
	if parsed, _, err := mime.ParseMediaType(trimmed); err == nil {
		return strings.ToLower(parsed)
	}

	bare := trimmed
	if semi := strings.IndexByte(trimmed, ';'); semi >= 0 {
		bare = trimmed[:semi]
	}
	return strings.ToLower(strings.TrimSpace(bare))
}
// clampText trims surrounding whitespace from text and, when limit is
// positive, shortens the result to at most limit bytes.
//
// The cut never lands inside a multi-byte UTF-8 sequence: if the byte at the
// cut position is a continuation byte, the cut moves back to the start of
// that rune, so valid input stays valid UTF-8. The shortened text is trimmed
// again because the cut may expose trailing whitespace. A limit of zero or
// less disables clamping.
func clampText(text string, limit int) string {
	text = strings.TrimSpace(text)
	if limit <= 0 || len(text) <= limit {
		return text
	}

	cut := limit
	// Continuation bytes have the bit pattern 10xxxxxx. A rune has at most
	// three of them, so three steps back always reach the lead byte of valid
	// UTF-8; the bound keeps malformed input from being cut back without end.
	for back := 0; back < 3 && cut > 0 && text[cut]&0xC0 == 0x80; back++ {
		cut--
	}
	return strings.TrimSpace(text[:cut])
}