497 lines
11 KiB
Go
497 lines
11 KiB
Go
package cmd
|
|
|
|
import (
|
|
"fmt"
|
|
"regexp"
|
|
"strings"
|
|
"unicode/utf16"
|
|
)
|
|
|
|
const (
|
|
fmtBold = "bold"
|
|
fmtBoldItalic = "bolditalic"
|
|
)
|
|
|
|
// MarkdownElementType represents the type of markdown element
|
|
type MarkdownElementType int
|
|
|
|
const (
|
|
MDText MarkdownElementType = iota
|
|
MDHeading1
|
|
MDHeading2
|
|
MDHeading3
|
|
MDHeading4
|
|
MDHeading5
|
|
MDHeading6
|
|
MDBold
|
|
MDItalic
|
|
MDBoldItalic
|
|
MDCode
|
|
MDCodeBlock
|
|
MDLink
|
|
MDImage
|
|
MDListItem
|
|
MDNumberedList
|
|
MDBlockquote
|
|
MDHorizontalRule
|
|
MDParagraph
|
|
MDEmptyLine
|
|
MDTable
|
|
)
|
|
|
|
// MarkdownElement represents a parsed markdown element
|
|
type MarkdownElement struct {
|
|
Type MarkdownElementType
|
|
Content string
|
|
Children []MarkdownElement
|
|
URL string // for links
|
|
Level int // for headings and lists
|
|
TableCells [][]string // for tables: rows of cells
|
|
}
|
|
|
|
// TextStyle represents text formatting
|
|
type TextStyle struct {
|
|
Bold bool
|
|
Italic bool
|
|
Code bool
|
|
Link string
|
|
Start int64
|
|
End int64
|
|
}
|
|
|
|
// ParagraphStyle represents paragraph-level formatting
|
|
type ParagraphStyle struct {
|
|
Type MarkdownElementType
|
|
Start int64
|
|
End int64
|
|
}
|
|
|
|
// utf16Len returns the number of UTF-16 code units in a string
|
|
func utf16Len(s string) int64 {
|
|
return int64(len(utf16.Encode([]rune(s))))
|
|
}
|
|
|
|
// ParseMarkdown parses markdown text into structured elements
|
|
func ParseMarkdown(text string) []MarkdownElement {
|
|
var elements []MarkdownElement
|
|
lines := strings.Split(text, "\n")
|
|
|
|
inCodeBlock := false
|
|
var codeBlockContent strings.Builder
|
|
|
|
for i := 0; i < len(lines); i++ {
|
|
line := lines[i]
|
|
|
|
// Handle code blocks
|
|
if strings.HasPrefix(line, "```") {
|
|
if inCodeBlock {
|
|
// End code block
|
|
elements = append(elements, MarkdownElement{
|
|
Type: MDCodeBlock,
|
|
Content: codeBlockContent.String(),
|
|
})
|
|
codeBlockContent.Reset()
|
|
inCodeBlock = false
|
|
} else {
|
|
// Start code block
|
|
inCodeBlock = true
|
|
}
|
|
continue
|
|
}
|
|
|
|
if inCodeBlock {
|
|
if codeBlockContent.Len() > 0 {
|
|
codeBlockContent.WriteString("\n")
|
|
}
|
|
codeBlockContent.WriteString(line)
|
|
continue
|
|
}
|
|
|
|
// Empty line
|
|
if strings.TrimSpace(line) == "" {
|
|
continue
|
|
}
|
|
|
|
// Horizontal rule
|
|
if isHorizontalRule(line) {
|
|
elements = append(elements, MarkdownElement{
|
|
Type: MDHorizontalRule,
|
|
})
|
|
continue
|
|
}
|
|
|
|
// Headings
|
|
if headingLevel, content := parseHeading(line); headingLevel > 0 {
|
|
headingType := MDHeading1
|
|
switch headingLevel {
|
|
case 1:
|
|
headingType = MDHeading1
|
|
case 2:
|
|
headingType = MDHeading2
|
|
case 3:
|
|
headingType = MDHeading3
|
|
case 4:
|
|
headingType = MDHeading4
|
|
case 5:
|
|
headingType = MDHeading5
|
|
case 6:
|
|
headingType = MDHeading6
|
|
}
|
|
elements = append(elements, MarkdownElement{
|
|
Type: headingType,
|
|
Content: content,
|
|
})
|
|
continue
|
|
}
|
|
|
|
// Blockquote
|
|
if strings.HasPrefix(line, "> ") {
|
|
content := strings.TrimPrefix(line, "> ")
|
|
if debugMarkdown {
|
|
fmt.Printf("[PARSE] Blockquote detected: %q -> %q\n", line, content)
|
|
}
|
|
elements = append(elements, MarkdownElement{
|
|
Type: MDBlockquote,
|
|
Content: content,
|
|
})
|
|
continue
|
|
}
|
|
|
|
// Numbered list
|
|
if match := regexp.MustCompile(`^(\d+)\.\s+(.+)`).FindStringSubmatch(line); match != nil {
|
|
elements = append(elements, MarkdownElement{
|
|
Type: MDNumberedList,
|
|
Content: match[2],
|
|
})
|
|
continue
|
|
}
|
|
|
|
// Bullet list
|
|
if strings.HasPrefix(line, "- ") || strings.HasPrefix(line, "* ") {
|
|
content := strings.TrimPrefix(strings.TrimPrefix(line, "- "), "* ")
|
|
elements = append(elements, MarkdownElement{
|
|
Type: MDListItem,
|
|
Content: content,
|
|
})
|
|
continue
|
|
}
|
|
|
|
// Table detection - line starts with | and has multiple |
|
|
if strings.HasPrefix(line, "|") && strings.Count(line, "|") >= 2 {
|
|
if debugMarkdown {
|
|
fmt.Printf("[TABLE DEBUG] Found potential table row: %q\n", line)
|
|
if i+1 < len(lines) {
|
|
fmt.Printf("[TABLE DEBUG] Next line: %q, isSep: %v\n", lines[i+1], isTableSeparator(lines[i+1]))
|
|
}
|
|
}
|
|
// Check if next line is separator (|---|---| pattern)
|
|
if i+1 < len(lines) && isTableSeparator(lines[i+1]) {
|
|
if debugMarkdown {
|
|
fmt.Printf("[TABLE DEBUG] Parsing table starting at line %d\n", i)
|
|
}
|
|
// Parse table
|
|
tableCells := parseMarkdownTable(lines[i:])
|
|
elements = append(elements, MarkdownElement{
|
|
Type: MDTable,
|
|
TableCells: tableCells,
|
|
})
|
|
// Skip all table lines
|
|
i += len(tableCells) // loop increment handles separator line offset
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Regular paragraph
|
|
elements = append(elements, MarkdownElement{
|
|
Type: MDParagraph,
|
|
Content: line,
|
|
})
|
|
}
|
|
|
|
return elements
|
|
}
|
|
|
|
// isTableSeparator checks if a line is a markdown table separator (|---|---|)
|
|
func isTableSeparator(line string) bool {
|
|
trimmed := strings.TrimSpace(line)
|
|
if !strings.HasPrefix(trimmed, "|") || !strings.HasSuffix(trimmed, "|") {
|
|
return false
|
|
}
|
|
// Remove outer pipes
|
|
inner := strings.Trim(trimmed, "|")
|
|
// Split by | and check each segment
|
|
segments := strings.Split(inner, "|")
|
|
for _, seg := range segments {
|
|
seg = strings.TrimSpace(seg)
|
|
if seg == "" {
|
|
continue
|
|
}
|
|
// Each segment should be only dashes (with optional leading/trailing colon for alignment)
|
|
for i, c := range seg {
|
|
if c != '-' && c != ' ' && c != ':' {
|
|
return false
|
|
}
|
|
// Colon only allowed at start or end for alignment
|
|
if c == ':' && i != 0 && i != len(seg)-1 {
|
|
return false
|
|
}
|
|
}
|
|
// Must have at least one dash
|
|
if strings.Count(seg, "-") == 0 {
|
|
return false
|
|
}
|
|
}
|
|
return len(segments) > 1
|
|
}
|
|
|
|
// parseMarkdownTable parses a markdown table into rows of cells
|
|
func parseMarkdownTable(lines []string) [][]string {
|
|
var rows [][]string
|
|
|
|
for _, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
if line == "" {
|
|
break
|
|
}
|
|
if !strings.HasPrefix(line, "|") {
|
|
break
|
|
}
|
|
// Skip separator line
|
|
if isTableSeparator(line) {
|
|
continue
|
|
}
|
|
|
|
// Parse row: | cell1 | cell2 | cell3 |
|
|
cells := parseTableRow(line)
|
|
if len(cells) > 0 {
|
|
rows = append(rows, cells)
|
|
}
|
|
}
|
|
|
|
return rows
|
|
}
|
|
|
|
// parseTableRow parses a single table row into cells
|
|
func parseTableRow(line string) []string {
|
|
// Remove outer pipes
|
|
trimmed := strings.Trim(line, "|")
|
|
|
|
// Split by |
|
|
parts := strings.Split(trimmed, "|")
|
|
|
|
cells := make([]string, 0, len(parts))
|
|
for _, part := range parts {
|
|
cell := strings.TrimSpace(part)
|
|
cells = append(cells, cell)
|
|
}
|
|
|
|
return cells
|
|
}
|
|
|
|
// InlineMatch represents a matched inline pattern
|
|
const inlineTypeCode = "code"
|
|
|
|
type InlineMatch struct {
|
|
Start int
|
|
End int
|
|
Content string
|
|
Type string
|
|
URL string
|
|
}
|
|
|
|
// ParseInlineFormatting parses inline markdown formatting within text
|
|
// Returns styles with indices relative to the stripped plain text (UTF-16 code units)
|
|
func ParseInlineFormatting(text string) ([]TextStyle, string) {
|
|
var matches []InlineMatch
|
|
|
|
// Find all links [text](url)
|
|
linkRegex := regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`)
|
|
for _, idx := range linkRegex.FindAllStringSubmatchIndex(text, -1) {
|
|
matches = append(matches, InlineMatch{
|
|
Start: idx[0],
|
|
End: idx[1],
|
|
Content: text[idx[2]:idx[3]],
|
|
Type: "link",
|
|
URL: text[idx[4]:idx[5]],
|
|
})
|
|
}
|
|
|
|
// Find all inline code `code`
|
|
codeRegex := regexp.MustCompile("`([^`]+)`")
|
|
for _, idx := range codeRegex.FindAllStringSubmatchIndex(text, -1) {
|
|
matches = append(matches, InlineMatch{
|
|
Start: idx[0],
|
|
End: idx[1],
|
|
Content: text[idx[2]:idx[3]],
|
|
Type: inlineTypeCode,
|
|
})
|
|
}
|
|
|
|
// Find bold-italic ***text***
|
|
biRegex := regexp.MustCompile(`\*\*\*([^*]+)\*\*\*`)
|
|
for _, idx := range biRegex.FindAllStringSubmatchIndex(text, -1) {
|
|
matches = append(matches, InlineMatch{
|
|
Start: idx[0],
|
|
End: idx[1],
|
|
Content: text[idx[2]:idx[3]],
|
|
Type: "bolditalic",
|
|
})
|
|
}
|
|
|
|
// Find bold **text** (not overlapping with other patterns)
|
|
boldRegex := regexp.MustCompile(`\*\*([^*]+)\*\*`)
|
|
for _, idx := range boldRegex.FindAllStringSubmatchIndex(text, -1) {
|
|
overlaps := false
|
|
for _, m := range matches {
|
|
if idx[0] < m.End && idx[1] > m.Start {
|
|
overlaps = true
|
|
break
|
|
}
|
|
}
|
|
if !overlaps {
|
|
matches = append(matches, InlineMatch{
|
|
Start: idx[0],
|
|
End: idx[1],
|
|
Content: text[idx[2]:idx[3]],
|
|
Type: fmtBold,
|
|
})
|
|
}
|
|
}
|
|
|
|
// For italic, we need to be careful not to match asterisks that are part of bold
|
|
boldPositions := make(map[int]bool)
|
|
for _, m := range matches {
|
|
if m.Type == fmtBold || m.Type == fmtBoldItalic {
|
|
for i := m.Start; i <= m.End; i++ {
|
|
boldPositions[i] = true
|
|
}
|
|
}
|
|
}
|
|
|
|
// Find italic *text* but skip positions that are part of bold markers
|
|
italicRegex := regexp.MustCompile(`\*([^*]+)\*`)
|
|
for _, idx := range italicRegex.FindAllStringSubmatchIndex(text, -1) {
|
|
touchesBold := false
|
|
for i := idx[0]; i <= idx[1]; i++ {
|
|
if boldPositions[i] {
|
|
touchesBold = true
|
|
break
|
|
}
|
|
}
|
|
if !touchesBold {
|
|
overlaps := false
|
|
for _, m := range matches {
|
|
if idx[0] < m.End && idx[1] > m.Start {
|
|
overlaps = true
|
|
break
|
|
}
|
|
}
|
|
if !overlaps {
|
|
matches = append(matches, InlineMatch{
|
|
Start: idx[0],
|
|
End: idx[1],
|
|
Content: text[idx[2]:idx[3]],
|
|
Type: "italic",
|
|
})
|
|
}
|
|
}
|
|
}
|
|
|
|
// Sort matches by start position
|
|
for i := 0; i < len(matches)-1; i++ {
|
|
for j := i + 1; j < len(matches); j++ {
|
|
if matches[i].Start > matches[j].Start {
|
|
matches[i], matches[j] = matches[j], matches[i]
|
|
}
|
|
}
|
|
}
|
|
|
|
// Build stripped text and position map simultaneously
|
|
var stripped strings.Builder
|
|
// positionMap stores original byte offset -> stripped UTF-16 offset
|
|
positionMap := make(map[int]int64)
|
|
|
|
currentByte := 0
|
|
var strippedUTF16Len int64 = 0
|
|
|
|
for currentByte < len(text) {
|
|
matchFound := false
|
|
for _, m := range matches {
|
|
if m.Start == currentByte {
|
|
positionMap[currentByte] = strippedUTF16Len
|
|
stripped.WriteString(m.Content)
|
|
strippedUTF16Len += utf16Len(m.Content)
|
|
currentByte = m.End
|
|
matchFound = true
|
|
break
|
|
}
|
|
}
|
|
|
|
if !matchFound {
|
|
positionMap[currentByte] = strippedUTF16Len
|
|
char, size := nextRune(text[currentByte:])
|
|
stripped.WriteString(char)
|
|
strippedUTF16Len += utf16Len(char)
|
|
currentByte += size
|
|
}
|
|
}
|
|
|
|
positionMap[len(text)] = strippedUTF16Len
|
|
strippedText := stripped.String()
|
|
|
|
// Convert matches to styles with stripped UTF-16 positions
|
|
styles := make([]TextStyle, 0, len(matches))
|
|
for _, m := range matches {
|
|
styles = append(styles, TextStyle{
|
|
Start: positionMap[m.Start],
|
|
End: positionMap[m.End],
|
|
Bold: m.Type == fmtBold || m.Type == fmtBoldItalic,
|
|
Italic: m.Type == "italic" || m.Type == fmtBoldItalic,
|
|
Code: m.Type == inlineTypeCode,
|
|
Link: m.URL,
|
|
})
|
|
}
|
|
|
|
return styles, strippedText
|
|
}
|
|
|
|
// nextRune returns the first rune and its byte size from a string
|
|
func nextRune(s string) (string, int) {
|
|
for i, r := range s {
|
|
if i > 0 {
|
|
return s[:i], i
|
|
}
|
|
if len(s) == 1 {
|
|
return s, 1
|
|
}
|
|
_ = r
|
|
}
|
|
return "", 0
|
|
}
|
|
|
|
func parseHeading(line string) (int, string) {
|
|
headingRegex := regexp.MustCompile(`^(#{1,6})\s+(.+)$`)
|
|
match := headingRegex.FindStringSubmatch(line)
|
|
if match == nil {
|
|
return 0, ""
|
|
}
|
|
return len(match[1]), match[2]
|
|
}
|
|
|
|
func isHorizontalRule(line string) bool {
|
|
trimmed := strings.TrimSpace(line)
|
|
if len(trimmed) < 3 {
|
|
return false
|
|
}
|
|
char := trimmed[0]
|
|
if char != '-' && char != '*' && char != '_' {
|
|
return false
|
|
}
|
|
for _, c := range trimmed {
|
|
if c != rune(char) && c != ' ' {
|
|
return false
|
|
}
|
|
}
|
|
return true
|
|
}
|