gogcli/internal/cmd/docs_markdown.go

497 lines
11 KiB
Go

package cmd
import (
"fmt"
"regexp"
"strings"
"unicode/utf16"
)
const (
fmtBold = "bold"
fmtBoldItalic = "bolditalic"
)
// MarkdownElementType represents the type of markdown element
type MarkdownElementType int
const (
MDText MarkdownElementType = iota
MDHeading1
MDHeading2
MDHeading3
MDHeading4
MDHeading5
MDHeading6
MDBold
MDItalic
MDBoldItalic
MDCode
MDCodeBlock
MDLink
MDImage
MDListItem
MDNumberedList
MDBlockquote
MDHorizontalRule
MDParagraph
MDEmptyLine
MDTable
)
// MarkdownElement represents a parsed markdown element
type MarkdownElement struct {
Type MarkdownElementType
Content string
Children []MarkdownElement
URL string // for links
Level int // for headings and lists
TableCells [][]string // for tables: rows of cells
}
// TextStyle represents text formatting
type TextStyle struct {
Bold bool
Italic bool
Code bool
Link string
Start int64
End int64
}
// ParagraphStyle represents paragraph-level formatting
type ParagraphStyle struct {
Type MarkdownElementType
Start int64
End int64
}
// utf16Len returns the number of UTF-16 code units in a string
func utf16Len(s string) int64 {
return int64(len(utf16.Encode([]rune(s))))
}
// ParseMarkdown parses markdown text into structured elements
func ParseMarkdown(text string) []MarkdownElement {
var elements []MarkdownElement
lines := strings.Split(text, "\n")
inCodeBlock := false
var codeBlockContent strings.Builder
for i := 0; i < len(lines); i++ {
line := lines[i]
// Handle code blocks
if strings.HasPrefix(line, "```") {
if inCodeBlock {
// End code block
elements = append(elements, MarkdownElement{
Type: MDCodeBlock,
Content: codeBlockContent.String(),
})
codeBlockContent.Reset()
inCodeBlock = false
} else {
// Start code block
inCodeBlock = true
}
continue
}
if inCodeBlock {
if codeBlockContent.Len() > 0 {
codeBlockContent.WriteString("\n")
}
codeBlockContent.WriteString(line)
continue
}
// Empty line
if strings.TrimSpace(line) == "" {
continue
}
// Horizontal rule
if isHorizontalRule(line) {
elements = append(elements, MarkdownElement{
Type: MDHorizontalRule,
})
continue
}
// Headings
if headingLevel, content := parseHeading(line); headingLevel > 0 {
headingType := MDHeading1
switch headingLevel {
case 1:
headingType = MDHeading1
case 2:
headingType = MDHeading2
case 3:
headingType = MDHeading3
case 4:
headingType = MDHeading4
case 5:
headingType = MDHeading5
case 6:
headingType = MDHeading6
}
elements = append(elements, MarkdownElement{
Type: headingType,
Content: content,
})
continue
}
// Blockquote
if strings.HasPrefix(line, "> ") {
content := strings.TrimPrefix(line, "> ")
if debugMarkdown {
fmt.Printf("[PARSE] Blockquote detected: %q -> %q\n", line, content)
}
elements = append(elements, MarkdownElement{
Type: MDBlockquote,
Content: content,
})
continue
}
// Numbered list
if match := regexp.MustCompile(`^(\d+)\.\s+(.+)`).FindStringSubmatch(line); match != nil {
elements = append(elements, MarkdownElement{
Type: MDNumberedList,
Content: match[2],
})
continue
}
// Bullet list
if strings.HasPrefix(line, "- ") || strings.HasPrefix(line, "* ") {
content := strings.TrimPrefix(strings.TrimPrefix(line, "- "), "* ")
elements = append(elements, MarkdownElement{
Type: MDListItem,
Content: content,
})
continue
}
// Table detection - line starts with | and has multiple |
if strings.HasPrefix(line, "|") && strings.Count(line, "|") >= 2 {
if debugMarkdown {
fmt.Printf("[TABLE DEBUG] Found potential table row: %q\n", line)
if i+1 < len(lines) {
fmt.Printf("[TABLE DEBUG] Next line: %q, isSep: %v\n", lines[i+1], isTableSeparator(lines[i+1]))
}
}
// Check if next line is separator (|---|---| pattern)
if i+1 < len(lines) && isTableSeparator(lines[i+1]) {
if debugMarkdown {
fmt.Printf("[TABLE DEBUG] Parsing table starting at line %d\n", i)
}
// Parse table
tableCells := parseMarkdownTable(lines[i:])
elements = append(elements, MarkdownElement{
Type: MDTable,
TableCells: tableCells,
})
// Skip all table lines
i += len(tableCells) // loop increment handles separator line offset
continue
}
}
// Regular paragraph
elements = append(elements, MarkdownElement{
Type: MDParagraph,
Content: line,
})
}
return elements
}
// isTableSeparator checks if a line is a markdown table separator (|---|---|)
func isTableSeparator(line string) bool {
trimmed := strings.TrimSpace(line)
if !strings.HasPrefix(trimmed, "|") || !strings.HasSuffix(trimmed, "|") {
return false
}
// Remove outer pipes
inner := strings.Trim(trimmed, "|")
// Split by | and check each segment
segments := strings.Split(inner, "|")
for _, seg := range segments {
seg = strings.TrimSpace(seg)
if seg == "" {
continue
}
// Each segment should be only dashes (with optional leading/trailing colon for alignment)
for i, c := range seg {
if c != '-' && c != ' ' && c != ':' {
return false
}
// Colon only allowed at start or end for alignment
if c == ':' && i != 0 && i != len(seg)-1 {
return false
}
}
// Must have at least one dash
if strings.Count(seg, "-") == 0 {
return false
}
}
return len(segments) > 1
}
// parseMarkdownTable parses a markdown table into rows of cells
func parseMarkdownTable(lines []string) [][]string {
var rows [][]string
for _, line := range lines {
line = strings.TrimSpace(line)
if line == "" {
break
}
if !strings.HasPrefix(line, "|") {
break
}
// Skip separator line
if isTableSeparator(line) {
continue
}
// Parse row: | cell1 | cell2 | cell3 |
cells := parseTableRow(line)
if len(cells) > 0 {
rows = append(rows, cells)
}
}
return rows
}
// parseTableRow parses a single table row into cells
func parseTableRow(line string) []string {
// Remove outer pipes
trimmed := strings.Trim(line, "|")
// Split by |
parts := strings.Split(trimmed, "|")
cells := make([]string, 0, len(parts))
for _, part := range parts {
cell := strings.TrimSpace(part)
cells = append(cells, cell)
}
return cells
}
// InlineMatch represents a matched inline pattern
const inlineTypeCode = "code"
type InlineMatch struct {
Start int
End int
Content string
Type string
URL string
}
// ParseInlineFormatting parses inline markdown formatting within text
// Returns styles with indices relative to the stripped plain text (UTF-16 code units)
func ParseInlineFormatting(text string) ([]TextStyle, string) {
var matches []InlineMatch
// Find all links [text](url)
linkRegex := regexp.MustCompile(`\[([^\]]+)\]\(([^)]+)\)`)
for _, idx := range linkRegex.FindAllStringSubmatchIndex(text, -1) {
matches = append(matches, InlineMatch{
Start: idx[0],
End: idx[1],
Content: text[idx[2]:idx[3]],
Type: "link",
URL: text[idx[4]:idx[5]],
})
}
// Find all inline code `code`
codeRegex := regexp.MustCompile("`([^`]+)`")
for _, idx := range codeRegex.FindAllStringSubmatchIndex(text, -1) {
matches = append(matches, InlineMatch{
Start: idx[0],
End: idx[1],
Content: text[idx[2]:idx[3]],
Type: inlineTypeCode,
})
}
// Find bold-italic ***text***
biRegex := regexp.MustCompile(`\*\*\*([^*]+)\*\*\*`)
for _, idx := range biRegex.FindAllStringSubmatchIndex(text, -1) {
matches = append(matches, InlineMatch{
Start: idx[0],
End: idx[1],
Content: text[idx[2]:idx[3]],
Type: "bolditalic",
})
}
// Find bold **text** (not overlapping with other patterns)
boldRegex := regexp.MustCompile(`\*\*([^*]+)\*\*`)
for _, idx := range boldRegex.FindAllStringSubmatchIndex(text, -1) {
overlaps := false
for _, m := range matches {
if idx[0] < m.End && idx[1] > m.Start {
overlaps = true
break
}
}
if !overlaps {
matches = append(matches, InlineMatch{
Start: idx[0],
End: idx[1],
Content: text[idx[2]:idx[3]],
Type: fmtBold,
})
}
}
// For italic, we need to be careful not to match asterisks that are part of bold
boldPositions := make(map[int]bool)
for _, m := range matches {
if m.Type == fmtBold || m.Type == fmtBoldItalic {
for i := m.Start; i <= m.End; i++ {
boldPositions[i] = true
}
}
}
// Find italic *text* but skip positions that are part of bold markers
italicRegex := regexp.MustCompile(`\*([^*]+)\*`)
for _, idx := range italicRegex.FindAllStringSubmatchIndex(text, -1) {
touchesBold := false
for i := idx[0]; i <= idx[1]; i++ {
if boldPositions[i] {
touchesBold = true
break
}
}
if !touchesBold {
overlaps := false
for _, m := range matches {
if idx[0] < m.End && idx[1] > m.Start {
overlaps = true
break
}
}
if !overlaps {
matches = append(matches, InlineMatch{
Start: idx[0],
End: idx[1],
Content: text[idx[2]:idx[3]],
Type: "italic",
})
}
}
}
// Sort matches by start position
for i := 0; i < len(matches)-1; i++ {
for j := i + 1; j < len(matches); j++ {
if matches[i].Start > matches[j].Start {
matches[i], matches[j] = matches[j], matches[i]
}
}
}
// Build stripped text and position map simultaneously
var stripped strings.Builder
// positionMap stores original byte offset -> stripped UTF-16 offset
positionMap := make(map[int]int64)
currentByte := 0
var strippedUTF16Len int64 = 0
for currentByte < len(text) {
matchFound := false
for _, m := range matches {
if m.Start == currentByte {
positionMap[currentByte] = strippedUTF16Len
stripped.WriteString(m.Content)
strippedUTF16Len += utf16Len(m.Content)
currentByte = m.End
matchFound = true
break
}
}
if !matchFound {
positionMap[currentByte] = strippedUTF16Len
char, size := nextRune(text[currentByte:])
stripped.WriteString(char)
strippedUTF16Len += utf16Len(char)
currentByte += size
}
}
positionMap[len(text)] = strippedUTF16Len
strippedText := stripped.String()
// Convert matches to styles with stripped UTF-16 positions
styles := make([]TextStyle, 0, len(matches))
for _, m := range matches {
styles = append(styles, TextStyle{
Start: positionMap[m.Start],
End: positionMap[m.End],
Bold: m.Type == fmtBold || m.Type == fmtBoldItalic,
Italic: m.Type == "italic" || m.Type == fmtBoldItalic,
Code: m.Type == inlineTypeCode,
Link: m.URL,
})
}
return styles, strippedText
}
// nextRune returns the first rune and its byte size from a string
func nextRune(s string) (string, int) {
for i, r := range s {
if i > 0 {
return s[:i], i
}
if len(s) == 1 {
return s, 1
}
_ = r
}
return "", 0
}
func parseHeading(line string) (int, string) {
headingRegex := regexp.MustCompile(`^(#{1,6})\s+(.+)$`)
match := headingRegex.FindStringSubmatch(line)
if match == nil {
return 0, ""
}
return len(match[1]), match[2]
}
func isHorizontalRule(line string) bool {
trimmed := strings.TrimSpace(line)
if len(trimmed) < 3 {
return false
}
char := trimmed[0]
if char != '-' && char != '*' && char != '_' {
return false
}
for _, c := range trimmed {
if c != rune(char) && c != ' ' {
return false
}
}
return true
}