From 6fa3286edcc82db84b8d6a8b84a9caa010705027 Mon Sep 17 00:00:00 2001
From: Vincent Koc
Date: Wed, 22 Apr 2026 15:54:56 -0700
Subject: [PATCH] feat: export normalized markdown

---
 internal/markdown/export.go      | 296 +++++++++++++++++++++++++++++++
 internal/markdown/export_test.go |  64 +++++++
 2 files changed, 360 insertions(+)
 create mode 100644 internal/markdown/export.go
 create mode 100644 internal/markdown/export_test.go

diff --git a/internal/markdown/export.go b/internal/markdown/export.go
new file mode 100644
index 0000000..cb66927
--- /dev/null
+++ b/internal/markdown/export.go
@@ -0,0 +1,296 @@
+// Package markdown writes the pages held in a store.Store to Markdown files.
+package markdown
+
+import (
+	"context"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+	"time"
+
+	"github.com/vincentkoc/notioncrawl/internal/notiontext"
+	"github.com/vincentkoc/notioncrawl/internal/store"
+)
+
+// listIndent is the indentation added per nesting level of list items. Two
+// spaces reach the content column of a "- " parent, so children nest under it.
+const listIndent = "  "
+
+// frontMatterEscaper makes a value safe inside a double-quoted YAML scalar:
+// line breaks are flattened to spaces, and backslashes and double quotes are
+// escaped in a single pass so an escape is never escaped twice.
+var frontMatterEscaper = strings.NewReplacer(
+	"\r\n", " ",
+	"\n", " ",
+	"\r", " ",
+	`\`, `\\`,
+	`"`, `\"`,
+)
+
+// Exporter writes every page in Store to a Markdown file under Dir.
+type Exporter struct {
+	Store *store.Store // source of pages, blocks and comments
+	Dir   string       // root output directory; created if missing
+}
+
+// Summary reports what a call to Export wrote.
+type Summary struct {
+	Pages int      // number of pages written
+	Files []string // path of each written file, in export order
+}
+
+// Export writes one Markdown file per page returned by Store.Pages and
+// reports the files it produced.
+//
+// It fails fast when Store or Dir is unset. If writing a page fails, Export
+// stops and returns that error together with the Summary of the pages that
+// were already written.
+func (e Exporter) Export(ctx context.Context) (Summary, error) {
+	if e.Store == nil {
+		return Summary{}, fmt.Errorf("missing store")
+	}
+	if e.Dir == "" {
+		return Summary{}, fmt.Errorf("missing markdown dir")
+	}
+	if err := os.MkdirAll(e.Dir, 0o755); err != nil {
+		return Summary{}, fmt.Errorf("creating markdown dir: %w", err)
+	}
+	pages, err := e.Store.Pages(ctx)
+	if err != nil {
+		return Summary{}, fmt.Errorf("listing pages: %w", err)
+	}
+	var s Summary
+	for _, page := range pages {
+		path, err := e.writePage(ctx, page)
+		if err != nil {
+			return s, fmt.Errorf("exporting page %q: %w", page.ID, err)
+		}
+		s.Pages++
+		s.Files = append(s.Files, path)
+	}
+	return s, nil
+}
+
+// writePage renders a single page (front matter, title, blocks, comments) and
+// writes it to <Dir>/<space slug>/<title slug>-<short id>.md. It returns the
+// path of the written file, or an empty path and an error.
+func (e Exporter) writePage(ctx context.Context, page store.Page) (string, error) {
+	spaceName, err := e.Store.SpaceName(ctx, page.SpaceID)
+	if err != nil {
+		return "", fmt.Errorf("loading space name: %w", err)
+	}
+	blocks, err := e.Store.PageBlocks(ctx, page.ID)
+	if err != nil {
+		return "", fmt.Errorf("loading blocks: %w", err)
+	}
+	comments, err := e.Store.PageComments(ctx, page.ID)
+	if err != nil {
+		return "", fmt.Errorf("loading comments: %w", err)
+	}
+	name := fmt.Sprintf("%s-%s.md", notiontext.Slug(page.Title), notiontext.ShortID(page.ID))
+	path := filepath.Join(e.Dir, notiontext.Slug(spaceName), name)
+	if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
+		return "", fmt.Errorf("creating page dir: %w", err)
+	}
+	var b strings.Builder
+	writeFrontMatter(&b, page, spaceName)
+	if page.Title != "" {
+		fmt.Fprintf(&b, "# %s\n\n", notiontext.MarkdownEscape(page.Title))
+	}
+	renderBlocks(&b, page.ID, blocks)
+	// Escape first and drop empty comments, so that the section heading is
+	// only written when at least one comment will actually appear under it.
+	var items []string
+	for _, c := range comments {
+		if text := notiontext.MarkdownEscape(c.Text); text != "" {
+			items = append(items, text)
+		}
+	}
+	if len(items) > 0 {
+		if !strings.HasSuffix(b.String(), "\n\n") {
+			b.WriteString("\n")
+		}
+		b.WriteString("## Comments\n\n")
+		for _, item := range items {
+			fmt.Fprintf(&b, "- %s\n", item)
+		}
+	}
+	out := strings.TrimRight(b.String(), " \n") + "\n"
+	if err := os.WriteFile(path, []byte(out), 0o644); err != nil {
+		return "", fmt.Errorf("writing page file: %w", err)
+	}
+	return path, nil
+}
+
+// writeFrontMatter writes the YAML front matter for page: the opening and
+// closing "---" fences around one line per non-empty field.
+func writeFrontMatter(b *strings.Builder, page store.Page, spaceName string) {
+	b.WriteString("---\n")
+	writeKV(b, "id", page.ID)
+	writeKV(b, "space_id", page.SpaceID)
+	writeKV(b, "space", spaceName)
+	writeKV(b, "title", page.Title)
+	writeKV(b, "source", page.Source)
+	writeKV(b, "notion_url", page.URL)
+	writeKV(b, "created_time", formatMS(page.CreatedTime))
+	writeKV(b, "last_edited_time", formatMS(page.LastEditedTime))
+	b.WriteString("---\n\n")
+}
+
+// writeKV appends `key: "value"` to the front matter as a double-quoted YAML
+// scalar. Empty values are omitted entirely.
+func writeKV(b *strings.Builder, key, value string) {
+	if value == "" {
+		return
+	}
+	fmt.Fprintf(b, "%s: \"%s\"\n", key, frontMatterEscaper.Replace(value))
+}
+
+// renderBlocks writes the blocks of page pageID as a tree: blocks are grouped
+// by ParentID, siblings are ordered by CreatedTime (ties broken by ID), and
+// rendering starts from the page's direct children.
+//
+// When no block names the page as its parent, every block is rendered flat at
+// depth 0 in the order given, so the content is not lost.
+func renderBlocks(b *strings.Builder, pageID string, blocks []store.Block) {
+	children := map[string][]store.Block{}
+	for _, block := range blocks {
+		if block.ID == pageID {
+			continue // a block carrying the page's own ID is not part of the body
+		}
+		children[block.ParentID] = append(children[block.ParentID], block)
+	}
+	for _, siblings := range children {
+		sort.SliceStable(siblings, func(i, j int) bool {
+			if siblings[i].CreatedTime == siblings[j].CreatedTime {
+				return siblings[i].ID < siblings[j].ID
+			}
+			return siblings[i].CreatedTime < siblings[j].CreatedTime
+		})
+	}
+	renderChildren(b, pageID, children, 0)
+	if len(children[pageID]) > 0 {
+		return
+	}
+	for _, block := range blocks {
+		if block.ID != pageID {
+			renderBlock(b, block, 0)
+		}
+	}
+}
+
+// renderChildren renders each child of parentID at the given depth, following
+// every block immediately with its own descendants one level deeper.
+func renderChildren(b *strings.Builder, parentID string, children map[string][]store.Block, depth int) {
+	for _, block := range children[parentID] {
+		renderBlock(b, block, depth)
+		renderChildren(b, block.ID, children, depth+1)
+	}
+}
+
+// renderBlock writes one block as Markdown, chosen by block.Type. Only list
+// items and to-dos are indented by depth; every other type is written flush
+// left. Unknown types fall back to their text, or to "[type]" when empty.
+func renderBlock(b *strings.Builder, block store.Block, depth int) {
+	text := notiontext.MarkdownEscape(block.Text)
+	indent := strings.Repeat(listIndent, depth)
+	switch block.Type {
+	case "header", "heading_1":
+		writeHeading(b, "# ", text)
+	case "sub_header", "heading_2":
+		writeHeading(b, "## ", text)
+	case "sub_sub_header", "heading_3":
+		writeHeading(b, "### ", text)
+	case "bulleted_list", "bulleted_list_item":
+		writeLine(b, indent+"- "+fallback(text, block.Type))
+	case "numbered_list", "numbered_list_item":
+		writeLine(b, indent+"1. "+fallback(text, block.Type))
+	case "to_do", "to_do_item":
+		writeLine(b, indent+"- [ ] "+fallback(text, block.Type))
+	case "quote":
+		writeLine(b, "> "+fallback(text, block.Type))
+	case "code":
+		// NOTE(review): assumes MarkdownEscape backslash-escapes Markdown
+		// syntax, which a fence would show literally, so the raw text is used.
+		writeCode(b, block.Text)
+	case "divider":
+		writeLine(b, "---")
+	case "image", "file", "pdf", "video", "figma", "drive":
+		writeLine(b, fmt.Sprintf("[%s: %s]", block.Type, fallback(text, block.ID)))
+	case "column", "column_list", "table", "table_row", "collection_view":
+		if text != "" {
+			writeLine(b, text)
+		}
+	default:
+		if text != "" {
+			writeLine(b, text)
+		} else if block.Type != "" {
+			writeLine(b, fmt.Sprintf("[%s]", block.Type))
+		}
+	}
+}
+
+// writeHeading writes marker followed by text as a heading line. Headings
+// without text are skipped so the output never contains a bare "#" marker.
+func writeHeading(b *strings.Builder, marker, text string) {
+	if strings.TrimSpace(text) == "" {
+		return
+	}
+	writeLine(b, marker+text)
+}
+
+// writeCode writes code verbatim inside a fenced block labelled "text".
+func writeCode(b *strings.Builder, code string) {
+	fence := codeFence(code)
+	b.WriteString(fence + "text\n")
+	b.WriteString(code)
+	b.WriteString("\n" + fence + "\n\n")
+}
+
+// codeFence returns a backtick fence of at least three characters that is
+// longer than the longest run of backticks in code, so that the code cannot
+// close the fence early.
+func codeFence(code string) string {
+	n, run := 3, 0
+	for _, r := range code {
+		if r != '`' {
+			run = 0
+			continue
+		}
+		run++
+		if run >= n {
+			n = run + 1
+		}
+	}
+	return strings.Repeat("`", n)
+}
+
+// writeLine writes line followed by a blank line, after trimming trailing
+// spaces. Lines that end up empty are dropped.
+func writeLine(b *strings.Builder, line string) {
+	line = strings.TrimRight(line, " ")
+	if line == "" {
+		return
+	}
+	b.WriteString(line)
+	b.WriteString("\n\n")
+}
+
+// fallback returns s unless it is blank, in which case it returns alt.
+func fallback(s, alt string) string {
+	if strings.TrimSpace(s) != "" {
+		return s
+	}
+	return alt
+}
+
+// formatMS formats a Unix timestamp in milliseconds as RFC 3339 in UTC. Zero
+// and negative values yield "", which writeKV then omits.
+func formatMS(ms int64) string {
+	if ms <= 0 {
+		return ""
+	}
+	return time.UnixMilli(ms).UTC().Format(time.RFC3339)
+}
diff --git a/internal/markdown/export_test.go b/internal/markdown/export_test.go
new file mode 100644
index 0000000..347e3b8
--- /dev/null
+++ b/internal/markdown/export_test.go
@@ -0,0 +1,64 @@
+package markdown
+
+import (
+	"context"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+
+	"github.com/vincentkoc/notioncrawl/internal/store"
+)
+
+func TestExporterWritesMarkdown(t *testing.T) {
+	ctx := context.Background()
+	st, err := store.Open(filepath.Join(t.TempDir(), "notioncrawl.db"))
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer st.Close()
+	now := store.NowMS()
+	if err := st.UpsertSpace(ctx, store.Space{ID: "space1", Name: "Engineering", Source: "test", SyncedAt: now}); err != nil {
+		t.Fatal(err)
+	}
+	if err := st.UpsertPage(ctx, store.Page{ID: "page1", SpaceID: "space1", Title: "Launch Plan", Alive: true, Source: "test", SyncedAt: now}); err != nil {
+		t.Fatal(err)
+	}
+	if err := st.UpsertBlock(ctx, store.Block{ID: "block1", PageID: "page1", ParentID: "page1", Type: "bulleted_list", Text: "ship it", Alive: true, Source: "test", SyncedAt: now}); err != nil {
+		t.Fatal(err)
+	}
+	dir := t.TempDir()
+	s, err := Exporter{Store: st, Dir: dir}.Export(ctx)
+	if err != nil {
+		t.Fatal(err)
+	}
+	if s.Pages != 1 || len(s.Files) != 1 {
+		t.Fatalf("unexpected summary: %+v", s)
+	}
+	b, err := os.ReadFile(s.Files[0])
+	if err != nil {
+		t.Fatal(err)
+	}
+	text := string(b)
+	if !strings.Contains(text, "# Launch Plan") || !strings.Contains(text, "- ship it") {
+		t.Fatalf("unexpected markdown:\n%s", text)
+	}
+}
+
+func TestWriteKVEscapesYAML(t *testing.T) {
+	var b strings.Builder
+	writeKV(&b, "title", "a \"b\" c\\d\ne")
+	want := "title: \"a \\\"b\\\" c\\\\d e\"\n"
+	if got := b.String(); got != want {
+		t.Fatalf("got %q, want %q", got, want)
+	}
+}
+
+func TestCodeFenceOutgrowsBackticks(t *testing.T) {
+	if got := codeFence("plain"); got != "```" {
+		t.Fatalf("got %q", got)
+	}
+	if got := codeFence("a ``` b"); got != "````" {
+		t.Fatalf("got %q", got)
+	}
+}