From b80faa9e7ec08cceb734cf8060f9fbaf92ad4318 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Mon, 27 Apr 2026 11:03:23 -0700 Subject: [PATCH] fix(markdown): preserve unicode export paths --- README.md | 2 +- SPEC.md | 4 +++- internal/markdown/export_test.go | 26 ++++++++++++++++++++++++++ internal/notiontext/text.go | 28 +++++++++++++++++++++++----- internal/notiontext/text_test.go | 21 +++++++++++++++++++++ 5 files changed, 74 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 4d59c39..05b07de 100644 --- a/README.md +++ b/README.md @@ -22,7 +22,7 @@ to without holding Notion credentials. - official API page/block/user/comment ingestion - Notion database metadata and row ingestion through the official API - current Notion data-source API support plus legacy database endpoint support -- normalized Markdown export organized by space and page path +- normalized Markdown export organized by Unicode-safe space and page paths - CSV/TSV export for crawled Notion database rows - compressed JSONL git-share snapshots plus import/update workflows - archive status, activity reporting, and SQLite maintenance commands diff --git a/SPEC.md b/SPEC.md index 08221f1..f82e778 100644 --- a/SPEC.md +++ b/SPEC.md @@ -104,7 +104,9 @@ Core tables: ## Markdown Archive -Markdown export writes deterministic paths: +Markdown export writes deterministic Unicode-safe paths. 
Path components keep +readable letters, numbers, CJK text, and emoji while replacing filesystem path +separators and unsafe punctuation with dashes: ```text pages//-.md diff --git a/internal/markdown/export_test.go b/internal/markdown/export_test.go index a215767..0398f6b 100644 --- a/internal/markdown/export_test.go +++ b/internal/markdown/export_test.go @@ -78,3 +78,29 @@ func TestExporterUsesDisplayOrder(t *testing.T) { t.Fatalf("markdown did not preserve display order:\n%s", text) } } + +func TestExporterPreservesUnicodePathNames(t *testing.T) { + ctx := context.Background() + st, err := store.Open(filepath.Join(t.TempDir(), "notcrawl.db")) + if err != nil { + t.Fatal(err) + } + defer st.Close() + now := store.NowMS() + if err := st.UpsertSpace(ctx, store.Space{ID: "space1", Name: "研究 🚀", Source: "test", SyncedAt: now}); err != nil { + t.Fatal(err) + } + if err := st.UpsertPage(ctx, store.Page{ID: "page1", SpaceID: "space1", Title: "計画 ✅ / Q2", Alive: true, Source: "test", SyncedAt: now}); err != nil { + t.Fatal(err) + } + + dir := t.TempDir() + s, err := Exporter{Store: st, Dir: dir}.Export(ctx) + if err != nil { + t.Fatal(err) + } + want := filepath.Join(dir, "研究-🚀", "計画-✅-q2-page1.md") + if len(s.Files) != 1 || s.Files[0] != want { + t.Fatalf("unexpected export path: %+v, want %s", s.Files, want) + } +} diff --git a/internal/notiontext/text.go b/internal/notiontext/text.go index 150b603..53d4c13 100644 --- a/internal/notiontext/text.go +++ b/internal/notiontext/text.go @@ -5,6 +5,7 @@ import ( "fmt" "regexp" "strings" + "unicode" ) var spaceRE = regexp.MustCompile(`\s+`) @@ -60,8 +61,8 @@ func MarkdownEscape(s string) string { func ShortID(id string) string { clean := strings.ReplaceAll(id, "-", "") - if len(clean) > 8 { - return clean[:8] + if len(clean) > 16 { + return clean[:8] + "-" + clean[len(clean)-8:] } if clean == "" { return "unknown" @@ -73,13 +74,22 @@ func Slug(s string) string { s = strings.ToLower(Normalize(s)) var b strings.Builder lastDash 
:= false + wrote := false for _, r := range s { switch { - case r >= 'a' && r <= 'z', r >= '0' && r <= '9': + case isSlugRune(r): b.WriteRune(r) lastDash = false - case r == '-' || r == '_' || r == ' ' || r == '/' || r == '.': - if !lastDash { + wrote = true + case isSlugSeparator(r): + if wrote && !lastDash { + b.WriteByte('-') + lastDash = true + } + case unicode.IsControl(r): + continue + default: + if wrote && !lastDash { b.WriteByte('-') lastDash = true } @@ -92,6 +102,14 @@ func Slug(s string) string { return out } +func isSlugRune(r rune) bool { + return unicode.IsLetter(r) || unicode.IsNumber(r) || unicode.IsMark(r) || (r > unicode.MaxASCII && unicode.IsSymbol(r)) || r == '\u200d' +} + +func isSlugSeparator(r rune) bool { + return unicode.IsSpace(r) || strings.ContainsRune(`-_/.\:;`, r) +} + func MarshalRaw(m map[string]any) string { b, err := json.Marshal(m) if err != nil { diff --git a/internal/notiontext/text_test.go b/internal/notiontext/text_test.go index 1a5bffe..43f9cbd 100644 --- a/internal/notiontext/text_test.go +++ b/internal/notiontext/text_test.go @@ -15,3 +15,24 @@ func TestSlug(t *testing.T) { t.Fatalf("got %q", got) } } + +func TestSlugPreservesUnicodePathText(t *testing.T) { + got := Slug("研究 🚀 / 計画 ✅") + if got != "研究-🚀-計画-✅" { + t.Fatalf("got %q", got) + } +} + +func TestSlugRemovesUnsafePathText(t *testing.T) { + got := Slug(`A/B\C:D*E?F"G<H>I|J`) + if got != "a-b-c-d-e-f-g-h-i-j" { + t.Fatalf("got %q", got) + } +} + +func TestShortIDKeepsEnoughEntropyForDesktopIDs(t *testing.T) { + got := ShortID("24f71240-0000-0000-0000-123456789abc") + if got != "24f71240-56789abc" { + t.Fatalf("got %q", got) + } +}