fix(markdown): preserve unicode export paths

Preserve readable Unicode path components and make Markdown filenames collision-safe for Notion Desktop IDs.
This commit is contained in:
Vincent Koc 2026-04-27 11:04:23 -07:00 committed by GitHub
parent 25be299fab
commit a906ed4845
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 74 additions and 7 deletions

View File

@ -22,7 +22,7 @@ to without holding Notion credentials.
- official API page/block/user/comment ingestion
- Notion database metadata and row ingestion through the official API
- current Notion data-source API support plus legacy database endpoint support
- normalized Markdown export organized by space and page path
- normalized Markdown export organized by Unicode-safe space and page paths
- CSV/TSV export for crawled Notion database rows
- compressed JSONL git-share snapshots plus import/update workflows
- archive status, activity reporting, and SQLite maintenance commands

View File

@ -104,7 +104,9 @@ Core tables:
## Markdown Archive
Markdown export writes deterministic paths:
Markdown export writes deterministic, Unicode-safe paths. Path components are
lowercased and keep readable letters, numbers, CJK text, and emoji; filesystem
path separators and unsafe punctuation are replaced with dashes. The layout is:
```text
pages/<space-slug>/<page-title>-<short-id>.md

View File

@ -78,3 +78,29 @@ func TestExporterUsesDisplayOrder(t *testing.T) {
t.Fatalf("markdown did not preserve display order:\n%s", text)
}
}
// TestExporterPreservesUnicodePathNames stores one space and one page whose
// names contain CJK text, emoji, and a slash, runs the exporter, and checks
// that the single exported file lands at the expected Unicode path.
func TestExporterPreservesUnicodePathNames(t *testing.T) {
	ctx := context.Background()

	dbPath := filepath.Join(t.TempDir(), "notcrawl.db")
	db, err := store.Open(dbPath)
	if err != nil {
		t.Fatal(err)
	}
	defer db.Close()

	syncedAt := store.NowMS()
	space := store.Space{ID: "space1", Name: "研究 🚀", Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertSpace(ctx, space); err != nil {
		t.Fatal(err)
	}
	page := store.Page{ID: "page1", SpaceID: "space1", Title: "計画 ✅ / Q2", Alive: true, Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertPage(ctx, page); err != nil {
		t.Fatal(err)
	}

	outDir := t.TempDir()
	exporter := Exporter{Store: db, Dir: outDir}
	summary, err := exporter.Export(ctx)
	if err != nil {
		t.Fatal(err)
	}

	// The slash in the title must not create an extra directory level.
	want := filepath.Join(outDir, "研究-🚀", "計画-✅-q2-page1.md")
	if matched := len(summary.Files) == 1 && summary.Files[0] == want; !matched {
		t.Fatalf("unexpected export path: %+v, want %s", summary.Files, want)
	}
}

View File

@ -5,6 +5,7 @@ import (
"fmt"
"regexp"
"strings"
"unicode"
)
var spaceRE = regexp.MustCompile(`\s+`)
@ -60,8 +61,8 @@ func MarkdownEscape(s string) string {
func ShortID(id string) string {
clean := strings.ReplaceAll(id, "-", "")
if len(clean) > 8 {
return clean[:8]
if len(clean) > 16 {
return clean[:8] + "-" + clean[len(clean)-8:]
}
if clean == "" {
return "unknown"
@ -73,13 +74,22 @@ func Slug(s string) string {
s = strings.ToLower(Normalize(s))
var b strings.Builder
lastDash := false
wrote := false
for _, r := range s {
switch {
case r >= 'a' && r <= 'z', r >= '0' && r <= '9':
case isSlugRune(r):
b.WriteRune(r)
lastDash = false
case r == '-' || r == '_' || r == ' ' || r == '/' || r == '.':
if !lastDash {
wrote = true
case isSlugSeparator(r):
if wrote && !lastDash {
b.WriteByte('-')
lastDash = true
}
case unicode.IsControl(r):
continue
default:
if wrote && !lastDash {
b.WriteByte('-')
lastDash = true
}
@ -92,6 +102,14 @@ func Slug(s string) string {
return out
}
// isSlugRune reports whether r is a rune that Slug copies into its output
// unchanged: any Unicode letter, number, or combining mark, any symbol
// outside the ASCII range, or U+200D (zero width joiner).
func isSlugRune(r rune) bool {
	switch {
	case unicode.IsLetter(r), unicode.IsNumber(r), unicode.IsMark(r):
		return true
	case r == '\u200d':
		return true
	}
	// ASCII symbols such as '+', '<' or '|' are deliberately excluded.
	return r > unicode.MaxASCII && unicode.IsSymbol(r)
}
// isSlugSeparator reports whether r is Unicode whitespace or one of the
// characters '-', '_', '/', '.', '\', ':' or ';'.
func isSlugSeparator(r rune) bool {
	if unicode.IsSpace(r) {
		return true
	}
	const separators = `-_/.\:;`
	return strings.IndexRune(separators, r) >= 0
}
func MarshalRaw(m map[string]any) string {
b, err := json.Marshal(m)
if err != nil {

View File

@ -15,3 +15,24 @@ func TestSlug(t *testing.T) {
t.Fatalf("got %q", got)
}
}
// TestSlugPreservesUnicodePathText checks that Slug keeps CJK text and emoji
// and joins the pieces around spaces and a slash with single dashes.
func TestSlugPreservesUnicodePathText(t *testing.T) {
	const (
		input = "研究 🚀 / 計画 ✅"
		want  = "研究-🚀-計画-✅"
	)
	if got := Slug(input); got != want {
		// Report input, result, and expectation so a failure is self-explanatory.
		t.Fatalf("Slug(%q) = %q, want %q", input, got, want)
	}
}
// TestSlugRemovesUnsafePathText checks that Slug lowercases ASCII letters and
// turns path separators and filename-unsafe punctuation into dashes.
func TestSlugRemovesUnsafePathText(t *testing.T) {
	const (
		input = `A/B\C:D*E?F"G<H>I|J`
		want  = "a-b-c-d-e-f-g-h-i-j"
	)
	if got := Slug(input); got != want {
		// Report input, result, and expectation so a failure is self-explanatory.
		t.Fatalf("Slug(%q) = %q, want %q", input, got, want)
	}
}
// TestShortIDKeepsEnoughEntropyForDesktopIDs checks that ShortID, given a
// dashed 32-hex-digit ID, returns the first eight and last eight hex digits
// joined by a dash.
func TestShortIDKeepsEnoughEntropyForDesktopIDs(t *testing.T) {
	const (
		input = "24f71240-0000-0000-0000-123456789abc"
		want  = "24f71240-56789abc"
	)
	if got := ShortID(input); got != want {
		// Report input, result, and expectation so a failure is self-explanatory.
		t.Fatalf("ShortID(%q) = %q, want %q", input, got, want)
	}
}