fix(markdown): preserve unicode export paths
Preserve readable Unicode path components and make Markdown filenames collision-safe for Notion Desktop IDs.
This commit is contained in:
parent
25be299fab
commit
a906ed4845
@ -22,7 +22,7 @@ to without holding Notion credentials.
|
||||
- official API page/block/user/comment ingestion
|
||||
- Notion database metadata and row ingestion through the official API
|
||||
- current Notion data-source API support plus legacy database endpoint support
|
||||
- normalized Markdown export organized by space and page path
|
||||
- normalized Markdown export organized by Unicode-safe space and page paths
|
||||
- CSV/TSV export for crawled Notion database rows
|
||||
- compressed JSONL git-share snapshots plus import/update workflows
|
||||
- archive status, activity reporting, and SQLite maintenance commands
|
||||
|
||||
4
SPEC.md
4
SPEC.md
@ -104,7 +104,9 @@ Core tables:
|
||||
|
||||
## Markdown Archive
|
||||
|
||||
Markdown export writes deterministic paths:
|
||||
Markdown export writes deterministic Unicode-safe paths. Path components keep
|
||||
readable letters, numbers, CJK text, and emoji while replacing filesystem path
|
||||
separators and unsafe punctuation with dashes:
|
||||
|
||||
```text
|
||||
pages/<space-slug>/<page-title>-<short-id>.md
|
||||
|
||||
@ -78,3 +78,29 @@ func TestExporterUsesDisplayOrder(t *testing.T) {
|
||||
t.Fatalf("markdown did not preserve display order:\n%s", text)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExporterPreservesUnicodePathNames(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
st, err := store.Open(filepath.Join(t.TempDir(), "notcrawl.db"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer st.Close()
|
||||
now := store.NowMS()
|
||||
if err := st.UpsertSpace(ctx, store.Space{ID: "space1", Name: "研究 🚀", Source: "test", SyncedAt: now}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := st.UpsertPage(ctx, store.Page{ID: "page1", SpaceID: "space1", Title: "計画 ✅ / Q2", Alive: true, Source: "test", SyncedAt: now}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
dir := t.TempDir()
|
||||
s, err := Exporter{Store: st, Dir: dir}.Export(ctx)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
want := filepath.Join(dir, "研究-🚀", "計画-✅-q2-page1.md")
|
||||
if len(s.Files) != 1 || s.Files[0] != want {
|
||||
t.Fatalf("unexpected export path: %+v, want %s", s.Files, want)
|
||||
}
|
||||
}
|
||||
|
||||
@ -5,6 +5,7 @@ import (
|
||||
"fmt"
|
||||
"regexp"
|
||||
"strings"
|
||||
"unicode"
|
||||
)
|
||||
|
||||
var spaceRE = regexp.MustCompile(`\s+`)
|
||||
@ -60,8 +61,8 @@ func MarkdownEscape(s string) string {
|
||||
|
||||
func ShortID(id string) string {
|
||||
clean := strings.ReplaceAll(id, "-", "")
|
||||
if len(clean) > 8 {
|
||||
return clean[:8]
|
||||
if len(clean) > 16 {
|
||||
return clean[:8] + "-" + clean[len(clean)-8:]
|
||||
}
|
||||
if clean == "" {
|
||||
return "unknown"
|
||||
@ -73,13 +74,22 @@ func Slug(s string) string {
|
||||
s = strings.ToLower(Normalize(s))
|
||||
var b strings.Builder
|
||||
lastDash := false
|
||||
wrote := false
|
||||
for _, r := range s {
|
||||
switch {
|
||||
case r >= 'a' && r <= 'z', r >= '0' && r <= '9':
|
||||
case isSlugRune(r):
|
||||
b.WriteRune(r)
|
||||
lastDash = false
|
||||
case r == '-' || r == '_' || r == ' ' || r == '/' || r == '.':
|
||||
if !lastDash {
|
||||
wrote = true
|
||||
case isSlugSeparator(r):
|
||||
if wrote && !lastDash {
|
||||
b.WriteByte('-')
|
||||
lastDash = true
|
||||
}
|
||||
case unicode.IsControl(r):
|
||||
continue
|
||||
default:
|
||||
if wrote && !lastDash {
|
||||
b.WriteByte('-')
|
||||
lastDash = true
|
||||
}
|
||||
@ -92,6 +102,14 @@ func Slug(s string) string {
|
||||
return out
|
||||
}
|
||||
|
||||
// isSlugRune reports whether r is copied into a slug unchanged.
//
// Accepted runes are Unicode letters, numbers, and combining marks, any
// non-ASCII symbol (which covers emoji such as 🚀 and ✅), and the zero-width
// joiner U+200D (presumably so that joined emoji sequences are not split).
// ASCII symbols such as '+', '<', or '|' are rejected.
func isSlugRune(r rune) bool {
	return unicode.IsLetter(r) ||
		unicode.IsNumber(r) ||
		unicode.IsMark(r) ||
		(r > unicode.MaxASCII && unicode.IsSymbol(r)) ||
		r == '\u200d'
}
|
||||
|
||||
// isSlugSeparator reports whether r is treated as a word separator when
// building a slug: any Unicode white space, or one of the ASCII characters
// '-', '_', '/', '.', '\', ':' and ';'.
func isSlugSeparator(r rune) bool {
	return unicode.IsSpace(r) || strings.ContainsRune(`-_/.\:;`, r)
}
|
||||
|
||||
func MarshalRaw(m map[string]any) string {
|
||||
b, err := json.Marshal(m)
|
||||
if err != nil {
|
||||
|
||||
@ -15,3 +15,24 @@ func TestSlug(t *testing.T) {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSlugPreservesUnicodePathText(t *testing.T) {
|
||||
got := Slug("研究 🚀 / 計画 ✅")
|
||||
if got != "研究-🚀-計画-✅" {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSlugRemovesUnsafePathText(t *testing.T) {
|
||||
got := Slug(`A/B\C:D*E?F"G<H>I|J`)
|
||||
if got != "a-b-c-d-e-f-g-h-i-j" {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestShortIDKeepsEnoughEntropyForDesktopIDs(t *testing.T) {
|
||||
got := ShortID("24f71240-0000-0000-0000-123456789abc")
|
||||
if got != "24f71240-56789abc" {
|
||||
t.Fatalf("got %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user