feat(markdown): group exports by teamspace
Some checks failed
Validation / validate (push) Has been cancelled

This commit is contained in:
Vincent Koc 2026-04-27 11:52:14 -07:00
parent 6c21691f0d
commit 01b419bd54
No known key found for this signature in database
12 changed files with 324 additions and 38 deletions

View File

@ -22,7 +22,7 @@ to without holding Notion credentials.
- official API page/block/user/comment ingestion
- Notion database metadata and row ingestion through the official API
- current Notion data-source API support plus legacy database endpoint support
- normalized Markdown export organized by Unicode-safe space and page paths
- normalized Markdown export organized by Unicode-safe workspace, teamspace, and page paths
- CSV/TSV export for crawled Notion database rows
- compressed JSONL git-share snapshots plus import/update workflows
- archive status, activity reporting, and SQLite maintenance commands

View File

@ -91,6 +91,7 @@ Core tables:
- `spaces`
- `users`
- `teams`
- `pages`
- `blocks`
- `collections`
@ -109,9 +110,11 @@ readable letters, numbers, CJK text, and emoji while replacing filesystem path
separators and unsafe punctuation with dashes:
```text
pages/<space-slug>/<page-title>-<short-id>.md
pages/<space-slug>/<team-slug>/<page-title>-<short-id>.md
```
The team slug is omitted when no teamspace can be resolved.
Each export removes stale generated `.md` files under the Markdown root while
leaving non-Markdown sidecar files alone.

View File

@ -206,7 +206,7 @@ func runSync(ctx context.Context, stdout io.Writer, cfg config.Config, args []st
if err != nil {
return err
}
fmt.Fprintf(stdout, "desktop: pages=%d blocks=%d collections=%d comments=%d snapshot=%s\n", s.Pages, s.Blocks, s.Collections, s.Comments, s.Source.Snapshot)
fmt.Fprintf(stdout, "desktop: pages=%d blocks=%d teams=%d collections=%d comments=%d snapshot=%s\n", s.Pages, s.Blocks, s.Teams, s.Collections, s.Comments, s.Source.Snapshot)
case "api":
s, err := notionapi.Client{
BaseURL: cfg.Notion.API.BaseURL,
@ -223,7 +223,7 @@ func runSync(ctx context.Context, stdout io.Writer, cfg config.Config, args []st
if err != nil {
return err
}
fmt.Fprintf(stdout, "desktop: pages=%d blocks=%d collections=%d comments=%d snapshot=%s\n", s.Pages, s.Blocks, s.Collections, s.Comments, s.Source.Snapshot)
fmt.Fprintf(stdout, "desktop: pages=%d blocks=%d teams=%d collections=%d comments=%d snapshot=%s\n", s.Pages, s.Blocks, s.Teams, s.Collections, s.Comments, s.Source.Snapshot)
}
if cfg.Notion.API.Enabled && cfg.APIToken() != "" {
s, err := notionapi.Client{

View File

@ -61,6 +61,14 @@ func (e Exporter) writePage(ctx context.Context, page store.Page) (string, error
if err != nil {
return "", err
}
teamID, err := e.Store.PageTeamID(ctx, page)
if err != nil {
return "", err
}
teamName, err := e.Store.TeamName(ctx, teamID)
if err != nil {
return "", err
}
blocks, err := e.Store.PageBlocks(ctx, page.ID)
if err != nil {
return "", err
@ -72,12 +80,17 @@ func (e Exporter) writePage(ctx context.Context, page store.Page) (string, error
spaceSlug := notiontext.Slug(spaceName)
titleSlug := maxSlug(notiontext.Slug(page.Title), 96)
name := fmt.Sprintf("%s-%s.md", titleSlug, notiontext.ShortID(page.ID))
path := filepath.Join(e.Dir, spaceSlug, name)
parts := []string{e.Dir, spaceSlug}
if teamName != "" {
parts = append(parts, notiontext.Slug(teamName))
}
parts = append(parts, name)
path := filepath.Join(parts...)
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
return "", err
}
var b strings.Builder
writeFrontMatter(&b, page, spaceName)
writeFrontMatter(&b, page, spaceName, teamID, teamName)
if page.Title != "" {
fmt.Fprintf(&b, "# %s\n\n", notiontext.MarkdownEscape(page.Title))
}
@ -99,11 +112,13 @@ func (e Exporter) writePage(ctx context.Context, page store.Page) (string, error
return path, os.WriteFile(path, []byte(out), 0o644)
}
func writeFrontMatter(b *strings.Builder, page store.Page, spaceName string) {
func writeFrontMatter(b *strings.Builder, page store.Page, spaceName, teamID, teamName string) {
b.WriteString("---\n")
writeKV(b, "id", page.ID)
writeKV(b, "space_id", page.SpaceID)
writeKV(b, "space", spaceName)
writeKV(b, "team_id", teamID)
writeKV(b, "team", teamName)
writeKV(b, "title", page.Title)
writeKV(b, "source", page.Source)
writeKV(b, "notion_url", page.URL)

View File

@ -105,6 +105,67 @@ func TestExporterPreservesUnicodePathNames(t *testing.T) {
}
}
// TestExporterUsesWorkspaceAndTeamspacePath stores one space, one team in that
// space, and one page whose parent is the team, then checks that the export
// lands at <dir>/<space-slug>/<team-slug>/<title>-<id>.md and that the written
// front matter carries both the team id and the team name.
func TestExporterUsesWorkspaceAndTeamspacePath(t *testing.T) {
	ctx := context.Background()
	db, err := store.Open(filepath.Join(t.TempDir(), "notcrawl.db"))
	if err != nil {
		t.Fatal(err)
	}
	defer db.Close()

	syncedAt := store.NowMS()
	space := store.Space{ID: "space1", Name: "Acme Org", Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertSpace(ctx, space); err != nil {
		t.Fatal(err)
	}
	team := store.Team{ID: "team1", SpaceID: "space1", Name: "Research Lab", Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertTeam(ctx, team); err != nil {
		t.Fatal(err)
	}
	page := store.Page{ID: "page1", SpaceID: "space1", ParentID: "team1", ParentTable: "team", Title: "Plan", Alive: true, Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertPage(ctx, page); err != nil {
		t.Fatal(err)
	}

	outDir := t.TempDir()
	summary, err := Exporter{Store: db, Dir: outDir}.Export(ctx)
	if err != nil {
		t.Fatal(err)
	}

	// Exactly one file is expected, nested under the space and team slugs.
	want := filepath.Join(outDir, "acme-org", "research-lab", "plan-page1.md")
	if len(summary.Files) != 1 || summary.Files[0] != want {
		t.Fatalf("unexpected export path: %+v, want %s", summary.Files, want)
	}

	raw, err := os.ReadFile(want)
	if err != nil {
		t.Fatal(err)
	}
	text := string(raw)
	for _, field := range []string{`team_id: "team1"`, `team: "Research Lab"`} {
		if !strings.Contains(text, field) {
			t.Fatalf("missing team front matter:\n%s", text)
		}
	}
}
// TestExporterUsesReadableMissingSpaceFallback exports a page whose space id
// has no matching space row and checks that the directory name is the
// shortened "space-<first8>-<last8>" form rather than the full UUID.
func TestExporterUsesReadableMissingSpaceFallback(t *testing.T) {
	ctx := context.Background()
	db, err := store.Open(filepath.Join(t.TempDir(), "notcrawl.db"))
	if err != nil {
		t.Fatal(err)
	}
	defer db.Close()

	syncedAt := store.NowMS()
	// No space is upserted for this id, so the exporter has to fall back.
	const spaceID = "52f1c029-ec85-4ff5-bd43-c6d6ea9259e0"
	orphan := store.Page{ID: "page1", SpaceID: spaceID, Title: "Loose", Alive: true, Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertPage(ctx, orphan); err != nil {
		t.Fatal(err)
	}

	outDir := t.TempDir()
	summary, err := Exporter{Store: db, Dir: outDir}.Export(ctx)
	if err != nil {
		t.Fatal(err)
	}

	want := filepath.Join(outDir, "space-52f1c029-ea9259e0", "loose-page1.md")
	if len(summary.Files) != 1 || summary.Files[0] != want {
		t.Fatalf("unexpected export path: %+v, want %s", summary.Files, want)
	}
}
func TestExporterPrunesStaleMarkdown(t *testing.T) {
ctx := context.Background()
st, err := store.Open(filepath.Join(t.TempDir(), "notcrawl.db"))

View File

@ -249,15 +249,16 @@ func (c Client) ingestCollection(ctx context.Context, st *store.Store, collectio
name = id
}
if err := st.UpsertCollection(ctx, store.Collection{
ID: id,
SpaceID: parent.string("workspace"),
ParentID: parentID,
Name: name,
SchemaJSON: marshalAny(collection["properties"]),
FormatJSON: marshalAny(collection),
RawJSON: raw,
Source: SourceName,
SyncedAt: store.NowMS(),
ID: id,
SpaceID: parent.string("workspace"),
ParentID: parentID,
ParentTable: parent.string("type"),
Name: name,
SchemaJSON: marshalAny(collection["properties"]),
FormatJSON: marshalAny(collection),
RawJSON: raw,
Source: SourceName,
SyncedAt: store.NowMS(),
}); err != nil {
return 0, err
}

View File

@ -28,6 +28,7 @@ type Summary struct {
Source Source
Spaces int
Users int
Teams int
Pages int
Blocks int
Collections int
@ -68,6 +69,9 @@ func Ingest(ctx context.Context, st *store.Store, path, cacheDir string) (Summar
if s.Users, err = ingestUsers(ctx, st, db); err != nil {
return s, err
}
if s.Teams, err = ingestTeams(ctx, st, db); err != nil {
return s, err
}
if s.Collections, err = ingestCollections(ctx, st, db); err != nil {
return s, err
}
@ -176,9 +180,38 @@ func ingestUsers(ctx context.Context, st *store.Store, db *sql.DB) (int, error)
return n, rows.Err()
}
// ingestTeams reads every row of the source database's team table whose
// archived_at is NULL or 0 and upserts it into st as a store.Team.
//
// It returns the number of rows upserted so far together with the first error
// hit while scanning, upserting, or iterating. A failure of the query itself is
// passed through ignoreMissingTable (defined elsewhere; presumably it tolerates
// databases that have no team table — confirm) and reported with a count of 0.
func ingestTeams(ctx context.Context, st *store.Store, db *sql.DB) (int, error) {
	rows, err := db.QueryContext(ctx, `select id, space_id, parent_id, parent_table, coalesce(name, ''),
coalesce(json_object('id', id, 'space_id', space_id, 'parent_id', parent_id, 'parent_table', parent_table,
'name', name, 'description', description, 'team_pages', team_pages, 'settings', settings), '{}')
from team where coalesce(archived_at, 0) = 0`)
	if err != nil {
		return 0, ignoreMissingTable(err)
	}
	defer rows.Close()

	n := 0
	for rows.Next() {
		var x store.Team
		// space_id, parent_id and parent_table are selected without coalesce.
		// Scanning a SQL NULL directly into a string field is an error, which
		// would abort the whole ingest, so go through NullString and let NULL
		// become the empty string.
		var spaceID, parentID, parentTable sql.NullString
		if err := rows.Scan(&x.ID, &spaceID, &parentID, &parentTable, &x.Name, &x.RawJSON); err != nil {
			return n, err
		}
		x.SpaceID = spaceID.String
		x.ParentID = parentID.String
		x.ParentTable = parentTable.String
		// A team without a name is stored under its own id.
		if x.Name == "" {
			x.Name = x.ID
		}
		x.Source = SourceName
		x.SyncedAt = store.NowMS()
		if err := st.UpsertTeam(ctx, x); err != nil {
			return n, err
		}
		n++
	}
	return n, rows.Err()
}
func ingestCollections(ctx context.Context, st *store.Store, db *sql.DB) (int, error) {
rows, err := db.QueryContext(ctx, `select id, space_id, parent_id, coalesce(name, ''), coalesce(schema, ''), coalesce(format, ''),
coalesce(json_object('id', id, 'space_id', space_id, 'parent_id', parent_id, 'name', name, 'schema', schema, 'format', format), '{}')
rows, err := db.QueryContext(ctx, `select id, space_id, parent_id, parent_table, coalesce(name, ''), coalesce(schema, ''), coalesce(format, ''),
coalesce(json_object('id', id, 'space_id', space_id, 'parent_id', parent_id, 'parent_table', parent_table,
'name', name, 'schema', schema, 'format', format), '{}')
from collection where alive = 1`)
if err != nil {
return 0, ignoreMissingTable(err)
@ -187,7 +220,7 @@ func ingestCollections(ctx context.Context, st *store.Store, db *sql.DB) (int, e
n := 0
for rows.Next() {
var x store.Collection
if err := rows.Scan(&x.ID, &x.SpaceID, &x.ParentID, &x.Name, &x.SchemaJSON, &x.FormatJSON, &x.RawJSON); err != nil {
if err := rows.Scan(&x.ID, &x.SpaceID, &x.ParentID, &x.ParentTable, &x.Name, &x.SchemaJSON, &x.FormatJSON, &x.RawJSON); err != nil {
return n, err
}
x.Name = notiontext.TitleFromProperties(x.Name)

View File

@ -21,6 +21,7 @@ import (
var exportTables = []string{
"spaces",
"users",
"teams",
"pages",
"blocks",
"collections",

View File

@ -3,6 +3,7 @@ package store
import (
"context"
"database/sql"
"strings"
)
func (s *Store) Pages(ctx context.Context) ([]Page, error) {
@ -28,7 +29,7 @@ func (s *Store) Pages(ctx context.Context) ([]Page, error) {
}
func (s *Store) Collections(ctx context.Context) ([]Collection, error) {
rows, err := s.db.QueryContext(ctx, `select id, space_id, parent_id, name, schema_json, format_json, raw_json, source, synced_at
rows, err := s.db.QueryContext(ctx, `select id, space_id, parent_id, parent_table, name, schema_json, format_json, raw_json, source, synced_at
from collections order by lower(coalesce(name, id)), id`)
if err != nil {
return nil, err
@ -37,7 +38,7 @@ func (s *Store) Collections(ctx context.Context) ([]Collection, error) {
var collections []Collection
for rows.Next() {
var c Collection
if err := rows.Scan(&c.ID, &c.SpaceID, &c.ParentID, &c.Name, &c.SchemaJSON, &c.FormatJSON, &c.RawJSON, &c.Source, &c.SyncedAt); err != nil {
if err := rows.Scan(&c.ID, &c.SpaceID, &c.ParentID, &c.ParentTable, &c.Name, &c.SchemaJSON, &c.FormatJSON, &c.RawJSON, &c.Source, &c.SyncedAt); err != nil {
return nil, err
}
collections = append(collections, c)
@ -47,8 +48,8 @@ func (s *Store) Collections(ctx context.Context) ([]Collection, error) {
func (s *Store) Collection(ctx context.Context, id string) (Collection, error) {
var c Collection
err := s.db.QueryRowContext(ctx, `select id, space_id, parent_id, name, schema_json, format_json, raw_json, source, synced_at
from collections where id = ?`, id).Scan(&c.ID, &c.SpaceID, &c.ParentID, &c.Name, &c.SchemaJSON, &c.FormatJSON, &c.RawJSON, &c.Source, &c.SyncedAt)
err := s.db.QueryRowContext(ctx, `select id, space_id, parent_id, parent_table, name, schema_json, format_json, raw_json, source, synced_at
from collections where id = ?`, id).Scan(&c.ID, &c.SpaceID, &c.ParentID, &c.ParentTable, &c.Name, &c.SchemaJSON, &c.FormatJSON, &c.RawJSON, &c.Source, &c.SyncedAt)
return c, err
}
@ -126,12 +127,83 @@ func (s *Store) SpaceName(ctx context.Context, id string) (string, error) {
err := s.db.QueryRowContext(ctx, `select name from spaces where id = ?`, id).Scan(&name)
if err != nil {
if err == sql.ErrNoRows {
return id, nil
return "space-" + shortID(id), nil
}
return "", err
}
if name.Valid && name.String != "" {
return name.String, nil
}
return id, nil
return "space-" + shortID(id), nil
}
// TeamName returns a display name for the team identified by id.
//
// An empty id yields an empty name. When the teams table has no row for id, or
// the stored name is NULL or empty, the result is "team-" followed by
// shortID(id). Any other query error is returned with an empty name.
func (s *Store) TeamName(ctx context.Context, id string) (string, error) {
	if id == "" {
		return "", nil
	}
	var stored sql.NullString
	row := s.db.QueryRowContext(ctx, `select name from teams where id = ?`, id)
	switch err := row.Scan(&stored); {
	case err == sql.ErrNoRows:
		// Unknown team: use the generated name below.
	case err != nil:
		return "", err
	case stored.Valid && stored.String != "":
		return stored.String, nil
	}
	return "team-" + shortID(id), nil
}
// PageTeamID returns the id of the team that page hangs under, found by
// following the page's parent reference (ParentTable/ParentID, with
// CollectionID as the stand-in when a collection parent has no id) through
// resolveTeamID. It returns "" when no team can be resolved.
func (s *Store) PageTeamID(ctx context.Context, page Page) (string, error) {
	visited := make(map[string]bool)
	// NOTE(review): resolveTeamID keys this set as "table:id", so this bare
	// page id never matches one of its lookups — confirm whether a prefixed
	// key was intended.
	visited[page.ID] = true
	return s.resolveTeamID(ctx, page.ParentTable, page.ParentID, page.CollectionID, visited)
}
// resolveTeamID climbs parent references starting at (table, id) until it
// reaches a parent whose table is "team", and returns that parent's id.
//
// Parents in table "block" are looked up in blocks; parents in "collection",
// "database" or "data_source" are looked up in collections. For a "collection"
// parent with an empty id, collectionID is used instead (first step only; later
// steps carry no fallback). The walk ends with "" and no error when the id is
// empty, the table is not one of the above, the referenced row does not exist,
// or the "table:id" pair is already recorded in seen, which guards against
// cycles. Other query errors are returned as-is.
func (s *Store) resolveTeamID(ctx context.Context, table, id, collectionID string, seen map[string]bool) (string, error) {
	for {
		if table == "team" {
			return id, nil
		}
		if id == "" && table == "collection" {
			id = collectionID
		}
		if id == "" {
			return "", nil
		}
		key := table + ":" + id
		if seen[key] {
			return "", nil
		}
		seen[key] = true

		// Pick the table that holds this kind of parent.
		var lookup string
		switch table {
		case "block":
			lookup = `select parent_id, parent_table from blocks where id = ?`
		case "collection", "database", "data_source":
			lookup = `select parent_id, parent_table from collections where id = ?`
		default:
			return "", nil
		}

		var nextID, nextTable sql.NullString
		err := s.db.QueryRowContext(ctx, lookup, id).Scan(&nextID, &nextTable)
		if err == sql.ErrNoRows {
			return "", nil
		}
		if err != nil {
			return "", err
		}
		// Step up one level; NULL columns become "" and end the walk above.
		table, id, collectionID = nextTable.String, nextID.String, ""
	}
}
// shortID condenses id for use in generated fallback names. Dashes are removed
// first; an id that is then empty becomes "unknown", one of at most 16 bytes is
// returned as-is, and anything longer is reduced to its first 8 and last 8
// bytes joined by a dash.
func shortID(id string) string {
	compact := strings.ReplaceAll(id, "-", "")
	switch n := len(compact); {
	case n == 0:
		return "unknown"
	case n <= 16:
		return compact
	default:
		head, tail := compact[:8], compact[n-8:]
		return head + "-" + tail
	}
}

View File

@ -99,6 +99,7 @@ type Status struct {
WALBytes int64 `json:"wal_bytes"`
Spaces int `json:"spaces"`
Users int `json:"users"`
Teams int `json:"teams"`
Pages int `json:"pages"`
Blocks int `json:"blocks"`
Collections int `json:"collections"`
@ -141,6 +142,17 @@ func (s *Store) init(ctx context.Context) error {
source text not null,
synced_at integer not null
)`,
`create table if not exists teams (
id text primary key,
space_id text,
parent_id text,
parent_table text,
name text not null,
raw_json text,
source text not null,
synced_at integer not null
)`,
`create index if not exists teams_space_id on teams(space_id)`,
`create table if not exists pages (
id text primary key,
space_id text,
@ -188,6 +200,7 @@ func (s *Store) init(ctx context.Context) error {
id text primary key,
space_id text,
parent_id text,
parent_table text,
name text,
schema_json text,
format_json text,
@ -253,6 +266,9 @@ func (s *Store) init(ctx context.Context) error {
if err := s.ensureColumn(ctx, "blocks", "display_order", "integer not null default 0"); err != nil {
return err
}
if err := s.ensureColumn(ctx, "collections", "parent_table", "text"); err != nil {
return err
}
if _, err := s.db.ExecContext(ctx, `create index if not exists blocks_page_alive_order on blocks(page_id, alive, parent_id, display_order, created_time, id)`); err != nil {
return err
}
@ -322,6 +338,21 @@ func (s *Store) UpsertUser(ctx context.Context, x User) error {
return err
}
// UpsertTeam writes x to the teams table. A new id is inserted; when a row with
// the same id already exists, every other column is overwritten with x's
// values. The error from the statement, if any, is returned unchanged.
func (s *Store) UpsertTeam(ctx context.Context, x Team) error {
	const stmt = `insert into teams(id, space_id, parent_id, parent_table, name, raw_json, source, synced_at)
values (?, ?, ?, ?, ?, ?, ?, ?)
on conflict(id) do update set
space_id=excluded.space_id,
parent_id=excluded.parent_id,
parent_table=excluded.parent_table,
name=excluded.name,
raw_json=excluded.raw_json,
source=excluded.source,
synced_at=excluded.synced_at`
	if _, err := s.db.ExecContext(ctx, stmt,
		x.ID, x.SpaceID, x.ParentID, x.ParentTable,
		x.Name, x.RawJSON, x.Source, x.SyncedAt,
	); err != nil {
		return err
	}
	return nil
}
func (s *Store) UpsertPage(ctx context.Context, x Page) error {
_, err := s.db.ExecContext(ctx, `insert into pages(
id, space_id, parent_id, parent_table, collection_id, title, url, icon, cover, properties_json,
@ -385,12 +416,12 @@ func (s *Store) UpsertBlock(ctx context.Context, x Block) error {
}
func (s *Store) UpsertCollection(ctx context.Context, x Collection) error {
_, err := s.db.ExecContext(ctx, `insert into collections(id, space_id, parent_id, name, schema_json, format_json, raw_json, source, synced_at)
values (?, ?, ?, ?, ?, ?, ?, ?, ?)
on conflict(id) do update set space_id=excluded.space_id, parent_id=excluded.parent_id, name=excluded.name,
_, err := s.db.ExecContext(ctx, `insert into collections(id, space_id, parent_id, parent_table, name, schema_json, format_json, raw_json, source, synced_at)
values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?)
on conflict(id) do update set space_id=excluded.space_id, parent_id=excluded.parent_id, parent_table=excluded.parent_table, name=excluded.name,
schema_json=excluded.schema_json, format_json=excluded.format_json, raw_json=excluded.raw_json,
source=excluded.source, synced_at=excluded.synced_at`,
x.ID, x.SpaceID, x.ParentID, x.Name, x.SchemaJSON, x.FormatJSON, x.RawJSON, x.Source, x.SyncedAt)
x.ID, x.SpaceID, x.ParentID, x.ParentTable, x.Name, x.SchemaJSON, x.FormatJSON, x.RawJSON, x.Source, x.SyncedAt)
return err
}
@ -560,6 +591,7 @@ func (s *Store) Status(ctx context.Context) (Status, error) {
}{
{`select count(*) from spaces`, &status.Spaces},
{`select count(*) from users`, &status.Users},
{`select count(*) from teams`, &status.Teams},
{`select count(*) from pages`, &status.Pages},
{`select count(*) from blocks`, &status.Blocks},
{`select count(*) from collections`, &status.Collections},

View File

@ -94,6 +94,62 @@ func TestStoreBuildsPageFTSInDisplayTreeOrder(t *testing.T) {
}
}
// TestStoreResolvesPageTeamThroughCollectionParent checks that a page whose
// parent is a collection resolves to the team that the collection is parented
// by.
func TestStoreResolvesPageTeamThroughCollectionParent(t *testing.T) {
	db, err := Open(filepath.Join(t.TempDir(), "notcrawl.db"))
	if err != nil {
		t.Fatal(err)
	}
	defer db.Close()

	ctx := context.Background()
	syncedAt := NowMS()

	// Chain under test: page1 -> collection1 -> team1.
	team := Team{ID: "team1", SpaceID: "space1", Name: "Research", Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertTeam(ctx, team); err != nil {
		t.Fatal(err)
	}
	roadmap := Collection{ID: "collection1", SpaceID: "space1", ParentID: "team1", ParentTable: "team", Name: "Roadmap", Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertCollection(ctx, roadmap); err != nil {
		t.Fatal(err)
	}
	row := Page{ID: "page1", SpaceID: "space1", ParentID: "collection1", ParentTable: "collection", CollectionID: "collection1", Title: "Row", Alive: true, Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertPage(ctx, row); err != nil {
		t.Fatal(err)
	}

	got, err := db.PageTeamID(ctx, row)
	if err != nil {
		t.Fatal(err)
	}
	if got != "team1" {
		t.Fatalf("expected team1, got %q", got)
	}
}
// TestStoreResolvesPageTeamThroughBlockParent checks that a page whose parent
// is a block resolves to the team that the block is parented by.
func TestStoreResolvesPageTeamThroughBlockParent(t *testing.T) {
	db, err := Open(filepath.Join(t.TempDir(), "notcrawl.db"))
	if err != nil {
		t.Fatal(err)
	}
	defer db.Close()

	ctx := context.Background()
	syncedAt := NowMS()

	// Chain under test: page1 -> block1 -> team1.
	team := Team{ID: "team1", SpaceID: "space1", Name: "Research", Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertTeam(ctx, team); err != nil {
		t.Fatal(err)
	}
	holder := Block{ID: "block1", SpaceID: "space1", ParentID: "team1", ParentTable: "team", Type: "text", Text: "parent", Alive: true, Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertBlock(ctx, holder); err != nil {
		t.Fatal(err)
	}
	child := Page{ID: "page1", SpaceID: "space1", ParentID: "block1", ParentTable: "block", Title: "Child", Alive: true, Source: "test", SyncedAt: syncedAt}
	if err := db.UpsertPage(ctx, child); err != nil {
		t.Fatal(err)
	}

	got, err := db.PageTeamID(ctx, child)
	if err != nil {
		t.Fatal(err)
	}
	if got != "team1" {
		t.Fatalf("expected team1, got %q", got)
	}
}
func TestStoreStatusAndOptimize(t *testing.T) {
path := filepath.Join(t.TempDir(), "notcrawl.db")
st, err := Open(path)

View File

@ -17,6 +17,17 @@ type User struct {
SyncedAt int64
}
// Team is one row of the teams table: a teamspace record together with the
// reference to its parent and bookkeeping about where and when it was synced.
type Team struct {
	// Identity and position in the hierarchy.
	ID, SpaceID           string
	ParentID, ParentTable string

	// Name is the display name; RawJSON holds the source record as JSON text.
	Name, RawJSON string

	// Source names the ingester that produced the row; SyncedAt is the sync
	// timestamp (presumably Unix milliseconds, as produced by NowMS — confirm).
	Source   string
	SyncedAt int64
}
type Page struct {
ID string
SpaceID string
@ -57,15 +68,16 @@ type Block struct {
}
type Collection struct {
ID string
SpaceID string
ParentID string
Name string
SchemaJSON string
FormatJSON string
RawJSON string
Source string
SyncedAt int64
ID string
SpaceID string
ParentID string
ParentTable string
Name string
SchemaJSON string
FormatJSON string
RawJSON string
Source string
SyncedAt int64
}
type Comment struct {