From ea691828c68424f0aec3e0fbd6fca9715b70dfb5 Mon Sep 17 00:00:00 2001 From: Vincent Koc Date: Wed, 29 Apr 2026 04:32:52 -0700 Subject: [PATCH] fix(export): add bulk database export (#22) --- README.md | 3 +- SPEC.md | 1 + cmd/notcrawl/main.go | 100 ++++++++++++++++++++++++++++++++++++++ cmd/notcrawl/main_test.go | 63 +++++++++++++++++++++++- 4 files changed, 165 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 75e1ae1..87e4426 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,7 @@ export NOTION_TOKEN="secret_..." notcrawl sync --source api notcrawl databases notcrawl export-db --database DATABASE_ID --format csv --output roadmap.csv +notcrawl export-db --all --dir exports/csv ``` Default paths: @@ -79,7 +80,7 @@ Default paths: - `sync` ingests from `desktop`, `api`, or `all` - `export-md` renders normalized Markdown files from SQLite - `databases` lists crawled Notion databases -- `export-db` exports a crawled Notion database to CSV or TSV +- `export-db` exports one crawled Notion database, or all databases with `--all --dir`, to CSV or TSV - `search` searches page and comment text through FTS5 - `sql` runs read-only SQL against the archive - `publish` exports SQLite tables and Markdown into a git share repo diff --git a/SPEC.md b/SPEC.md index b1b968d..4e88456 100644 --- a/SPEC.md +++ b/SPEC.md @@ -163,6 +163,7 @@ those pages through `pages.collection_id`. ```text notcrawl export-db --database --format csv --output rows.csv notcrawl export-db --database --format tsv --output rows.tsv +notcrawl export-db --all --dir exports/csv ``` The first columns are stable metadata: diff --git a/cmd/notcrawl/main.go b/cmd/notcrawl/main.go index 659fb70..43395f9 100644 --- a/cmd/notcrawl/main.go +++ b/cmd/notcrawl/main.go @@ -8,12 +8,14 @@ import ( "fmt" "io" "os" + "path/filepath" "strings" "github.com/vincentkoc/notcrawl/internal/config" "github.com/vincentkoc/notcrawl/internal/markdown" "github.com/vincentkoc/notcrawl/internal/notionapi" "github.com/vincentkoc/notcrawl/internal/notiondesktop" + "github.com/vincentkoc/notcrawl/internal/notiontext" "github.com/vincentkoc/notcrawl/internal/report" "github.com/vincentkoc/notcrawl/internal/share" "github.com/vincentkoc/notcrawl/internal/store" @@ -276,11 +278,25 @@ func runDatabases(ctx context.Context, stdout io.Writer, cfg config.Config) erro func runExportDatabase(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error { fs := flag.NewFlagSet("export-db", flag.ContinueOnError) databaseID := fs.String("database", "", "database id to export") + all := fs.Bool("all", false, "export every crawled database") + dir := fs.String("dir", "", "directory for --all exports") format := fs.String("format", "csv", "output format: csv or tsv") output := fs.String("output", "", "output file path, defaults to stdout") if err := fs.Parse(args); err != nil { return err } + if *all { + if *databaseID != "" { + return fmt.Errorf("export-db cannot combine --all and --database") + } + if *output != "" { + return fmt.Errorf("export-db cannot combine --all and --output") + } + if *dir == "" { + return fmt.Errorf("export-db --all requires --dir") + } + return runExportAllDatabases(ctx, stdout, cfg, tableexport.Format(*format), *dir) + } if *databaseID == "" { return fmt.Errorf("export-db requires --database") } @@ -313,6 +329,89 @@ func runExportDatabase(ctx context.Context, stdout io.Writer, cfg config.Config, return nil } +func runExportAllDatabases(ctx context.Context, stdout io.Writer, cfg config.Config, format tableexport.Format, dir string) error { + ext, err := exportExtension(format) + if err != nil { + return err + } + dir, err = config.ExpandPath(dir) + if err != nil { + return err + } + if err := os.MkdirAll(dir, 0o755); err != nil { + return err + } + st, err := store.Open(cfg.DBPath) + if err != nil { + return err + } + defer st.Close() + collections, err := st.Collections(ctx) + if err != nil { + return err + } + index, err := os.Create(filepath.Join(dir, "index.tsv")) + if err != nil { + return err + } + fmt.Fprintln(index, "id\tname\tsource\trows\tcolumns\tfile") + exporter := tableexport.Exporter{Store: st} + used := map[string]bool{} + var databases, rows int + for _, collection := range collections { + name := exportDatabaseFilename(collection, ext, used) + path := filepath.Join(dir, name) + file, err := os.Create(path) + if err != nil { + _ = index.Close() + return err + } + s, exportErr := exporter.Export(ctx, collection.ID, format, file) + closeErr := file.Close() + if exportErr != nil { + _ = index.Close() + return exportErr + } + if closeErr != nil { + _ = index.Close() + return closeErr + } + databases++ + rows += s.Rows + fmt.Fprintf(index, "%s\t%s\t%s\t%d\t%d\t%s\n", collection.ID, collection.Name, collection.Source, s.Rows, s.Columns, name) + } + if err := index.Close(); err != nil { + return err + } + fmt.Fprintf(stdout, "exported %d databases and %d rows to %s\n", databases, rows, dir) + return nil +} + +func exportExtension(format tableexport.Format) (string, error) { + switch format { + case "", tableexport.FormatCSV: + return "csv", nil + case tableexport.FormatTSV: + return "tsv", nil + default: + return "", fmt.Errorf("unsupported format %q", format) + } +} + +func exportDatabaseFilename(collection store.Collection, ext string, used map[string]bool) string { + baseName := collection.Name + if strings.TrimSpace(baseName) == "" { + baseName = collection.ID + } + base := notiontext.Slug(baseName) + "-" + notiontext.ShortID(collection.ID) + name := base + "." + ext + for i := 2; used[name]; i++ { + name = fmt.Sprintf("%s-%d.%s", base, i, ext) + } + used[name] = true + return name +} + func runSearch(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error { if len(args) == 0 { return fmt.Errorf("search query required") @@ -489,6 +588,7 @@ Commands: export-md Render normalized Markdown from SQLite databases List crawled Notion databases export-db --database ID Export a database as CSV or TSV + export-db --all --dir DIR Export every database as CSV or TSV search QUERY Search page text sql QUERY Run read-only SQL publish [--push] Export data and Markdown into a git share repo diff --git a/cmd/notcrawl/main_test.go b/cmd/notcrawl/main_test.go index 632ff48..c90ef6f 100644 --- a/cmd/notcrawl/main_test.go +++ b/cmd/notcrawl/main_test.go @@ -1,6 +1,15 @@ package main -import "testing" +import ( + "bytes" + "context" + "os" + "path/filepath" + "strings" + "testing" + + "github.com/vincentkoc/notcrawl/internal/store" +) func TestSearchFieldCollapsesRecordSeparators(t *testing.T) { got := searchField("line one\nline\ttwo line three") @@ -8,3 +17,55 @@ func TestSearchFieldCollapsesRecordSeparators(t *testing.T) { t.Fatalf("unexpected field: %q", got) } } + +func TestExportDatabaseAllWritesFilesAndIndex(t *testing.T) { + ctx := context.Background() + dir := t.TempDir() + dbPath := filepath.Join(dir, "notcrawl.db") + st, err := store.Open(dbPath) + if err != nil { + t.Fatal(err) + } + now := store.NowMS() + for _, collection := range []store.Collection{ + {ID: "db1", Name: "Roadmap", Source: "test", SyncedAt: now, SchemaJSON: `{"Name":{"type":"title"}}`}, + {ID: "db2", Name: "Launch Plan", Source: "test", SyncedAt: now, SchemaJSON: `{"Task":{"type":"title"}}`}, + } { + if err := st.UpsertCollection(ctx, collection); err != nil { + t.Fatal(err) + } + } + if err := st.UpsertPage(ctx, store.Page{ + ID: "page1", CollectionID: "db1", Title: "Ship", URL: "https://example.com/ship", Alive: true, Source: "test", SyncedAt: now, + PropertiesJSON: `{"Name":{"type":"title","title":[{"plain_text":"Ship"}]}}`, + }); err != nil { + t.Fatal(err) + } + if err := st.Close(); err != nil { + t.Fatal(err) + } + + outDir := filepath.Join(dir, "csv") + var stdout, stderr bytes.Buffer + err = run(ctx, []string{"--config", filepath.Join(dir, "missing.toml"), "--db", dbPath, "export-db", "--all", "--dir", outDir}, &stdout, &stderr) + if err != nil { + t.Fatalf("export-db --all failed: %v\nstderr:\n%s", err, stderr.String()) + } + if got := stdout.String(); !strings.Contains(got, "exported 2 databases and 1 rows") { + t.Fatalf("unexpected stdout: %s", got) + } + for _, name := range []string{"roadmap-db1.csv", "launch-plan-db2.csv", "index.tsv"} { + if _, err := os.Stat(filepath.Join(outDir, name)); err != nil { + t.Fatalf("missing %s: %v", name, err) + } + } + index, err := os.ReadFile(filepath.Join(outDir, "index.tsv")) + if err != nil { + t.Fatal(err) + } + for _, want := range []string{"id\tname\tsource\trows\tcolumns\tfile", "db1\tRoadmap\ttest\t1\t4\troadmap-db1.csv"} { + if !strings.Contains(string(index), want) { + t.Fatalf("index missing %q:\n%s", want, index) + } + } +}