fix(export): add bulk database export (#22)

This commit is contained in:
Vincent Koc 2026-04-29 04:32:52 -07:00 committed by GitHub
parent 2dcf13a432
commit ea691828c6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 165 additions and 2 deletions

View File

@ -59,6 +59,7 @@ export NOTION_TOKEN="secret_..."
notcrawl sync --source api
notcrawl databases
notcrawl export-db --database DATABASE_ID --format csv --output roadmap.csv
notcrawl export-db --all --dir exports/csv
```
Default paths:
@ -79,7 +80,7 @@ Default paths:
- `sync` ingests from `desktop`, `api`, or `all`
- `export-md` renders normalized Markdown files from SQLite
- `databases` lists crawled Notion databases
- `export-db` exports a crawled Notion database to CSV or TSV
- `export-db` exports one crawled Notion database, or all databases with `--all --dir`, to CSV or TSV
- `search` searches page and comment text through FTS5
- `sql` runs read-only SQL against the archive
- `publish` exports SQLite tables and Markdown into a git share repo

View File

@ -163,6 +163,7 @@ those pages through `pages.collection_id`.
```text
notcrawl export-db --database <database-id> --format csv --output rows.csv
notcrawl export-db --database <database-id> --format tsv --output rows.tsv
notcrawl export-db --all --dir exports/csv
```
The first columns are stable metadata:

View File

@ -8,12 +8,14 @@ import (
"fmt"
"io"
"os"
"path/filepath"
"strings"
"github.com/vincentkoc/notcrawl/internal/config"
"github.com/vincentkoc/notcrawl/internal/markdown"
"github.com/vincentkoc/notcrawl/internal/notionapi"
"github.com/vincentkoc/notcrawl/internal/notiondesktop"
"github.com/vincentkoc/notcrawl/internal/notiontext"
"github.com/vincentkoc/notcrawl/internal/report"
"github.com/vincentkoc/notcrawl/internal/share"
"github.com/vincentkoc/notcrawl/internal/store"
@ -276,11 +278,25 @@ func runDatabases(ctx context.Context, stdout io.Writer, cfg config.Config) erro
func runExportDatabase(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
fs := flag.NewFlagSet("export-db", flag.ContinueOnError)
databaseID := fs.String("database", "", "database id to export")
all := fs.Bool("all", false, "export every crawled database")
dir := fs.String("dir", "", "directory for --all exports")
format := fs.String("format", "csv", "output format: csv or tsv")
output := fs.String("output", "", "output file path, defaults to stdout")
if err := fs.Parse(args); err != nil {
return err
}
if *all {
if *databaseID != "" {
return fmt.Errorf("export-db cannot combine --all and --database")
}
if *output != "" {
return fmt.Errorf("export-db cannot combine --all and --output")
}
if *dir == "" {
return fmt.Errorf("export-db --all requires --dir")
}
return runExportAllDatabases(ctx, stdout, cfg, tableexport.Format(*format), *dir)
}
if *databaseID == "" {
return fmt.Errorf("export-db requires --database")
}
@ -313,6 +329,89 @@ func runExportDatabase(ctx context.Context, stdout io.Writer, cfg config.Config,
return nil
}
// runExportAllDatabases exports every collection returned by the store into
// its own file under dir and writes an "index.tsv" manifest next to them.
//
// The manifest has one header line followed by one row per exported database
// with the columns id, name, source, rows, columns and file. File names come
// from exportDatabaseFilename and the extension from exportExtension, so an
// unsupported format is rejected before anything is created on disk.
//
// On success a one-line summary (database count, total rows, target
// directory) is printed to stdout. The first failure aborts the export and is
// returned; files written before the failure are left in place.
func runExportAllDatabases(ctx context.Context, stdout io.Writer, cfg config.Config, format tableexport.Format, dir string) error {
	ext, err := exportExtension(format)
	if err != nil {
		return err
	}
	dir, err = config.ExpandPath(dir)
	if err != nil {
		return err
	}
	if err := os.MkdirAll(dir, 0o755); err != nil {
		return err
	}
	st, err := store.Open(cfg.DBPath)
	if err != nil {
		return err
	}
	defer st.Close()
	collections, err := st.Collections(ctx)
	if err != nil {
		return err
	}

	const indexName = "index.tsv"
	index, err := os.Create(filepath.Join(dir, indexName))
	if err != nil {
		return err
	}
	// Early returns close the index best-effort (the error being returned is
	// the one that matters); the success path closes it explicitly below so a
	// close failure is still reported.
	indexOpen := true
	defer func() {
		if indexOpen {
			_ = index.Close()
		}
	}()

	// The file is unbuffered, so a failed write would not surface on Close;
	// every write to the index is checked individually.
	if _, err := fmt.Fprintln(index, "id\tname\tsource\trows\tcolumns\tfile"); err != nil {
		return err
	}

	// Database names are free-form user text; a raw tab or line break would
	// shift columns or split the row in the TSV index.
	fieldCleaner := strings.NewReplacer("\t", " ", "\r", " ", "\n", " ")

	exporter := tableexport.Exporter{Store: st}
	// Reserve the index file name so no database export can overwrite it.
	used := map[string]bool{indexName: true}
	var databases, rows int
	for _, collection := range collections {
		name := exportDatabaseFilename(collection, ext, used)
		file, err := os.Create(filepath.Join(dir, name))
		if err != nil {
			return err
		}
		summary, exportErr := exporter.Export(ctx, collection.ID, format, file)
		// Always close the data file, but report the export error first: it
		// is the root cause when both fail.
		closeErr := file.Close()
		if exportErr != nil {
			return fmt.Errorf("exporting database %s: %w", collection.ID, exportErr)
		}
		if closeErr != nil {
			return closeErr
		}
		databases++
		rows += summary.Rows
		if _, err := fmt.Fprintf(index, "%s\t%s\t%s\t%d\t%d\t%s\n",
			collection.ID, fieldCleaner.Replace(collection.Name), collection.Source,
			summary.Rows, summary.Columns, name); err != nil {
			return err
		}
	}

	indexOpen = false
	if err := index.Close(); err != nil {
		return err
	}
	fmt.Fprintf(stdout, "exported %d databases and %d rows to %s\n", databases, rows, dir)
	return nil
}
// exportExtension returns the file extension (without a leading dot) used for
// files written in the given export format. The empty format is treated the
// same as CSV; any format other than CSV or TSV yields an error.
func exportExtension(format tableexport.Format) (string, error) {
	if format == "" || format == tableexport.FormatCSV {
		return "csv", nil
	}
	if format == tableexport.FormatTSV {
		return "tsv", nil
	}
	return "", fmt.Errorf("unsupported format %q", format)
}
// exportDatabaseFilename picks a file name for a collection's export that has
// not been handed out yet and records it in used.
//
// The name is built from the slug of the collection name (or of the ID when
// the name is blank), a dash, the short form of the collection ID, and the
// extension ext. If that name is already present in used, a numeric suffix
// starting at 2 is appended before the extension until a free name is found.
func exportDatabaseFilename(collection store.Collection, ext string, used map[string]bool) string {
	label := collection.ID
	if strings.TrimSpace(collection.Name) != "" {
		label = collection.Name
	}
	stem := notiontext.Slug(label) + "-" + notiontext.ShortID(collection.ID)

	candidate := stem + "." + ext
	suffix := 2
	for used[candidate] {
		candidate = fmt.Sprintf("%s-%d.%s", stem, suffix, ext)
		suffix++
	}
	used[candidate] = true
	return candidate
}
func runSearch(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
if len(args) == 0 {
return fmt.Errorf("search query required")
@ -489,6 +588,7 @@ Commands:
export-md Render normalized Markdown from SQLite
databases List crawled Notion databases
export-db --database ID Export a database as CSV or TSV
export-db --all --dir DIR Export every database as CSV or TSV
search QUERY Search page text
sql QUERY Run read-only SQL
publish [--push] Export data and Markdown into a git share repo

View File

@ -1,6 +1,15 @@
package main
import "testing"
import (
"bytes"
"context"
"os"
"path/filepath"
"strings"
"testing"
"github.com/vincentkoc/notcrawl/internal/store"
)
func TestSearchFieldCollapsesRecordSeparators(t *testing.T) {
got := searchField("line one\nline\ttwo line three")
@ -8,3 +17,55 @@ func TestSearchFieldCollapsesRecordSeparators(t *testing.T) {
t.Fatalf("unexpected field: %q", got)
}
}
// TestExportDatabaseAllWritesFilesAndIndex seeds a temporary store with two
// collections and a single page belonging to the first one, runs
// "export-db --all --dir <tmp>/csv" through run, and checks the summary line
// on stdout, the presence of both per-database files and index.tsv, and that
// the index holds the header plus the row for db1.
func TestExportDatabaseAllWritesFilesAndIndex(t *testing.T) {
	ctx := context.Background()
	root := t.TempDir()
	dbPath := filepath.Join(root, "notcrawl.db")

	// Seed the store, then close it before the command reopens it.
	st, err := store.Open(dbPath)
	if err != nil {
		t.Fatal(err)
	}
	syncedAt := store.NowMS()
	seedCollections := []store.Collection{
		{ID: "db1", Name: "Roadmap", Source: "test", SyncedAt: syncedAt, SchemaJSON: `{"Name":{"type":"title"}}`},
		{ID: "db2", Name: "Launch Plan", Source: "test", SyncedAt: syncedAt, SchemaJSON: `{"Task":{"type":"title"}}`},
	}
	for _, c := range seedCollections {
		if err := st.UpsertCollection(ctx, c); err != nil {
			t.Fatal(err)
		}
	}
	seedPage := store.Page{
		ID:             "page1",
		CollectionID:   "db1",
		Title:          "Ship",
		URL:            "https://example.com/ship",
		Alive:          true,
		Source:         "test",
		SyncedAt:       syncedAt,
		PropertiesJSON: `{"Name":{"type":"title","title":[{"plain_text":"Ship"}]}}`,
	}
	if err := st.UpsertPage(ctx, seedPage); err != nil {
		t.Fatal(err)
	}
	if err := st.Close(); err != nil {
		t.Fatal(err)
	}

	// Run the command end to end against the seeded database.
	outDir := filepath.Join(root, "csv")
	args := []string{
		"--config", filepath.Join(root, "missing.toml"),
		"--db", dbPath,
		"export-db", "--all", "--dir", outDir,
	}
	var stdout, stderr bytes.Buffer
	if err := run(ctx, args, &stdout, &stderr); err != nil {
		t.Fatalf("export-db --all failed: %v\nstderr:\n%s", err, stderr.String())
	}

	printed := stdout.String()
	if !strings.Contains(printed, "exported 2 databases and 1 rows") {
		t.Fatalf("unexpected stdout: %s", printed)
	}

	expectedFiles := []string{"roadmap-db1.csv", "launch-plan-db2.csv", "index.tsv"}
	for _, name := range expectedFiles {
		if _, err := os.Stat(filepath.Join(outDir, name)); err != nil {
			t.Fatalf("missing %s: %v", name, err)
		}
	}

	index, err := os.ReadFile(filepath.Join(outDir, "index.tsv"))
	if err != nil {
		t.Fatal(err)
	}
	expectedLines := []string{
		"id\tname\tsource\trows\tcolumns\tfile",
		"db1\tRoadmap\ttest\t1\t4\troadmap-db1.csv",
	}
	for _, want := range expectedLines {
		if !strings.Contains(string(index), want) {
			t.Fatalf("index missing %q:\n%s", want, index)
		}
	}
}