fix(export): add bulk database export (#22)
This commit is contained in:
parent
2dcf13a432
commit
ea691828c6
@ -59,6 +59,7 @@ export NOTION_TOKEN="secret_..."
|
||||
notcrawl sync --source api
|
||||
notcrawl databases
|
||||
notcrawl export-db --database DATABASE_ID --format csv --output roadmap.csv
|
||||
notcrawl export-db --all --dir exports/csv
|
||||
```
|
||||
|
||||
Default paths:
|
||||
@ -79,7 +80,7 @@ Default paths:
|
||||
- `sync` ingests from `desktop`, `api`, or `all`
|
||||
- `export-md` renders normalized Markdown files from SQLite
|
||||
- `databases` lists crawled Notion databases
|
||||
- `export-db` exports a crawled Notion database to CSV or TSV
|
||||
- `export-db` exports one crawled Notion database, or all databases with `--all --dir`, to CSV or TSV
|
||||
- `search` searches page and comment text through FTS5
|
||||
- `sql` runs read-only SQL against the archive
|
||||
- `publish` exports SQLite tables and Markdown into a git share repo
|
||||
|
||||
1
SPEC.md
1
SPEC.md
@ -163,6 +163,7 @@ those pages through `pages.collection_id`.
|
||||
```text
|
||||
notcrawl export-db --database <database-id> --format csv --output rows.csv
|
||||
notcrawl export-db --database <database-id> --format tsv --output rows.tsv
|
||||
notcrawl export-db --all --dir exports/csv
|
||||
```
|
||||
|
||||
The first columns are stable metadata:
|
||||
|
||||
@ -8,12 +8,14 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
|
||||
"github.com/vincentkoc/notcrawl/internal/config"
|
||||
"github.com/vincentkoc/notcrawl/internal/markdown"
|
||||
"github.com/vincentkoc/notcrawl/internal/notionapi"
|
||||
"github.com/vincentkoc/notcrawl/internal/notiondesktop"
|
||||
"github.com/vincentkoc/notcrawl/internal/notiontext"
|
||||
"github.com/vincentkoc/notcrawl/internal/report"
|
||||
"github.com/vincentkoc/notcrawl/internal/share"
|
||||
"github.com/vincentkoc/notcrawl/internal/store"
|
||||
@ -276,11 +278,25 @@ func runDatabases(ctx context.Context, stdout io.Writer, cfg config.Config) erro
|
||||
func runExportDatabase(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
|
||||
fs := flag.NewFlagSet("export-db", flag.ContinueOnError)
|
||||
databaseID := fs.String("database", "", "database id to export")
|
||||
all := fs.Bool("all", false, "export every crawled database")
|
||||
dir := fs.String("dir", "", "directory for --all exports")
|
||||
format := fs.String("format", "csv", "output format: csv or tsv")
|
||||
output := fs.String("output", "", "output file path, defaults to stdout")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if *all {
|
||||
if *databaseID != "" {
|
||||
return fmt.Errorf("export-db cannot combine --all and --database")
|
||||
}
|
||||
if *output != "" {
|
||||
return fmt.Errorf("export-db cannot combine --all and --output")
|
||||
}
|
||||
if *dir == "" {
|
||||
return fmt.Errorf("export-db --all requires --dir")
|
||||
}
|
||||
return runExportAllDatabases(ctx, stdout, cfg, tableexport.Format(*format), *dir)
|
||||
}
|
||||
if *databaseID == "" {
|
||||
return fmt.Errorf("export-db requires --database")
|
||||
}
|
||||
@ -313,6 +329,89 @@ func runExportDatabase(ctx context.Context, stdout io.Writer, cfg config.Config,
|
||||
return nil
|
||||
}
|
||||
|
||||
func runExportAllDatabases(ctx context.Context, stdout io.Writer, cfg config.Config, format tableexport.Format, dir string) error {
|
||||
ext, err := exportExtension(format)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
dir, err = config.ExpandPath(dir)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(dir, 0o755); err != nil {
|
||||
return err
|
||||
}
|
||||
st, err := store.Open(cfg.DBPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer st.Close()
|
||||
collections, err := st.Collections(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
index, err := os.Create(filepath.Join(dir, "index.tsv"))
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintln(index, "id\tname\tsource\trows\tcolumns\tfile")
|
||||
exporter := tableexport.Exporter{Store: st}
|
||||
used := map[string]bool{}
|
||||
var databases, rows int
|
||||
for _, collection := range collections {
|
||||
name := exportDatabaseFilename(collection, ext, used)
|
||||
path := filepath.Join(dir, name)
|
||||
file, err := os.Create(path)
|
||||
if err != nil {
|
||||
_ = index.Close()
|
||||
return err
|
||||
}
|
||||
s, exportErr := exporter.Export(ctx, collection.ID, format, file)
|
||||
closeErr := file.Close()
|
||||
if exportErr != nil {
|
||||
_ = index.Close()
|
||||
return exportErr
|
||||
}
|
||||
if closeErr != nil {
|
||||
_ = index.Close()
|
||||
return closeErr
|
||||
}
|
||||
databases++
|
||||
rows += s.Rows
|
||||
fmt.Fprintf(index, "%s\t%s\t%s\t%d\t%d\t%s\n", collection.ID, collection.Name, collection.Source, s.Rows, s.Columns, name)
|
||||
}
|
||||
if err := index.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintf(stdout, "exported %d databases and %d rows to %s\n", databases, rows, dir)
|
||||
return nil
|
||||
}
|
||||
|
||||
func exportExtension(format tableexport.Format) (string, error) {
|
||||
switch format {
|
||||
case "", tableexport.FormatCSV:
|
||||
return "csv", nil
|
||||
case tableexport.FormatTSV:
|
||||
return "tsv", nil
|
||||
default:
|
||||
return "", fmt.Errorf("unsupported format %q", format)
|
||||
}
|
||||
}
|
||||
|
||||
func exportDatabaseFilename(collection store.Collection, ext string, used map[string]bool) string {
|
||||
baseName := collection.Name
|
||||
if strings.TrimSpace(baseName) == "" {
|
||||
baseName = collection.ID
|
||||
}
|
||||
base := notiontext.Slug(baseName) + "-" + notiontext.ShortID(collection.ID)
|
||||
name := base + "." + ext
|
||||
for i := 2; used[name]; i++ {
|
||||
name = fmt.Sprintf("%s-%d.%s", base, i, ext)
|
||||
}
|
||||
used[name] = true
|
||||
return name
|
||||
}
|
||||
|
||||
func runSearch(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
|
||||
if len(args) == 0 {
|
||||
return fmt.Errorf("search query required")
|
||||
@ -489,6 +588,7 @@ Commands:
|
||||
export-md Render normalized Markdown from SQLite
|
||||
databases List crawled Notion databases
|
||||
export-db --database ID Export a database as CSV or TSV
|
||||
export-db --all --dir DIR Export every database as CSV or TSV
|
||||
search QUERY Search page text
|
||||
sql QUERY Run read-only SQL
|
||||
publish [--push] Export data and Markdown into a git share repo
|
||||
|
||||
@ -1,6 +1,15 @@
|
||||
package main
|
||||
|
||||
import "testing"
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/vincentkoc/notcrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestSearchFieldCollapsesRecordSeparators(t *testing.T) {
|
||||
got := searchField("line one\nline\ttwo line three")
|
||||
@ -8,3 +17,55 @@ func TestSearchFieldCollapsesRecordSeparators(t *testing.T) {
|
||||
t.Fatalf("unexpected field: %q", got)
|
||||
}
|
||||
}
|
||||
|
||||
func TestExportDatabaseAllWritesFilesAndIndex(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "notcrawl.db")
|
||||
st, err := store.Open(dbPath)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
now := store.NowMS()
|
||||
for _, collection := range []store.Collection{
|
||||
{ID: "db1", Name: "Roadmap", Source: "test", SyncedAt: now, SchemaJSON: `{"Name":{"type":"title"}}`},
|
||||
{ID: "db2", Name: "Launch Plan", Source: "test", SyncedAt: now, SchemaJSON: `{"Task":{"type":"title"}}`},
|
||||
} {
|
||||
if err := st.UpsertCollection(ctx, collection); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
}
|
||||
if err := st.UpsertPage(ctx, store.Page{
|
||||
ID: "page1", CollectionID: "db1", Title: "Ship", URL: "https://example.com/ship", Alive: true, Source: "test", SyncedAt: now,
|
||||
PropertiesJSON: `{"Name":{"type":"title","title":[{"plain_text":"Ship"}]}}`,
|
||||
}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := st.Close(); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
outDir := filepath.Join(dir, "csv")
|
||||
var stdout, stderr bytes.Buffer
|
||||
err = run(ctx, []string{"--config", filepath.Join(dir, "missing.toml"), "--db", dbPath, "export-db", "--all", "--dir", outDir}, &stdout, &stderr)
|
||||
if err != nil {
|
||||
t.Fatalf("export-db --all failed: %v\nstderr:\n%s", err, stderr.String())
|
||||
}
|
||||
if got := stdout.String(); !strings.Contains(got, "exported 2 databases and 1 rows") {
|
||||
t.Fatalf("unexpected stdout: %s", got)
|
||||
}
|
||||
for _, name := range []string{"roadmap-db1.csv", "launch-plan-db2.csv", "index.tsv"} {
|
||||
if _, err := os.Stat(filepath.Join(outDir, name)); err != nil {
|
||||
t.Fatalf("missing %s: %v", name, err)
|
||||
}
|
||||
}
|
||||
index, err := os.ReadFile(filepath.Join(outDir, "index.tsv"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
for _, want := range []string{"id\tname\tsource\trows\tcolumns\tfile", "db1\tRoadmap\ttest\t1\t4\troadmap-db1.csv"} {
|
||||
if !strings.Contains(string(index), want) {
|
||||
t.Fatalf("index missing %q:\n%s", want, index)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user