feat: add Notion database export
This commit is contained in:
parent
b1369ef3f5
commit
0a1d9b1992
@ -20,7 +20,9 @@ to without holding Notion credentials.
|
||||
- local SQLite storage with FTS5
|
||||
- read-only local desktop cache ingestion from macOS Notion
|
||||
- official API page/block/user/comment ingestion
|
||||
- Notion database metadata and row ingestion through the official API
|
||||
- normalized Markdown export organized by space and page path
|
||||
- CSV/TSV export for crawled Notion database rows
|
||||
- compressed JSONL git-share snapshots plus import/update workflows
|
||||
- read-only SQL access for ad hoc inspection
|
||||
|
||||
@ -51,6 +53,8 @@ Or use the official Notion API:
|
||||
```bash
|
||||
export NOTION_TOKEN="secret_..."
|
||||
notcrawl sync --source api
|
||||
notcrawl databases
|
||||
notcrawl export-db --database DATABASE_ID --format csv --output roadmap.csv
|
||||
```
|
||||
|
||||
Default paths:
|
||||
@ -67,6 +71,8 @@ Default paths:
|
||||
- `doctor` checks config, SQLite, desktop cache, and token presence
|
||||
- `sync` ingests from `desktop`, `api`, or `all`
|
||||
- `export-md` renders normalized Markdown files from SQLite
|
||||
- `databases` lists crawled Notion databases
|
||||
- `export-db` exports a crawled Notion database to CSV or TSV
|
||||
- `search` searches page and comment text through FTS5
|
||||
- `sql` runs read-only SQL against the archive
|
||||
- `publish` exports SQLite tables and Markdown into a git share repo
|
||||
|
||||
24
SPEC.md
24
SPEC.md
@ -23,11 +23,13 @@ V1 scope:
|
||||
- official Notion API sync
|
||||
- pages and blocks
|
||||
- databases/data sources as collections
|
||||
- database rows as pages linked to their collection
|
||||
- comments and discussions where available
|
||||
- users and spaces/workspaces
|
||||
- FTS5 search over rendered page/comment text
|
||||
- raw SQL access
|
||||
- Markdown export
|
||||
- CSV/TSV export for database rows
|
||||
- git-backed archive publishing and subscription
|
||||
|
||||
Out of scope for V1:
|
||||
@ -130,3 +132,25 @@ pages/**/*.md
|
||||
SQLite without requiring Notion credentials.
|
||||
|
||||
`update` pulls the latest snapshot and imports it.
|
||||
|
||||
## Database Export
|
||||
|
||||
API sync discovers databases visible to the integration, stores database
|
||||
metadata in `collections`, queries each database for row pages, and links those
|
||||
pages through `pages.collection_id`.
|
||||
|
||||
`export-db` renders row properties into delimited text:
|
||||
|
||||
```text
|
||||
notcrawl export-db --database <database-id> --format csv --output rows.csv
|
||||
notcrawl export-db --database <database-id> --format tsv --output rows.tsv
|
||||
```
|
||||
|
||||
The first columns are stable metadata:
|
||||
|
||||
- `page_id`
|
||||
- `page_title`
|
||||
- `url`
|
||||
|
||||
Remaining columns come from the database schema: the title property first, then
|
||||
the other schema properties alphabetically. Row properties missing from the schema are appended last, also alphabetically.
|
||||
|
||||
@ -16,6 +16,7 @@ import (
|
||||
"github.com/vincentkoc/notcrawl/internal/notiondesktop"
|
||||
"github.com/vincentkoc/notcrawl/internal/share"
|
||||
"github.com/vincentkoc/notcrawl/internal/store"
|
||||
"github.com/vincentkoc/notcrawl/internal/tableexport"
|
||||
)
|
||||
|
||||
func main() {
|
||||
@ -69,6 +70,10 @@ func run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
|
||||
return runSync(ctx, stdout, cfg, cmdArgs)
|
||||
case "export-md":
|
||||
return runExportMarkdown(ctx, stdout, cfg)
|
||||
case "databases":
|
||||
return runDatabases(ctx, stdout, cfg)
|
||||
case "export-db":
|
||||
return runExportDatabase(ctx, stdout, cfg, cmdArgs)
|
||||
case "search":
|
||||
return runSearch(ctx, stdout, cfg, cmdArgs)
|
||||
case "sql":
|
||||
@ -140,7 +145,7 @@ func runSync(ctx context.Context, stdout io.Writer, cfg config.Config, args []st
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintf(stdout, "api: users=%d pages=%d blocks=%d comments=%d\n", s.Users, s.Pages, s.Blocks, s.Comments)
|
||||
fmt.Fprintf(stdout, "api: users=%d pages=%d databases=%d database_rows=%d blocks=%d comments=%d\n", s.Users, s.Pages, s.Databases, s.DatabaseRows, s.Blocks, s.Comments)
|
||||
case "all":
|
||||
if cfg.Notion.Desktop.Enabled {
|
||||
s, err := notiondesktop.Ingest(ctx, st, cfg.Notion.Desktop.Path, cfg.CacheDir)
|
||||
@ -158,7 +163,7 @@ func runSync(ctx context.Context, stdout io.Writer, cfg config.Config, args []st
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintf(stdout, "api: users=%d pages=%d blocks=%d comments=%d\n", s.Users, s.Pages, s.Blocks, s.Comments)
|
||||
fmt.Fprintf(stdout, "api: users=%d pages=%d databases=%d database_rows=%d blocks=%d comments=%d\n", s.Users, s.Pages, s.Databases, s.DatabaseRows, s.Blocks, s.Comments)
|
||||
}
|
||||
default:
|
||||
return fmt.Errorf("unknown source %q", *source)
|
||||
@ -180,6 +185,63 @@ func runExportMarkdown(ctx context.Context, stdout io.Writer, cfg config.Config)
|
||||
return nil
|
||||
}
|
||||
|
||||
func runDatabases(ctx context.Context, stdout io.Writer, cfg config.Config) error {
|
||||
st, err := store.Open(cfg.DBPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer st.Close()
|
||||
collections, err := st.Collections(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintln(stdout, "id\tname\tsource")
|
||||
for _, collection := range collections {
|
||||
fmt.Fprintf(stdout, "%s\t%s\t%s\n", collection.ID, collection.Name, collection.Source)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func runExportDatabase(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
|
||||
fs := flag.NewFlagSet("export-db", flag.ContinueOnError)
|
||||
databaseID := fs.String("database", "", "database id to export")
|
||||
format := fs.String("format", "csv", "output format: csv or tsv")
|
||||
output := fs.String("output", "", "output file path, defaults to stdout")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
if *databaseID == "" {
|
||||
return fmt.Errorf("export-db requires --database")
|
||||
}
|
||||
st, err := store.Open(cfg.DBPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer st.Close()
|
||||
var out io.Writer = stdout
|
||||
var file *os.File
|
||||
if *output != "" {
|
||||
outputPath, err := config.ExpandPath(*output)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
file, err = os.Create(outputPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer file.Close()
|
||||
out = file
|
||||
}
|
||||
s, err := tableexport.Exporter{Store: st}.Export(ctx, *databaseID, tableexport.Format(*format), out)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if *output != "" {
|
||||
fmt.Fprintf(stdout, "exported %d rows and %d columns from %s to %s\n", s.Rows, s.Columns, s.Database, file.Name())
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func runSearch(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
|
||||
if len(args) == 0 {
|
||||
return fmt.Errorf("search query required")
|
||||
@ -347,6 +409,8 @@ Commands:
|
||||
sync --source api Ingest through the official Notion API
|
||||
sync --source all Run enabled sources
|
||||
export-md Render normalized Markdown from SQLite
|
||||
databases List crawled Notion databases
|
||||
export-db --database ID Export a database as CSV or TSV
|
||||
search QUERY Search page text
|
||||
sql QUERY Run read-only SQL
|
||||
publish [--push] Export data and Markdown into a git share repo
|
||||
|
||||
@ -25,10 +25,12 @@ type Client struct {
|
||||
}
|
||||
|
||||
type Summary struct {
|
||||
Users int
|
||||
Pages int
|
||||
Blocks int
|
||||
Comments int
|
||||
Users int
|
||||
Pages int
|
||||
Blocks int
|
||||
Comments int
|
||||
Databases int
|
||||
DatabaseRows int
|
||||
}
|
||||
|
||||
func (c Client) Sync(ctx context.Context, st *store.Store) (Summary, error) {
|
||||
@ -63,7 +65,7 @@ func (c Client) Sync(ctx context.Context, st *store.Store) (Summary, error) {
|
||||
return s, err
|
||||
}
|
||||
for _, page := range pages {
|
||||
count, comments, err := c.ingestPage(ctx, st, page)
|
||||
count, comments, err := c.ingestPage(ctx, st, page, ingestPageOptions{FetchBlocks: true, FetchComments: true})
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
@ -71,6 +73,18 @@ func (c Client) Sync(ctx context.Context, st *store.Store) (Summary, error) {
|
||||
s.Blocks += count
|
||||
s.Comments += comments
|
||||
}
|
||||
databases, err := c.searchDatabases(ctx)
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
for _, database := range databases {
|
||||
rows, err := c.ingestDatabase(ctx, st, database)
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
s.Databases++
|
||||
s.DatabaseRows += rows
|
||||
}
|
||||
if err := st.SetSyncState(ctx, SourceName, "workspace", "default", time.Now().Format(time.RFC3339)); err != nil {
|
||||
return s, err
|
||||
}
|
||||
@ -128,10 +142,18 @@ func (c Client) listUsers(ctx context.Context) ([]obj, error) {
|
||||
}
|
||||
|
||||
func (c Client) searchPages(ctx context.Context) ([]obj, error) {
|
||||
return c.searchObjects(ctx, "page")
|
||||
}
|
||||
|
||||
func (c Client) searchDatabases(ctx context.Context) ([]obj, error) {
|
||||
return c.searchObjects(ctx, "database")
|
||||
}
|
||||
|
||||
func (c Client) searchObjects(ctx context.Context, objectType string) ([]obj, error) {
|
||||
var out []obj
|
||||
cursor := ""
|
||||
for {
|
||||
body := obj{"page_size": 100, "filter": obj{"property": "object", "value": "page"}}
|
||||
body := obj{"page_size": 100, "filter": obj{"property": "object", "value": objectType}}
|
||||
if cursor != "" {
|
||||
body["start_cursor"] = cursor
|
||||
}
|
||||
@ -154,7 +176,13 @@ func (c Client) searchPages(ctx context.Context) ([]obj, error) {
|
||||
}
|
||||
}
|
||||
|
||||
func (c Client) ingestPage(ctx context.Context, st *store.Store, page obj) (blockCount int, commentCount int, err error) {
|
||||
// ingestPageOptions controls how ingestPage stores a single page.
type ingestPageOptions struct {
	// CollectionID, when non-empty, is stored as the page's collection id.
	CollectionID string
	// FetchBlocks and FetchComments enable the block and comment fetches
	// for the page; the zero value skips both.
	FetchBlocks, FetchComments bool
}
|
||||
|
||||
func (c Client) ingestPage(ctx context.Context, st *store.Store, page obj, opts ingestPageOptions) (blockCount int, commentCount int, err error) {
|
||||
raw := notiontext.MarshalRaw(page)
|
||||
props := marshalAny(page["properties"])
|
||||
parent := page.mapObj("parent")
|
||||
@ -162,12 +190,20 @@ func (c Client) ingestPage(ctx context.Context, st *store.Store, page obj) (bloc
|
||||
if parentID == "" {
|
||||
parentID = parent.string("database_id")
|
||||
}
|
||||
if parentID == "" {
|
||||
parentID = parent.string("data_source_id")
|
||||
}
|
||||
collectionID := opts.CollectionID
|
||||
if collectionID == "" && (parent.string("type") == "database_id" || parent.string("type") == "data_source_id") {
|
||||
collectionID = parentID
|
||||
}
|
||||
spaceID := parent.string("workspace")
|
||||
p := store.Page{
|
||||
ID: page.string("id"),
|
||||
SpaceID: spaceID,
|
||||
ParentID: parentID,
|
||||
ParentTable: parent.string("type"),
|
||||
CollectionID: collectionID,
|
||||
Title: titleFromAPIPage(page),
|
||||
URL: page.string("url"),
|
||||
PropertiesJSON: props,
|
||||
@ -184,17 +220,85 @@ func (c Client) ingestPage(ctx context.Context, st *store.Store, page obj) (bloc
|
||||
if err := st.UpsertPage(ctx, p); err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
blocks, err := c.walkBlocks(ctx, st, p.ID, p.ID, p.SpaceID)
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
var blocks, comments int
|
||||
if opts.FetchBlocks {
|
||||
blocks, err = c.walkBlocks(ctx, st, p.ID, p.ID, p.SpaceID)
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
}
|
||||
comments, err := c.ingestComments(ctx, st, p.ID, p.SpaceID)
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
if opts.FetchComments {
|
||||
comments, err = c.ingestComments(ctx, st, p.ID, p.SpaceID)
|
||||
if err != nil {
|
||||
return 0, 0, err
|
||||
}
|
||||
}
|
||||
return blocks, comments, nil
|
||||
}
|
||||
|
||||
func (c Client) ingestDatabase(ctx context.Context, st *store.Store, database obj) (int, error) {
|
||||
id := database.string("id")
|
||||
raw := notiontext.MarshalRaw(database)
|
||||
parent := database.mapObj("parent")
|
||||
name := notiontext.Plain(database["title"])
|
||||
if name == "" {
|
||||
name = id
|
||||
}
|
||||
if err := st.UpsertCollection(ctx, store.Collection{
|
||||
ID: id,
|
||||
SpaceID: parent.string("workspace"),
|
||||
ParentID: firstNonEmpty(parent.string("page_id"), parent.string("block_id"), parent.string("workspace")),
|
||||
Name: name,
|
||||
SchemaJSON: marshalAny(database["properties"]),
|
||||
FormatJSON: marshalAny(database),
|
||||
RawJSON: raw,
|
||||
Source: SourceName,
|
||||
SyncedAt: store.NowMS(),
|
||||
}); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
if err := st.UpsertRawRecord(ctx, store.RawRecord{
|
||||
Source: SourceName, RecordTable: "database", RecordID: id, ParentID: parent.string("page_id"),
|
||||
SpaceID: parent.string("workspace"), RawJSON: raw, SyncedAt: store.NowMS(),
|
||||
}); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return c.queryDatabase(ctx, st, id)
|
||||
}
|
||||
|
||||
func (c Client) queryDatabase(ctx context.Context, st *store.Store, databaseID string) (int, error) {
|
||||
var count int
|
||||
cursor := ""
|
||||
for {
|
||||
body := obj{"page_size": 100}
|
||||
if cursor != "" {
|
||||
body["start_cursor"] = cursor
|
||||
}
|
||||
var resp obj
|
||||
path := fmt.Sprintf("/databases/%s/query", url.PathEscape(databaseID))
|
||||
if err := c.do(ctx, http.MethodPost, path, body, &resp); err != nil {
|
||||
return count, err
|
||||
}
|
||||
for _, item := range asSlice(resp["results"]) {
|
||||
m, ok := item.(map[string]any)
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if _, _, err := c.ingestPage(ctx, st, obj(m), ingestPageOptions{CollectionID: databaseID}); err != nil {
|
||||
return count, err
|
||||
}
|
||||
count++
|
||||
}
|
||||
if !truthy(resp["has_more"]) {
|
||||
return count, nil
|
||||
}
|
||||
cursor, _ = resp["next_cursor"].(string)
|
||||
if cursor == "" {
|
||||
return count, nil
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func (c Client) walkBlocks(ctx context.Context, st *store.Store, pageID, parentID, spaceID string) (int, error) {
|
||||
var count int
|
||||
cursor := ""
|
||||
@ -405,3 +509,12 @@ func asSlice(v any) []any {
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
// firstNonEmpty returns the first argument that is not the empty string, or
// "" when every argument is empty or no arguments are given.
func firstNonEmpty(values ...string) string {
	for i := 0; i < len(values); i++ {
		if candidate := values[i]; len(candidate) > 0 {
			return candidate
		}
	}
	return ""
}
|
||||
|
||||
99
internal/notionapi/api_test.go
Normal file
99
internal/notionapi/api_test.go
Normal file
@ -0,0 +1,99 @@
|
||||
package notionapi
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"github.com/vincentkoc/notcrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestSyncIngestsDatabasesAndRows(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
switch r.URL.Path {
|
||||
case "/users":
|
||||
_, _ = w.Write([]byte(`{"object":"list","results":[],"has_more":false}`))
|
||||
case "/search":
|
||||
var body map[string]any
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
filter := body["filter"].(map[string]any)
|
||||
switch filter["value"] {
|
||||
case "page":
|
||||
_, _ = w.Write([]byte(`{"object":"list","results":[],"has_more":false}`))
|
||||
case "database":
|
||||
_, _ = w.Write([]byte(`{
|
||||
"object":"list",
|
||||
"results":[{
|
||||
"object":"database",
|
||||
"id":"db1",
|
||||
"title":[{"plain_text":"Roadmap"}],
|
||||
"parent":{"type":"workspace","workspace":true},
|
||||
"properties":{
|
||||
"Name":{"id":"title","type":"title","title":{}},
|
||||
"Status":{"id":"status","type":"select","select":{}}
|
||||
}
|
||||
}],
|
||||
"has_more":false
|
||||
}`))
|
||||
default:
|
||||
t.Fatalf("unexpected search filter: %v", filter["value"])
|
||||
}
|
||||
case "/databases/db1/query":
|
||||
_, _ = w.Write([]byte(`{
|
||||
"object":"list",
|
||||
"results":[{
|
||||
"object":"page",
|
||||
"id":"page1",
|
||||
"created_time":"2026-01-01T00:00:00Z",
|
||||
"last_edited_time":"2026-01-02T00:00:00Z",
|
||||
"archived":false,
|
||||
"in_trash":false,
|
||||
"url":"https://notion.so/page1",
|
||||
"parent":{"type":"database_id","database_id":"db1"},
|
||||
"properties":{
|
||||
"Name":{"id":"title","type":"title","title":[{"plain_text":"Ship"}]},
|
||||
"Status":{"id":"status","type":"select","select":{"name":"Done"}}
|
||||
}
|
||||
}],
|
||||
"has_more":false
|
||||
}`))
|
||||
default:
|
||||
t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
st, err := store.Open(filepath.Join(t.TempDir(), "notcrawl.db"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer st.Close()
|
||||
|
||||
summary, err := (Client{BaseURL: server.URL, Version: "2022-06-28", Token: "secret"}).Sync(context.Background(), st)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if summary.Databases != 1 || summary.DatabaseRows != 1 {
|
||||
t.Fatalf("unexpected summary: %+v", summary)
|
||||
}
|
||||
collections, err := st.Collections(context.Background())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(collections) != 1 || collections[0].ID != "db1" || collections[0].Name != "Roadmap" {
|
||||
t.Fatalf("unexpected collections: %+v", collections)
|
||||
}
|
||||
rows, err := st.CollectionPages(context.Background(), "db1")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(rows) != 1 || rows[0].ID != "page1" || rows[0].CollectionID != "db1" {
|
||||
t.Fatalf("unexpected rows: %+v", rows)
|
||||
}
|
||||
}
|
||||
@ -27,6 +27,53 @@ func (s *Store) Pages(ctx context.Context) ([]Page, error) {
|
||||
return pages, rows.Err()
|
||||
}
|
||||
|
||||
func (s *Store) Collections(ctx context.Context) ([]Collection, error) {
|
||||
rows, err := s.db.QueryContext(ctx, `select id, space_id, parent_id, name, schema_json, format_json, raw_json, source, synced_at
|
||||
from collections order by lower(coalesce(name, id)), id`)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var collections []Collection
|
||||
for rows.Next() {
|
||||
var c Collection
|
||||
if err := rows.Scan(&c.ID, &c.SpaceID, &c.ParentID, &c.Name, &c.SchemaJSON, &c.FormatJSON, &c.RawJSON, &c.Source, &c.SyncedAt); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
collections = append(collections, c)
|
||||
}
|
||||
return collections, rows.Err()
|
||||
}
|
||||
|
||||
func (s *Store) Collection(ctx context.Context, id string) (Collection, error) {
|
||||
var c Collection
|
||||
err := s.db.QueryRowContext(ctx, `select id, space_id, parent_id, name, schema_json, format_json, raw_json, source, synced_at
|
||||
from collections where id = ?`, id).Scan(&c.ID, &c.SpaceID, &c.ParentID, &c.Name, &c.SchemaJSON, &c.FormatJSON, &c.RawJSON, &c.Source, &c.SyncedAt)
|
||||
return c, err
|
||||
}
|
||||
|
||||
func (s *Store) CollectionPages(ctx context.Context, collectionID string) ([]Page, error) {
|
||||
rows, err := s.db.QueryContext(ctx, `select id, space_id, parent_id, parent_table, collection_id, title, url, icon, cover,
|
||||
properties_json, created_time, last_edited_time, alive, source, raw_json, synced_at
|
||||
from pages where collection_id = ? and alive = 1 order by coalesce(last_edited_time, 0) desc, title`, collectionID)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer rows.Close()
|
||||
var pages []Page
|
||||
for rows.Next() {
|
||||
var p Page
|
||||
var alive int
|
||||
if err := rows.Scan(&p.ID, &p.SpaceID, &p.ParentID, &p.ParentTable, &p.CollectionID, &p.Title, &p.URL, &p.Icon, &p.Cover,
|
||||
&p.PropertiesJSON, &p.CreatedTime, &p.LastEditedTime, &alive, &p.Source, &p.RawJSON, &p.SyncedAt); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
p.Alive = IntBool(alive)
|
||||
pages = append(pages, p)
|
||||
}
|
||||
return pages, rows.Err()
|
||||
}
|
||||
|
||||
func (s *Store) PageBlocks(ctx context.Context, pageID string) ([]Block, error) {
|
||||
rows, err := s.db.QueryContext(ctx, `select id, page_id, space_id, parent_id, parent_table, type, text, properties_json,
|
||||
content_json, format_json, created_time, last_edited_time, alive, source, raw_json, synced_at
|
||||
|
||||
305
internal/tableexport/export.go
Normal file
305
internal/tableexport/export.go
Normal file
@ -0,0 +1,305 @@
|
||||
package tableexport
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/csv"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"sort"
|
||||
"strconv"
|
||||
"strings"
|
||||
|
||||
"github.com/vincentkoc/notcrawl/internal/notiontext"
|
||||
"github.com/vincentkoc/notcrawl/internal/store"
|
||||
)
|
||||
|
||||
// Format names a delimited-text output format understood by Exporter.Export.
type Format string

// Output formats accepted by Exporter.Export.
const (
	// FormatCSV selects comma-separated output.
	FormatCSV = Format("csv")
	// FormatTSV selects tab-separated output.
	FormatTSV = Format("tsv")
)
|
||||
|
||||
type Exporter struct {
|
||||
Store *store.Store
|
||||
}
|
||||
|
||||
// Summary reports what a successful Export wrote.
type Summary struct {
	// Database is the id of the exported collection.
	Database string
	// Rows is the number of data rows written (the header is not counted);
	// Columns is the number of columns in each row.
	Rows, Columns int
}
|
||||
|
||||
func (e Exporter) Export(ctx context.Context, databaseID string, format Format, w io.Writer) (Summary, error) {
|
||||
if e.Store == nil {
|
||||
return Summary{}, fmt.Errorf("missing store")
|
||||
}
|
||||
if databaseID == "" {
|
||||
return Summary{}, fmt.Errorf("database id is required")
|
||||
}
|
||||
collection, err := e.Store.Collection(ctx, databaseID)
|
||||
if err != nil {
|
||||
return Summary{}, err
|
||||
}
|
||||
pages, err := e.Store.CollectionPages(ctx, databaseID)
|
||||
if err != nil {
|
||||
return Summary{}, err
|
||||
}
|
||||
columns := columnsFor(collection, pages)
|
||||
writer := csv.NewWriter(w)
|
||||
if format == FormatTSV {
|
||||
writer.Comma = '\t'
|
||||
} else if format != "" && format != FormatCSV {
|
||||
return Summary{}, fmt.Errorf("unsupported format %q", format)
|
||||
}
|
||||
if err := writer.Write(columns); err != nil {
|
||||
return Summary{}, err
|
||||
}
|
||||
for _, page := range pages {
|
||||
props := decodeMap(page.PropertiesJSON)
|
||||
row := make([]string, 0, len(columns))
|
||||
for _, col := range columns {
|
||||
switch col {
|
||||
case "page_id":
|
||||
row = append(row, page.ID)
|
||||
case "page_title":
|
||||
row = append(row, page.Title)
|
||||
case "url":
|
||||
row = append(row, page.URL)
|
||||
default:
|
||||
row = append(row, propertyValueText(props[col]))
|
||||
}
|
||||
}
|
||||
if err := writer.Write(row); err != nil {
|
||||
return Summary{}, err
|
||||
}
|
||||
}
|
||||
writer.Flush()
|
||||
if err := writer.Error(); err != nil {
|
||||
return Summary{}, err
|
||||
}
|
||||
return Summary{Database: collection.ID, Rows: len(pages), Columns: len(columns)}, nil
|
||||
}
|
||||
|
||||
func columnsFor(collection store.Collection, pages []store.Page) []string {
|
||||
seen := map[string]bool{"page_id": true, "page_title": true, "url": true}
|
||||
cols := []string{"page_id", "page_title", "url"}
|
||||
for _, name := range schemaPropertyNames(collection.SchemaJSON) {
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
cols = append(cols, name)
|
||||
}
|
||||
}
|
||||
var extras []string
|
||||
for _, page := range pages {
|
||||
for name := range decodeMap(page.PropertiesJSON) {
|
||||
if !seen[name] {
|
||||
seen[name] = true
|
||||
extras = append(extras, name)
|
||||
}
|
||||
}
|
||||
}
|
||||
sort.Strings(extras)
|
||||
return append(cols, extras...)
|
||||
}
|
||||
|
||||
func schemaPropertyNames(raw string) []string {
|
||||
props := decodeMap(raw)
|
||||
var title []string
|
||||
var rest []string
|
||||
for name, value := range props {
|
||||
m, ok := value.(map[string]any)
|
||||
if ok && m["type"] == "title" {
|
||||
title = append(title, name)
|
||||
continue
|
||||
}
|
||||
rest = append(rest, name)
|
||||
}
|
||||
sort.Strings(title)
|
||||
sort.Strings(rest)
|
||||
return append(title, rest...)
|
||||
}
|
||||
|
||||
// decodeMap decodes a JSON object into a map on a best-effort basis. Blank
// input, invalid JSON, non-object JSON and JSON null all yield an empty map;
// the result is never nil, so callers may range over it, index it, or write to
// it without further checks.
func decodeMap(raw string) map[string]any {
	if strings.TrimSpace(raw) == "" {
		return map[string]any{}
	}
	var out map[string]any
	// Decoding failures are deliberately not reported: this input is stored
	// row data, and a bad cell should not abort the whole export. JSON null
	// decodes successfully but leaves the map nil, hence the extra check.
	if err := json.Unmarshal([]byte(raw), &out); err != nil || out == nil {
		return map[string]any{}
	}
	return out
}
|
||||
|
||||
func propertyValueText(v any) string {
|
||||
m, ok := v.(map[string]any)
|
||||
if !ok {
|
||||
return notiontext.Plain(v)
|
||||
}
|
||||
typ, _ := m["type"].(string)
|
||||
if typ == "" {
|
||||
return notiontext.Plain(v)
|
||||
}
|
||||
switch typ {
|
||||
case "title", "rich_text":
|
||||
return notiontext.Plain(m[typ])
|
||||
case "number":
|
||||
return numberText(m["number"])
|
||||
case "select", "status":
|
||||
return namedObject(m[typ])
|
||||
case "multi_select":
|
||||
return joinNamed(m[typ])
|
||||
case "date":
|
||||
return dateText(m["date"])
|
||||
case "checkbox":
|
||||
if b, ok := m["checkbox"].(bool); ok {
|
||||
return strconv.FormatBool(b)
|
||||
}
|
||||
case "url", "email", "phone_number", "created_time", "last_edited_time":
|
||||
if s, ok := m[typ].(string); ok {
|
||||
return s
|
||||
}
|
||||
case "people", "files":
|
||||
return joinNamed(m[typ])
|
||||
case "relation":
|
||||
return joinIDs(m[typ])
|
||||
case "formula":
|
||||
return formulaText(m["formula"])
|
||||
case "rollup":
|
||||
return rollupText(m["rollup"])
|
||||
case "created_by", "last_edited_by":
|
||||
return namedObject(m[typ])
|
||||
case "unique_id":
|
||||
return uniqueIDText(m["unique_id"])
|
||||
}
|
||||
return notiontext.Plain(v)
|
||||
}
|
||||
|
||||
func namedObject(v any) string {
|
||||
m, ok := v.(map[string]any)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
if name, ok := m["name"].(string); ok {
|
||||
return name
|
||||
}
|
||||
if id, ok := m["id"].(string); ok {
|
||||
return id
|
||||
}
|
||||
return notiontext.Plain(v)
|
||||
}
|
||||
|
||||
func joinNamed(v any) string {
|
||||
items, ok := v.([]any)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
parts := make([]string, 0, len(items))
|
||||
for _, item := range items {
|
||||
if text := namedObject(item); text != "" {
|
||||
parts = append(parts, text)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
|
||||
// joinIDs renders a list of objects as their "id" strings joined with ", ".
// Entries that are not objects or have no string "id" are skipped; a value
// that is not a list renders as "".
func joinIDs(v any) string {
	list, isList := v.([]any)
	if !isList {
		return ""
	}
	ids := make([]string, 0, len(list))
	for i := range list {
		if ref, isObject := list[i].(map[string]any); isObject {
			if id, isString := ref["id"].(string); isString {
				ids = append(ids, id)
			}
		}
	}
	return strings.Join(ids, ", ")
}
|
||||
|
||||
// dateText renders a date object as its "start" string, or as "start/end"
// when a non-empty "end" string is present. Values that are not objects render
// as "".
func dateText(v any) string {
	span, isObject := v.(map[string]any)
	if !isObject {
		return ""
	}
	from, _ := span["start"].(string)
	if to, _ := span["end"].(string); to != "" {
		return from + "/" + to
	}
	return from
}
|
||||
|
||||
func formulaText(v any) string {
|
||||
m, ok := v.(map[string]any)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
typ, _ := m["type"].(string)
|
||||
switch typ {
|
||||
case "string":
|
||||
s, _ := m["string"].(string)
|
||||
return s
|
||||
case "number":
|
||||
return numberText(m["number"])
|
||||
case "boolean":
|
||||
if b, ok := m["boolean"].(bool); ok {
|
||||
return strconv.FormatBool(b)
|
||||
}
|
||||
case "date":
|
||||
return dateText(m["date"])
|
||||
}
|
||||
return notiontext.Plain(v)
|
||||
}
|
||||
|
||||
func rollupText(v any) string {
|
||||
m, ok := v.(map[string]any)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
typ, _ := m["type"].(string)
|
||||
switch typ {
|
||||
case "number":
|
||||
return numberText(m["number"])
|
||||
case "date":
|
||||
return dateText(m["date"])
|
||||
case "array":
|
||||
items, _ := m["array"].([]any)
|
||||
parts := make([]string, 0, len(items))
|
||||
for _, item := range items {
|
||||
if text := propertyValueText(item); text != "" {
|
||||
parts = append(parts, text)
|
||||
}
|
||||
}
|
||||
return strings.Join(parts, ", ")
|
||||
}
|
||||
return notiontext.Plain(v)
|
||||
}
|
||||
|
||||
func uniqueIDText(v any) string {
|
||||
m, ok := v.(map[string]any)
|
||||
if !ok {
|
||||
return ""
|
||||
}
|
||||
prefix, _ := m["prefix"].(string)
|
||||
number := numberText(m["number"])
|
||||
return prefix + number
|
||||
}
|
||||
|
||||
// numberText renders a decoded JSON number as text. nil renders as "",
// float64 uses the shortest decimal form without an exponent, int and
// json.Number use their own decimal text, and any other value falls back to
// fmt.Sprint.
func numberText(v any) string {
	if v == nil {
		return ""
	}
	if f, isFloat := v.(float64); isFloat {
		return strconv.FormatFloat(f, 'f', -1, 64)
	}
	if i, isInt := v.(int); isInt {
		return strconv.Itoa(i)
	}
	if n, isNumber := v.(json.Number); isNumber {
		return n.String()
	}
	return fmt.Sprint(v)
}
|
||||
47
internal/tableexport/export_test.go
Normal file
47
internal/tableexport/export_test.go
Normal file
@ -0,0 +1,47 @@
|
||||
package tableexport
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/vincentkoc/notcrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestExportDatabaseTSV(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
st, err := store.Open(filepath.Join(t.TempDir(), "notcrawl.db"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer st.Close()
|
||||
now := store.NowMS()
|
||||
if err := st.UpsertCollection(ctx, store.Collection{
|
||||
ID: "db1", Name: "Roadmap", Source: "test", SyncedAt: now,
|
||||
SchemaJSON: `{"Name":{"type":"title"},"Status":{"type":"select"},"Score":{"type":"number"}}`,
|
||||
}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := st.UpsertPage(ctx, store.Page{
|
||||
ID: "page1", CollectionID: "db1", Title: "Ship", URL: "https://example.com/ship", Alive: true, Source: "test", SyncedAt: now,
|
||||
PropertiesJSON: `{"Name":{"type":"title","title":[{"plain_text":"Ship"}]},"Status":{"type":"select","select":{"name":"Done"}},"Score":{"type":"number","number":7}}`,
|
||||
}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
var out bytes.Buffer
|
||||
s, err := (Exporter{Store: st}).Export(ctx, "db1", FormatTSV, &out)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if s.Rows != 1 {
|
||||
t.Fatalf("expected one row, got %d", s.Rows)
|
||||
}
|
||||
got := out.String()
|
||||
for _, want := range []string{"page_id\tpage_title\turl\tName\tScore\tStatus", "page1\tShip\thttps://example.com/ship\tShip\t7\tDone"} {
|
||||
if !strings.Contains(got, want) {
|
||||
t.Fatalf("missing %q in:\n%s", want, got)
|
||||
}
|
||||
}
|
||||
}
|
||||
Loading…
Reference in New Issue
Block a user