feat: support Notion data sources and database maintenance
This commit is contained in:
parent
0a1d9b1992
commit
796a5a03d4
@ -21,9 +21,11 @@ to without holding Notion credentials.
|
||||
- read-only local desktop cache ingestion from macOS Notion
|
||||
- official API page/block/user/comment ingestion
|
||||
- Notion database metadata and row ingestion through the official API
|
||||
- current Notion data-source API support plus legacy database endpoint support
|
||||
- normalized Markdown export organized by space and page path
|
||||
- CSV/TSV export for crawled Notion database rows
|
||||
- compressed JSONL git-share snapshots plus import/update workflows
|
||||
- archive status and SQLite maintenance commands
|
||||
- read-only SQL access for ad hoc inspection
|
||||
|
||||
## Install
|
||||
@ -43,6 +45,7 @@ Use the local Notion Desktop cache:
|
||||
```bash
|
||||
notcrawl init
|
||||
notcrawl doctor
|
||||
notcrawl status
|
||||
notcrawl sync --source desktop
|
||||
notcrawl export-md
|
||||
notcrawl search "launch plan"
|
||||
@ -69,6 +72,8 @@ Default paths:
|
||||
|
||||
- `init` writes a starter config
|
||||
- `doctor` checks config, SQLite, desktop cache, and token presence
|
||||
- `status` prints archive counts, last sync time, and database/WAL size
|
||||
- `maintain` rebuilds FTS, optimizes SQLite indexes, and can run `VACUUM`
|
||||
- `sync` ingests from `desktop`, `api`, or `all`
|
||||
- `export-md` renders normalized Markdown files from SQLite
|
||||
- `databases` lists crawled Notion databases
|
||||
|
||||
16
SPEC.md
16
SPEC.md
@ -22,12 +22,13 @@ V1 scope:
|
||||
- read-only desktop snapshot ingestion
|
||||
- official Notion API sync
|
||||
- pages and blocks
|
||||
- databases/data sources as collections
|
||||
- databases/data sources as collections, including current data-source API endpoints
|
||||
- database rows as pages linked to their collection
|
||||
- comments and discussions where available
|
||||
- users and spaces/workspaces
|
||||
- FTS5 search over rendered page/comment text
|
||||
- raw SQL access
|
||||
- archive status and SQLite maintenance commands
|
||||
- Markdown export
|
||||
- CSV/TSV export for database rows
|
||||
- git-backed archive publishing and subscription
|
||||
@ -72,10 +73,17 @@ API sync uses `NOTION_TOKEN` by default. It must:
|
||||
5. obey `Retry-After` on rate limits
|
||||
6. store raw JSON plus normalized rows
|
||||
|
||||
New configs should use the current Notion API version. Existing configs pinned
|
||||
to legacy `2022-06-28` must continue using deprecated database query endpoints.
|
||||
|
||||
## SQLite Archive
|
||||
|
||||
SQLite is canonical. Markdown is generated output.
|
||||
|
||||
Store startup must enable WAL, foreign keys, a busy timeout, normal
|
||||
synchronous writes, in-memory temp storage, and the crawler query indexes needed
|
||||
for common page, collection, comment, raw-record, and sync-state lookups.
|
||||
|
||||
Core tables:
|
||||
|
||||
- `spaces`
|
||||
@ -135,9 +143,9 @@ SQLite without requiring Notion credentials.
|
||||
|
||||
## Database Export
|
||||
|
||||
API sync discovers databases visible to the integration, stores database
|
||||
metadata in `collections`, queries each database for row pages, and links those
|
||||
pages through `pages.collection_id`.
|
||||
API sync discovers databases/data sources visible to the integration, stores
|
||||
metadata in `collections`, queries each collection for row pages, and links
|
||||
those pages through `pages.collection_id`.
|
||||
|
||||
`export-db` renders row properties into delimited text:
|
||||
|
||||
|
||||
@ -66,6 +66,10 @@ func run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
|
||||
switch cmd {
|
||||
case "doctor":
|
||||
return runDoctor(ctx, stdout, cfg)
|
||||
case "status":
|
||||
return runStatus(ctx, stdout, cfg)
|
||||
case "maintain":
|
||||
return runMaintain(ctx, stdout, cfg, cmdArgs)
|
||||
case "sync":
|
||||
return runSync(ctx, stdout, cfg, cmdArgs)
|
||||
case "export-md":
|
||||
@ -109,7 +113,12 @@ func runDoctor(ctx context.Context, stdout io.Writer, cfg config.Config) error {
|
||||
"api_token_env": cfg.Notion.API.TokenEnv,
|
||||
"api_token_present": cfg.APIToken() != "",
|
||||
}
|
||||
_ = ctx
|
||||
status, err := st.Status(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
report["status"] = status
|
||||
report["api_version"] = cfg.Notion.API.Version
|
||||
b, err := json.MarshalIndent(report, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
@ -118,6 +127,47 @@ func runDoctor(ctx context.Context, stdout io.Writer, cfg config.Config) error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func runStatus(ctx context.Context, stdout io.Writer, cfg config.Config) error {
|
||||
st, err := store.Open(cfg.DBPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer st.Close()
|
||||
status, err := st.Status(ctx)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
b, err := json.MarshalIndent(status, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintln(stdout, string(b))
|
||||
return nil
|
||||
}
|
||||
|
||||
func runMaintain(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
|
||||
fs := flag.NewFlagSet("maintain", flag.ContinueOnError)
|
||||
vacuum := fs.Bool("vacuum", false, "run VACUUM after rebuilding and optimizing indexes")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return err
|
||||
}
|
||||
st, err := store.Open(cfg.DBPath)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
defer st.Close()
|
||||
summary, err := st.Optimize(ctx, *vacuum)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
b, err := json.MarshalIndent(summary, "", " ")
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
fmt.Fprintln(stdout, string(b))
|
||||
return nil
|
||||
}
|
||||
|
||||
func runSync(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
|
||||
fs := flag.NewFlagSet("sync", flag.ContinueOnError)
|
||||
source := fs.String("source", "all", "source: desktop, api, all")
|
||||
@ -405,6 +455,8 @@ Global flags:
|
||||
Commands:
|
||||
init Write a starter config
|
||||
doctor Check config, database, desktop cache, and token
|
||||
status Show archive counts and database size
|
||||
maintain [--vacuum] Rebuild FTS and optimize SQLite indexes
|
||||
sync --source desktop Ingest Notion Desktop cache
|
||||
sync --source api Ingest through the official Notion API
|
||||
sync --source all Run enabled sources
|
||||
|
||||
@ -10,7 +10,7 @@ path = ""
|
||||
enabled = true
|
||||
token_env = "NOTION_TOKEN"
|
||||
base_url = "https://api.notion.com/v1"
|
||||
version = "2022-06-28"
|
||||
version = "2026-03-11"
|
||||
|
||||
[share]
|
||||
remote = ""
|
||||
|
||||
@ -14,6 +14,7 @@ import (
|
||||
const (
|
||||
defaultDirName = ".notcrawl"
|
||||
defaultDesktopPath = "~/Library/Application Support/Notion/notion.db"
|
||||
defaultAPIVersion = "2026-03-11"
|
||||
)
|
||||
|
||||
type Config struct {
|
||||
@ -60,7 +61,7 @@ func Default() Config {
|
||||
Enabled: true,
|
||||
TokenEnv: "NOTION_TOKEN",
|
||||
BaseURL: "https://api.notion.com/v1",
|
||||
Version: "2022-06-28",
|
||||
Version: defaultAPIVersion,
|
||||
},
|
||||
},
|
||||
Share: ShareConfig{
|
||||
@ -150,7 +151,7 @@ func (c *Config) Resolve() error {
|
||||
c.Notion.API.BaseURL = "https://api.notion.com/v1"
|
||||
}
|
||||
if strings.TrimSpace(c.Notion.API.Version) == "" {
|
||||
c.Notion.API.Version = "2022-06-28"
|
||||
c.Notion.API.Version = defaultAPIVersion
|
||||
}
|
||||
if strings.TrimSpace(c.Share.Branch) == "" {
|
||||
c.Share.Branch = "main"
|
||||
|
||||
10
internal/config/config_test.go
Normal file
10
internal/config/config_test.go
Normal file
@ -0,0 +1,10 @@
|
||||
package config
|
||||
|
||||
import "testing"
|
||||
|
||||
func TestDefaultUsesCurrentNotionAPIVersion(t *testing.T) {
|
||||
cfg := Default()
|
||||
if cfg.Notion.API.Version != "2026-03-11" {
|
||||
t.Fatalf("unexpected API version: %s", cfg.Notion.API.Version)
|
||||
}
|
||||
}
|
||||
@ -41,7 +41,7 @@ func (c Client) Sync(ctx context.Context, st *store.Store) (Summary, error) {
|
||||
c.BaseURL = "https://api.notion.com/v1"
|
||||
}
|
||||
if c.Version == "" {
|
||||
c.Version = "2022-06-28"
|
||||
c.Version = "2026-03-11"
|
||||
}
|
||||
if c.HTTP == nil {
|
||||
c.HTTP = http.DefaultClient
|
||||
@ -73,12 +73,12 @@ func (c Client) Sync(ctx context.Context, st *store.Store) (Summary, error) {
|
||||
s.Blocks += count
|
||||
s.Comments += comments
|
||||
}
|
||||
databases, err := c.searchDatabases(ctx)
|
||||
collections, err := c.searchCollections(ctx)
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
for _, database := range databases {
|
||||
rows, err := c.ingestDatabase(ctx, st, database)
|
||||
for _, collection := range collections {
|
||||
rows, err := c.ingestCollection(ctx, st, collection)
|
||||
if err != nil {
|
||||
return s, err
|
||||
}
|
||||
@ -145,8 +145,8 @@ func (c Client) searchPages(ctx context.Context) ([]obj, error) {
|
||||
return c.searchObjects(ctx, "page")
|
||||
}
|
||||
|
||||
func (c Client) searchDatabases(ctx context.Context) ([]obj, error) {
|
||||
return c.searchObjects(ctx, "database")
|
||||
func (c Client) searchCollections(ctx context.Context) ([]obj, error) {
|
||||
return c.searchObjects(ctx, c.collectionSearchType())
|
||||
}
|
||||
|
||||
func (c Client) searchObjects(ctx context.Context, objectType string) ([]obj, error) {
|
||||
@ -236,21 +236,25 @@ func (c Client) ingestPage(ctx context.Context, st *store.Store, page obj, opts
|
||||
return blocks, comments, nil
|
||||
}
|
||||
|
||||
func (c Client) ingestDatabase(ctx context.Context, st *store.Store, database obj) (int, error) {
|
||||
id := database.string("id")
|
||||
raw := notiontext.MarshalRaw(database)
|
||||
parent := database.mapObj("parent")
|
||||
name := notiontext.Plain(database["title"])
|
||||
func (c Client) ingestCollection(ctx context.Context, st *store.Store, collection obj) (int, error) {
|
||||
id := collection.string("id")
|
||||
raw := notiontext.MarshalRaw(collection)
|
||||
parent := collection.mapObj("parent")
|
||||
if len(parent) == 0 {
|
||||
parent = collection.mapObj("database_parent")
|
||||
}
|
||||
parentID := firstNonEmpty(parent.string("database_id"), parent.string("page_id"), parent.string("block_id"), parent.string("workspace"))
|
||||
name := notiontext.Plain(collection["title"])
|
||||
if name == "" {
|
||||
name = id
|
||||
}
|
||||
if err := st.UpsertCollection(ctx, store.Collection{
|
||||
ID: id,
|
||||
SpaceID: parent.string("workspace"),
|
||||
ParentID: firstNonEmpty(parent.string("page_id"), parent.string("block_id"), parent.string("workspace")),
|
||||
ParentID: parentID,
|
||||
Name: name,
|
||||
SchemaJSON: marshalAny(database["properties"]),
|
||||
FormatJSON: marshalAny(database),
|
||||
SchemaJSON: marshalAny(collection["properties"]),
|
||||
FormatJSON: marshalAny(collection),
|
||||
RawJSON: raw,
|
||||
Source: SourceName,
|
||||
SyncedAt: store.NowMS(),
|
||||
@ -258,15 +262,15 @@ func (c Client) ingestDatabase(ctx context.Context, st *store.Store, database ob
|
||||
return 0, err
|
||||
}
|
||||
if err := st.UpsertRawRecord(ctx, store.RawRecord{
|
||||
Source: SourceName, RecordTable: "database", RecordID: id, ParentID: parent.string("page_id"),
|
||||
Source: SourceName, RecordTable: c.collectionSearchType(), RecordID: id, ParentID: parentID,
|
||||
SpaceID: parent.string("workspace"), RawJSON: raw, SyncedAt: store.NowMS(),
|
||||
}); err != nil {
|
||||
return 0, err
|
||||
}
|
||||
return c.queryDatabase(ctx, st, id)
|
||||
return c.queryCollection(ctx, st, id)
|
||||
}
|
||||
|
||||
func (c Client) queryDatabase(ctx context.Context, st *store.Store, databaseID string) (int, error) {
|
||||
func (c Client) queryCollection(ctx context.Context, st *store.Store, collectionID string) (int, error) {
|
||||
var count int
|
||||
cursor := ""
|
||||
for {
|
||||
@ -275,7 +279,7 @@ func (c Client) queryDatabase(ctx context.Context, st *store.Store, databaseID s
|
||||
body["start_cursor"] = cursor
|
||||
}
|
||||
var resp obj
|
||||
path := fmt.Sprintf("/databases/%s/query", url.PathEscape(databaseID))
|
||||
path := fmt.Sprintf("%s/%s/query", c.collectionQueryBasePath(), url.PathEscape(collectionID))
|
||||
if err := c.do(ctx, http.MethodPost, path, body, &resp); err != nil {
|
||||
return count, err
|
||||
}
|
||||
@ -284,7 +288,15 @@ func (c Client) queryDatabase(ctx context.Context, st *store.Store, databaseID s
|
||||
if !ok {
|
||||
continue
|
||||
}
|
||||
if _, _, err := c.ingestPage(ctx, st, obj(m), ingestPageOptions{CollectionID: databaseID}); err != nil {
|
||||
if itemType := obj(m).string("object"); itemType != "" && itemType != "page" {
|
||||
if itemType == c.collectionSearchType() {
|
||||
if _, err := c.ingestCollection(ctx, st, obj(m)); err != nil {
|
||||
return count, err
|
||||
}
|
||||
}
|
||||
continue
|
||||
}
|
||||
if _, _, err := c.ingestPage(ctx, st, obj(m), ingestPageOptions{CollectionID: collectionID}); err != nil {
|
||||
return count, err
|
||||
}
|
||||
count++
|
||||
@ -299,6 +311,24 @@ func (c Client) queryDatabase(ctx context.Context, st *store.Store, databaseID s
|
||||
}
|
||||
}
|
||||
|
||||
func (c Client) collectionSearchType() string {
|
||||
if c.usesDataSourceAPI() {
|
||||
return "data_source"
|
||||
}
|
||||
return "database"
|
||||
}
|
||||
|
||||
func (c Client) collectionQueryBasePath() string {
|
||||
if c.usesDataSourceAPI() {
|
||||
return "/data_sources"
|
||||
}
|
||||
return "/databases"
|
||||
}
|
||||
|
||||
func (c Client) usesDataSourceAPI() bool {
|
||||
return c.Version >= "2025-09-03"
|
||||
}
|
||||
|
||||
func (c Client) walkBlocks(ctx context.Context, st *store.Store, pageID, parentID, spaceID string) (int, error) {
|
||||
var count int
|
||||
cursor := ""
|
||||
|
||||
@ -97,3 +97,93 @@ func TestSyncIngestsDatabasesAndRows(t *testing.T) {
|
||||
t.Fatalf("unexpected rows: %+v", rows)
|
||||
}
|
||||
}
|
||||
|
||||
func TestSyncIngestsCurrentDataSourcesAndRows(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
w.Header().Set("Content-Type", "application/json")
|
||||
switch r.URL.Path {
|
||||
case "/users":
|
||||
_, _ = w.Write([]byte(`{"object":"list","results":[],"has_more":false}`))
|
||||
case "/search":
|
||||
if got := r.Header.Get("Notion-Version"); got != "2026-03-11" {
|
||||
t.Fatalf("unexpected Notion-Version: %s", got)
|
||||
}
|
||||
var body map[string]any
|
||||
if err := json.NewDecoder(r.Body).Decode(&body); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
filter := body["filter"].(map[string]any)
|
||||
switch filter["value"] {
|
||||
case "page":
|
||||
_, _ = w.Write([]byte(`{"object":"list","results":[],"has_more":false}`))
|
||||
case "data_source":
|
||||
_, _ = w.Write([]byte(`{
|
||||
"object":"list",
|
||||
"results":[{
|
||||
"object":"data_source",
|
||||
"id":"ds1",
|
||||
"title":[{"plain_text":"Roadmap"}],
|
||||
"parent":{"type":"database_id","database_id":"db1"},
|
||||
"database_parent":{"type":"page_id","page_id":"page-parent"},
|
||||
"properties":{
|
||||
"Name":{"id":"title","type":"title","title":{}},
|
||||
"Status":{"id":"status","type":"select","select":{}}
|
||||
}
|
||||
}],
|
||||
"has_more":false
|
||||
}`))
|
||||
default:
|
||||
t.Fatalf("unexpected search filter: %v", filter["value"])
|
||||
}
|
||||
case "/data_sources/ds1/query":
|
||||
_, _ = w.Write([]byte(`{
|
||||
"object":"list",
|
||||
"results":[{
|
||||
"object":"page",
|
||||
"id":"page1",
|
||||
"created_time":"2026-01-01T00:00:00Z",
|
||||
"last_edited_time":"2026-01-02T00:00:00Z",
|
||||
"in_trash":false,
|
||||
"url":"https://notion.so/page1",
|
||||
"parent":{"type":"data_source_id","data_source_id":"ds1"},
|
||||
"properties":{
|
||||
"Name":{"id":"title","type":"title","title":[{"plain_text":"Ship"}]},
|
||||
"Status":{"id":"status","type":"select","select":{"name":"Done"}}
|
||||
}
|
||||
}],
|
||||
"has_more":false
|
||||
}`))
|
||||
default:
|
||||
t.Fatalf("unexpected request: %s %s", r.Method, r.URL.String())
|
||||
}
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
st, err := store.Open(filepath.Join(t.TempDir(), "notcrawl.db"))
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer st.Close()
|
||||
|
||||
summary, err := (Client{BaseURL: server.URL, Version: "2026-03-11", Token: "secret"}).Sync(context.Background(), st)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if summary.Databases != 1 || summary.DatabaseRows != 1 {
|
||||
t.Fatalf("unexpected summary: %+v", summary)
|
||||
}
|
||||
collections, err := st.Collections(context.Background())
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(collections) != 1 || collections[0].ID != "ds1" || collections[0].ParentID != "db1" {
|
||||
t.Fatalf("unexpected collections: %+v", collections)
|
||||
}
|
||||
rows, err := st.CollectionPages(context.Background(), "ds1")
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(rows) != 1 || rows[0].ID != "page1" || rows[0].CollectionID != "ds1" {
|
||||
t.Fatalf("unexpected rows: %+v", rows)
|
||||
}
|
||||
}
|
||||
|
||||
@ -16,19 +16,28 @@ import (
|
||||
const schemaVersion = 1
|
||||
|
||||
type Store struct {
|
||||
db *sql.DB
|
||||
db *sql.DB
|
||||
path string
|
||||
}
|
||||
|
||||
func Open(path string) (*Store, error) {
|
||||
if err := os.MkdirAll(filepath.Dir(path), 0o755); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db, err := sql.Open("sqlite", path)
|
||||
if err := ensureDBFile(path); err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db, err := sql.Open("sqlite", sqliteDSN(path))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
db.SetMaxOpenConns(1)
|
||||
st := &Store{db: db}
|
||||
db.SetMaxIdleConns(1)
|
||||
if err := db.PingContext(context.Background()); err != nil {
|
||||
_ = db.Close()
|
||||
return nil, err
|
||||
}
|
||||
st := &Store{db: db, path: path}
|
||||
if err := st.init(context.Background()); err != nil {
|
||||
_ = db.Close()
|
||||
return nil, err
|
||||
@ -36,18 +45,85 @@ func Open(path string) (*Store, error) {
|
||||
return st, nil
|
||||
}
|
||||
|
||||
func sqliteDSN(path string) string {
|
||||
pragmas := "_pragma=foreign_keys(1)&_pragma=journal_mode(WAL)&_pragma=synchronous(NORMAL)&_pragma=temp_store(MEMORY)&_pragma=mmap_size(268435456)&_pragma=busy_timeout(5000)"
|
||||
if path == ":memory:" {
|
||||
return "file::memory:?cache=shared&" + pragmas
|
||||
}
|
||||
if strings.HasPrefix(path, "file:") {
|
||||
sep := "?"
|
||||
if strings.Contains(path, "?") {
|
||||
sep = "&"
|
||||
}
|
||||
return path + sep + pragmas
|
||||
}
|
||||
return "file:" + path + "?" + pragmas
|
||||
}
|
||||
|
||||
func ensureDBFile(path string) error {
|
||||
if path == ":memory:" || strings.HasPrefix(path, "file:") {
|
||||
return nil
|
||||
}
|
||||
if _, err := os.Stat(path); err == nil {
|
||||
return os.Chmod(path, 0o600)
|
||||
} else if !errors.Is(err, os.ErrNotExist) {
|
||||
return err
|
||||
}
|
||||
file, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_WRONLY, 0o600)
|
||||
if err != nil && !errors.Is(err, os.ErrExist) {
|
||||
return err
|
||||
}
|
||||
if file != nil {
|
||||
if err := file.Close(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (s *Store) DB() *sql.DB {
|
||||
return s.db
|
||||
}
|
||||
|
||||
func (s *Store) Close() error {
|
||||
if s == nil || s.db == nil {
|
||||
return nil
|
||||
}
|
||||
return s.db.Close()
|
||||
}
|
||||
|
||||
type Status struct {
|
||||
DBPath string `json:"db_path"`
|
||||
DBBytes int64 `json:"db_bytes"`
|
||||
WALBytes int64 `json:"wal_bytes"`
|
||||
Spaces int `json:"spaces"`
|
||||
Users int `json:"users"`
|
||||
Pages int `json:"pages"`
|
||||
Blocks int `json:"blocks"`
|
||||
Collections int `json:"collections"`
|
||||
Comments int `json:"comments"`
|
||||
RawRecords int `json:"raw_records"`
|
||||
LastSyncAt int64 `json:"last_sync_at"`
|
||||
}
|
||||
|
||||
type MaintenanceSummary struct {
|
||||
RebuiltFTS bool `json:"rebuilt_fts"`
|
||||
Optimized bool `json:"optimized"`
|
||||
Analyzed bool `json:"analyzed"`
|
||||
Vacuumed bool `json:"vacuumed"`
|
||||
DBBytes int64 `json:"db_bytes"`
|
||||
WALBytes int64 `json:"wal_bytes"`
|
||||
Message string `json:"message"`
|
||||
}
|
||||
|
||||
func (s *Store) init(ctx context.Context) error {
|
||||
stmts := []string{
|
||||
`pragma foreign_keys = on`,
|
||||
`pragma journal_mode = wal`,
|
||||
`pragma synchronous = normal`,
|
||||
`pragma temp_store = memory`,
|
||||
`pragma mmap_size = 268435456`,
|
||||
`pragma busy_timeout = 5000`,
|
||||
`create table if not exists meta (key text primary key, value text not null)`,
|
||||
`create table if not exists spaces (
|
||||
id text primary key,
|
||||
@ -82,6 +158,10 @@ func (s *Store) init(ctx context.Context) error {
|
||||
raw_json text,
|
||||
synced_at integer not null
|
||||
)`,
|
||||
`create index if not exists pages_collection_id on pages(collection_id)`,
|
||||
`create index if not exists pages_parent_id on pages(parent_id)`,
|
||||
`create index if not exists pages_last_edited_time on pages(last_edited_time desc)`,
|
||||
`create index if not exists pages_source_synced_at on pages(source, synced_at desc)`,
|
||||
`create table if not exists blocks (
|
||||
id text primary key,
|
||||
page_id text,
|
||||
@ -101,6 +181,7 @@ func (s *Store) init(ctx context.Context) error {
|
||||
synced_at integer not null
|
||||
)`,
|
||||
`create index if not exists blocks_page_id on blocks(page_id)`,
|
||||
`create index if not exists blocks_page_alive_created on blocks(page_id, alive, created_time, id)`,
|
||||
`create index if not exists blocks_parent_id on blocks(parent_id)`,
|
||||
`create table if not exists collections (
|
||||
id text primary key,
|
||||
@ -113,6 +194,8 @@ func (s *Store) init(ctx context.Context) error {
|
||||
source text not null,
|
||||
synced_at integer not null
|
||||
)`,
|
||||
`create index if not exists collections_parent_id on collections(parent_id)`,
|
||||
`create index if not exists collections_name on collections(name)`,
|
||||
`create table if not exists comments (
|
||||
id text primary key,
|
||||
page_id text,
|
||||
@ -127,6 +210,8 @@ func (s *Store) init(ctx context.Context) error {
|
||||
source text not null,
|
||||
synced_at integer not null
|
||||
)`,
|
||||
`create index if not exists comments_page_id on comments(page_id)`,
|
||||
`create index if not exists comments_created_time on comments(created_time, id)`,
|
||||
`create table if not exists raw_records (
|
||||
source text not null,
|
||||
record_table text not null,
|
||||
@ -137,6 +222,7 @@ func (s *Store) init(ctx context.Context) error {
|
||||
synced_at integer not null,
|
||||
primary key (source, record_table, record_id)
|
||||
)`,
|
||||
`create index if not exists raw_records_parent on raw_records(parent_id, record_table)`,
|
||||
`create table if not exists sync_state (
|
||||
source text not null,
|
||||
entity_type text not null,
|
||||
@ -145,6 +231,7 @@ func (s *Store) init(ctx context.Context) error {
|
||||
synced_at integer not null,
|
||||
primary key (source, entity_type, entity_id)
|
||||
)`,
|
||||
`create index if not exists sync_state_synced_at on sync_state(synced_at desc)`,
|
||||
`create virtual table if not exists page_fts using fts5(page_id unindexed, title, body)`,
|
||||
`create virtual table if not exists comment_fts using fts5(comment_id unindexed, page_id unindexed, body)`,
|
||||
}
|
||||
@ -391,3 +478,71 @@ func (s *Store) RebuildFTS(ctx context.Context) error {
|
||||
_, err = s.db.ExecContext(ctx, `insert into comment_fts(comment_id, page_id, body) select id, page_id, text from comments where alive = 1`)
|
||||
return err
|
||||
}
|
||||
|
||||
func (s *Store) Status(ctx context.Context) (Status, error) {
|
||||
status := Status{DBPath: s.path}
|
||||
counts := []struct {
|
||||
query string
|
||||
dest *int
|
||||
}{
|
||||
{`select count(*) from spaces`, &status.Spaces},
|
||||
{`select count(*) from users`, &status.Users},
|
||||
{`select count(*) from pages`, &status.Pages},
|
||||
{`select count(*) from blocks`, &status.Blocks},
|
||||
{`select count(*) from collections`, &status.Collections},
|
||||
{`select count(*) from comments`, &status.Comments},
|
||||
{`select count(*) from raw_records`, &status.RawRecords},
|
||||
}
|
||||
for _, count := range counts {
|
||||
if err := s.db.QueryRowContext(ctx, count.query).Scan(count.dest); err != nil {
|
||||
return Status{}, err
|
||||
}
|
||||
}
|
||||
if err := s.db.QueryRowContext(ctx, `select coalesce(max(synced_at), 0) from sync_state`).Scan(&status.LastSyncAt); err != nil {
|
||||
return Status{}, err
|
||||
}
|
||||
status.DBBytes = fileSize(s.path)
|
||||
status.WALBytes = fileSize(s.path + "-wal")
|
||||
return status, nil
|
||||
}
|
||||
|
||||
func (s *Store) Optimize(ctx context.Context, vacuum bool) (MaintenanceSummary, error) {
|
||||
if err := s.RebuildFTS(ctx); err != nil {
|
||||
return MaintenanceSummary{}, err
|
||||
}
|
||||
for _, stmt := range []string{
|
||||
`insert into page_fts(page_fts) values('optimize')`,
|
||||
`insert into comment_fts(comment_fts) values('optimize')`,
|
||||
`pragma optimize`,
|
||||
`analyze`,
|
||||
} {
|
||||
if _, err := s.db.ExecContext(ctx, stmt); err != nil {
|
||||
return MaintenanceSummary{}, err
|
||||
}
|
||||
}
|
||||
if vacuum {
|
||||
if _, err := s.db.ExecContext(ctx, `vacuum`); err != nil {
|
||||
return MaintenanceSummary{}, err
|
||||
}
|
||||
}
|
||||
return MaintenanceSummary{
|
||||
RebuiltFTS: true,
|
||||
Optimized: true,
|
||||
Analyzed: true,
|
||||
Vacuumed: vacuum,
|
||||
DBBytes: fileSize(s.path),
|
||||
WALBytes: fileSize(s.path + "-wal"),
|
||||
Message: "database maintenance complete",
|
||||
}, nil
|
||||
}
|
||||
|
||||
func fileSize(path string) int64 {
|
||||
if path == "" || path == ":memory:" || strings.HasPrefix(path, "file:") {
|
||||
return 0
|
||||
}
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
return 0
|
||||
}
|
||||
return info.Size()
|
||||
}
|
||||
|
||||
@ -2,6 +2,7 @@ package store
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
@ -31,3 +32,78 @@ func TestStoreUpsertsAndSearchesPage(t *testing.T) {
|
||||
t.Fatalf("expected page1, got %q", results[0].ID)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStoreStatusAndOptimize(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "notcrawl.db")
|
||||
st, err := Open(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer st.Close()
|
||||
ctx := context.Background()
|
||||
now := NowMS()
|
||||
if err := st.UpsertPage(ctx, Page{ID: "page1", Title: "Launch Plan", Alive: true, Source: "test", SyncedAt: now}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := st.UpsertBlock(ctx, Block{ID: "block1", PageID: "page1", Type: "text", Text: "maintenance keeps search sharp", Alive: true, Source: "test", SyncedAt: now}); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if err := st.SetSyncState(ctx, "test", "workspace", "default", "done"); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
|
||||
status, err := st.Status(ctx)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if status.Pages != 1 || status.Blocks != 1 || status.LastSyncAt == 0 || status.DBBytes == 0 {
|
||||
t.Fatalf("unexpected status: %+v", status)
|
||||
}
|
||||
|
||||
summary, err := st.Optimize(ctx, false)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if !summary.RebuiltFTS || !summary.Optimized || !summary.Analyzed || summary.Vacuumed {
|
||||
t.Fatalf("unexpected maintenance summary: %+v", summary)
|
||||
}
|
||||
results, err := st.Search(ctx, "maintenance", 10)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if len(results) != 1 || results[0].ID != "page1" {
|
||||
t.Fatalf("unexpected search results after optimize: %+v", results)
|
||||
}
|
||||
}
|
||||
|
||||
func TestStoreOpenAppliesSQLitePragmasAndPrivateFileMode(t *testing.T) {
|
||||
path := filepath.Join(t.TempDir(), "notcrawl.db")
|
||||
st, err := Open(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
defer st.Close()
|
||||
|
||||
info, err := os.Stat(path)
|
||||
if err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if info.Mode().Perm()&0o077 != 0 {
|
||||
t.Fatalf("database should not be group/world-readable: %s", info.Mode().Perm())
|
||||
}
|
||||
|
||||
var journalMode string
|
||||
if err := st.DB().QueryRow(`pragma journal_mode`).Scan(&journalMode); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if journalMode != "wal" {
|
||||
t.Fatalf("expected WAL journal mode, got %q", journalMode)
|
||||
}
|
||||
var busyTimeout int
|
||||
if err := st.DB().QueryRow(`pragma busy_timeout`).Scan(&busyTimeout); err != nil {
|
||||
t.Fatal(err)
|
||||
}
|
||||
if busyTimeout != 5000 {
|
||||
t.Fatalf("expected busy_timeout=5000, got %d", busyTimeout)
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user