commit a03b44eb6f0d42ef4fb098bbdb31f4422e87ac09 Author: Vincent Koc Date: Wed Apr 22 14:41:56 2026 -0700 chore: scaffold notioncrawl diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..a995136 --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +bin/ +dist/ +*.db +*.db-* +*.log +.DS_Store +.notioncrawl/ diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000..b0f6718 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,19 @@ +# Contributing to notioncrawl + +Keep real Notion workspace data, secrets, tokens, cookies, and exported private +content out of git. + +Useful local checks: + +```bash +go test ./... +go build ./cmd/notioncrawl +``` + +Implementation notes: + +- read Notion Desktop data through snapshots only +- prefer stable normalized rows plus raw source payloads +- keep Markdown rendering deterministic +- add comments only where Notion-specific behavior is not obvious +- keep `README.md`, `SPEC.md`, and examples in sync diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..a7cb85a --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2026 Vincent Koc + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..f8abf73 --- /dev/null +++ b/Makefile @@ -0,0 +1,15 @@ +BINARY ?= bin/notioncrawl + +.PHONY: build test run fmt + +build: + go build -o $(BINARY) ./cmd/notioncrawl + +test: + go test ./... + +run: + go run ./cmd/notioncrawl $(ARGS) + +fmt: + gofmt -w $$(find . -name '*.go' -not -path './.git/*') diff --git a/README.md b/README.md new file mode 100644 index 0000000..baf461e --- /dev/null +++ b/README.md @@ -0,0 +1,71 @@ +# notioncrawl + +`notioncrawl` mirrors Notion workspace data into local SQLite and normalized +Markdown so you can search, query, diff, and share your Notion memory without +depending on the Notion UI. + +It has two ingestion paths: + +- `desktop`: read-only snapshots of the local Notion desktop cache +- `api`: official Notion API sync with rate-limit aware crawling + +SQLite is the canonical archive. Markdown is the durable human/agent surface. +Git share mode publishes normalized snapshots that other machines can subscribe +to without holding Notion credentials. + +## Current Scope + +- local SQLite storage with FTS5 +- read-only local desktop cache ingestion from macOS Notion +- official API page/block/user/comment ingestion +- normalized Markdown export organized by space and page path +- compressed JSONL git-share snapshots plus import/update workflows +- read-only SQL access for ad hoc inspection + +## Quick Start + +```bash +go build -o bin/notioncrawl ./cmd/notioncrawl +bin/notioncrawl init +bin/notioncrawl doctor +bin/notioncrawl sync --source desktop +bin/notioncrawl export-md +bin/notioncrawl search "launch plan" +``` + +For API sync: + +```bash +export NOTION_TOKEN="secret_..." 
+bin/notioncrawl sync --source api +``` + +Default paths: + +- config: `~/.notioncrawl/config.toml` +- database: `~/.notioncrawl/notioncrawl.db` +- cache: `~/.notioncrawl/cache` +- Markdown archive: `~/.notioncrawl/pages` +- git share repo: `~/.notioncrawl/share` + +## Commands + +- `init` writes a starter config +- `doctor` checks config, SQLite, desktop cache, and token presence +- `sync` ingests from `desktop`, `api`, or `all` +- `export-md` renders normalized Markdown files from SQLite +- `search` searches page and comment text through FTS5 +- `sql` runs read-only SQL against the archive +- `publish` exports SQLite tables and Markdown into a git share repo +- `subscribe` clones a share repo and imports the latest snapshot +- `update` pulls and imports a subscribed share repo + +## Safety Model + +Desktop mode is read-only. It snapshots Notion's local SQLite database before +reading it and never writes to Notion application storage. + +API mode uses the official Notion API. It stores raw API payloads alongside +normalized rows so renderers can improve without recrawling. + +Secrets are never exported into Markdown or git-share snapshots. diff --git a/SPEC.md b/SPEC.md new file mode 100644 index 0000000..913e999 --- /dev/null +++ b/SPEC.md @@ -0,0 +1,132 @@ +# notioncrawl Spec + +## Goals + +- build a local-first Notion crawler +- mirror Notion pages, blocks, databases, comments, and workspace metadata +- store normalized records in SQLite +- preserve raw source records for future re-rendering +- render normalized Markdown blobs into an organized file tree +- support fast text search and raw SQL +- support one-shot backfill and incremental repair +- publish and subscribe private git-backed snapshots + +## Product Summary + +`notioncrawl` is a Go CLI that turns Notion workspace memory into a local +SQLite archive plus normalized Markdown files. 
+ +V1 scope: + +- macOS Notion Desktop cache discovery +- read-only desktop snapshot ingestion +- official Notion API sync +- pages and blocks +- databases/data sources as collections +- comments and discussions where available +- users and spaces/workspaces +- FTS5 search over rendered page/comment text +- raw SQL access +- Markdown export +- git-backed archive publishing and subscription + +Out of scope for V1: + +- write-back actions +- modifying Notion local storage +- bypassing workspace permissions +- full attachment blob mirroring by default +- public integration Marketplace hardening + +## Data Sources + +### Desktop Source + +Default macOS path: + +```text +~/Library/Application Support/Notion/notion.db +``` + +Desktop sync must: + +1. locate Notion Desktop storage +2. snapshot `notion.db` into the cache dir +3. open the snapshot read-only +4. ingest supported tables into the local archive +5. record unsupported source records in `raw_records` + +Desktop cache coverage is opportunistic. It only includes what Notion has +cached, downloaded, or recently touched locally. + +### API Source + +API sync uses `NOTION_TOKEN` by default. It must: + +1. search/list pages and data sources visible to the integration +2. recursively fetch block children +3. fetch users +4. fetch comments where the integration has access +5. obey `Retry-After` on rate limits +6. store raw JSON plus normalized rows + +## SQLite Archive + +SQLite is canonical. Markdown is generated output. + +Core tables: + +- `spaces` +- `users` +- `pages` +- `blocks` +- `collections` +- `collection_views` +- `comments` +- `discussions` +- `raw_records` +- `sync_state` +- `page_fts` +- `comment_fts` + +## Markdown Archive + +Markdown export writes deterministic paths: + +```text +pages/<space>/<title>-<id>.md +``` + +Each file starts with YAML-ish front matter: + +```yaml +--- +id: ... +space_id: ... +title: ... +source: desktop+api +notion_url: ... +created_time: ... +last_edited_time: ... 
+--- +``` + +The body renders blocks into normalized Markdown. Unsupported blocks should be +represented with concise placeholders, not silently dropped. + +## Git Share + +Git share mode exports: + +```text +manifest.json +data/*.jsonl.gz +pages/**/*.md +``` + +`publish` writes a snapshot and optionally commits/pushes it. + +`subscribe` clones a snapshot repo, writes reader config, and imports data into +SQLite without requiring Notion credentials. + +`update` pulls the latest snapshot and imports it. diff --git a/cmd/notioncrawl/main.go b/cmd/notioncrawl/main.go new file mode 100644 index 0000000..1d41014 --- /dev/null +++ b/cmd/notioncrawl/main.go @@ -0,0 +1,15 @@ +package main + +import ( + "fmt" + "os" +) + +func main() { + if len(os.Args) > 1 && (os.Args[1] == "-h" || os.Args[1] == "--help") { + fmt.Print("Usage of notioncrawl:\n notioncrawl [global flags] [args]\n") + return + } + fmt.Fprintln(os.Stderr, "notioncrawl: implementation in progress") + os.Exit(2) +} diff --git a/config.example.toml b/config.example.toml new file mode 100644 index 0000000..8a3a2ed --- /dev/null +++ b/config.example.toml @@ -0,0 +1,19 @@ +db_path = "~/.notioncrawl/notioncrawl.db" +cache_dir = "~/.notioncrawl/cache" +markdown_dir = "~/.notioncrawl/pages" + +[notion.desktop] +enabled = true +path = "" + +[notion.api] +enabled = true +token_env = "NOTION_TOKEN" +base_url = "https://api.notion.com/v1" +version = "2022-06-28" + +[share] +remote = "" +branch = "main" +repo_path = "~/.notioncrawl/share" +stale_after = "1h" diff --git a/go.mod b/go.mod new file mode 100644 index 0000000..91f8f08 --- /dev/null +++ b/go.mod @@ -0,0 +1,3 @@ +module github.com/vincentkoc/notioncrawl + +go 1.26.0