feat: add archive activity report

This commit is contained in:
Vincent Koc 2026-04-22 23:03:06 -07:00 committed by GitHub
parent 796a5a03d4
commit e5764220b2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 313 additions and 2 deletions

View File

@ -25,7 +25,7 @@ to without holding Notion credentials.
- normalized Markdown export organized by space and page path
- CSV/TSV export for crawled Notion database rows
- compressed JSONL git-share snapshots plus import/update workflows
- archive status and SQLite maintenance commands
- archive status, activity reporting, and SQLite maintenance commands
- read-only SQL access for ad hoc inspection
## Install
@ -46,6 +46,7 @@ Use the local Notion Desktop cache:
notcrawl init
notcrawl doctor
notcrawl status
notcrawl report
notcrawl sync --source desktop
notcrawl export-md
notcrawl search "launch plan"
@ -73,6 +74,7 @@ Default paths:
- `init` writes a starter config
- `doctor` checks config, SQLite, desktop cache, and token presence
- `status` prints archive counts, last sync time, and database/WAL size
- `report` summarizes recent page, database, space, and comment activity
- `maintain` rebuilds FTS, optimizes SQLite indexes, and can run `VACUUM`
- `sync` ingests from `desktop`, `api`, or `all`
- `export-md` renders normalized Markdown files from SQLite

View File

@ -28,7 +28,7 @@ V1 scope:
- users and spaces/workspaces
- FTS5 search over rendered page/comment text
- raw SQL access
- archive status and SQLite maintenance commands
- archive status, activity reporting, and SQLite maintenance commands
- Markdown export
- CSV/TSV export for database rows
- git-backed archive publishing and subscription
@ -84,6 +84,9 @@ Store startup must enable WAL, foreign keys, a busy timeout, normal
synchronous writes, in-memory temp storage, and the crawler query indexes needed
for common page, collection, comment, raw-record, and sync-state lookups.
`report` must provide a SQL-free archive summary: total record counts, page-edit
and comment activity over recent time windows, top databases, top spaces, and
recently edited pages.
Core tables:
- `spaces`

View File

@ -14,6 +14,7 @@ import (
"github.com/vincentkoc/notcrawl/internal/markdown"
"github.com/vincentkoc/notcrawl/internal/notionapi"
"github.com/vincentkoc/notcrawl/internal/notiondesktop"
"github.com/vincentkoc/notcrawl/internal/report"
"github.com/vincentkoc/notcrawl/internal/share"
"github.com/vincentkoc/notcrawl/internal/store"
"github.com/vincentkoc/notcrawl/internal/tableexport"
@ -68,6 +69,8 @@ func run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
return runDoctor(ctx, stdout, cfg)
case "status":
return runStatus(ctx, stdout, cfg)
case "report":
return runReport(ctx, stdout, cfg)
case "maintain":
return runMaintain(ctx, stdout, cfg, cmdArgs)
case "sync":
@ -145,6 +148,24 @@ func runStatus(ctx context.Context, stdout io.Writer, cfg config.Config) error {
return nil
}
// runReport opens the archive database at cfg.DBPath, builds the activity
// report with zero-value report.Options, and writes it to stdout as indented
// JSON followed by a newline.
//
// It returns any error from opening the store, building the report, encoding
// the JSON, or writing to stdout.
func runReport(ctx context.Context, stdout io.Writer, cfg config.Config) error {
	st, err := store.Open(cfg.DBPath)
	if err != nil {
		return err
	}
	defer st.Close()

	activity, err := report.Build(ctx, st, report.Options{})
	if err != nil {
		return err
	}
	b, err := json.MarshalIndent(activity, "", " ")
	if err != nil {
		return err
	}
	// Report a failed write instead of returning success with no output.
	if _, err := fmt.Fprintln(stdout, string(b)); err != nil {
		return fmt.Errorf("write report: %w", err)
	}
	return nil
}
func runMaintain(ctx context.Context, stdout io.Writer, cfg config.Config, args []string) error {
fs := flag.NewFlagSet("maintain", flag.ContinueOnError)
vacuum := fs.Bool("vacuum", false, "run VACUUM after rebuilding and optimizing indexes")
@ -456,6 +477,7 @@ Commands:
init Write a starter config
doctor Check config, database, desktop cache, and token
status Show archive counts and database size
report Show recent archive activity
maintain [--vacuum] Rebuild FTS and optimize SQLite indexes
sync --source desktop Ingest Notion Desktop cache
sync --source api Ingest through the official Notion API

217
internal/report/report.go Normal file
View File

@ -0,0 +1,217 @@
package report
import (
"context"
"database/sql"
"fmt"
"time"
"github.com/vincentkoc/notcrawl/internal/store"
)
// Options configures Build.
type Options struct {
// Now is the report generation time. Build substitutes the current UTC time
// when it is the zero value.
Now time.Time
}
// ActivityReport is the JSON-serializable archive summary produced by Build.
type ActivityReport struct {
// GeneratedAt is the UTC time the report was built for.
GeneratedAt time.Time `json:"generated_at"`
// LatestEditedAt is the newest edit (or creation) timestamp among alive
// pages. It is nil, and omitted from JSON, when no alive page has a positive
// timestamp.
LatestEditedAt *time.Time `json:"latest_edited_at,omitempty"`
// Row counts per table. Pages, blocks, and comments count alive rows only.
TotalSpaces int `json:"total_spaces"`
TotalUsers int `json:"total_users"`
TotalPages int `json:"total_pages"`
TotalBlocks int `json:"total_blocks"`
// TotalDatabases counts rows in the collections table.
TotalDatabases int `json:"total_databases"`
TotalComments int `json:"total_comments"`
TotalRaw int `json:"total_raw_records"`
// Windows holds one entry per look-back window, in the order Build scans them.
Windows []WindowStats `json:"windows"`
// TopCollections and TopSpaces rank collections and spaces by number of
// alive pages edited in the last seven days of archive activity.
TopCollections []RankedCount `json:"top_collections"`
TopSpaces []RankedCount `json:"top_spaces"`
// RecentPages lists the most recently edited alive pages, newest first.
RecentPages []PageActivity `json:"recent_pages"`
}
// WindowStats summarizes archive activity at or after a cutoff time.
type WindowStats struct {
// Label is the human-readable window name, e.g. "7 days".
Label string `json:"label"`
// Since is the inclusive UTC cutoff for the window.
Since time.Time `json:"since"`
// EditedPages counts alive pages whose edit (or creation) time is at or
// after Since.
EditedPages int `json:"edited_pages"`
// ActiveCollections counts the distinct non-empty collection IDs among
// those pages.
ActiveCollections int `json:"active_collections"`
// Comments counts alive comments created at or after Since.
Comments int `json:"comments"`
}
// RankedCount is one row of a ranking: a display name and how many pages
// were counted for it.
type RankedCount struct {
Name string `json:"name"`
Count int `json:"count"`
}
// PageActivity describes a single page in the recently-edited list.
type PageActivity struct {
ID string `json:"id"`
Title string `json:"title"`
// CollectionID and SpaceID are omitted from JSON when empty.
CollectionID string `json:"collection_id,omitempty"`
SpaceID string `json:"space_id,omitempty"`
// EditedAt is the page's edit (or creation) time in UTC. It is nil, and
// omitted from JSON, when the stored timestamp is not positive.
EditedAt *time.Time `json:"edited_at,omitempty"`
}
// Build assembles an ActivityReport from the archive database behind st.
//
// opts.Now sets GeneratedAt (normalized to UTC); when it is zero the current
// time is used. The 24-hour, 7-day, and 30-day activity windows and the 7-day
// top-collection and top-space rankings are anchored to the newest page edit
// found in the archive, and fall back to the generation time only when no
// alive page carries a positive timestamp. Rankings and the recent-page list
// are capped at eight entries each.
//
// Every slice field of the returned report is non-nil, so empty results encode
// as JSON arrays rather than null. On failure the zero ActivityReport is
// returned together with an error describing which step failed.
func Build(ctx context.Context, st *store.Store, opts Options) (ActivityReport, error) {
	const (
		day      = 24 * time.Hour
		rankSize = 8
	)

	now := opts.Now
	if now.IsZero() {
		now = time.Now()
	}
	now = now.UTC()

	db := st.DB()
	report := ActivityReport{GeneratedAt: now}
	if err := scanTotals(ctx, db, &report); err != nil {
		return ActivityReport{}, err
	}

	// Anchor the windows to the archive's own latest activity so a stale
	// archive still reports its most recent period instead of all zeros.
	anchor := now
	if report.LatestEditedAt != nil && !report.LatestEditedAt.IsZero() {
		anchor = *report.LatestEditedAt
	}

	windows := []struct {
		label string
		dur   time.Duration
	}{
		{"24 hours", day},
		{"7 days", 7 * day},
		{"30 days", 30 * day},
	}
	report.Windows = make([]WindowStats, 0, len(windows))
	for _, window := range windows {
		stats, err := scanWindow(ctx, db, window.label, anchor.Add(-window.dur))
		if err != nil {
			return ActivityReport{}, err
		}
		report.Windows = append(report.Windows, stats)
	}

	weekCutoff := unixMilli(anchor.Add(-7 * day))
	var err error
	report.TopCollections, err = ranked(ctx, db, `
select coalesce(nullif(c.name, ''), nullif(p.collection_id, ''), 'no database') as name, count(*) as total
from pages p
left join collections c on c.id = p.collection_id
where p.alive = 1 and coalesce(p.last_edited_time, p.created_time, 0) >= ?
group by coalesce(nullif(c.name, ''), nullif(p.collection_id, ''), 'no database')
order by total desc, name asc
limit ?
`, weekCutoff, rankSize)
	if err != nil {
		// ranked is generic, so the caller supplies the context.
		return ActivityReport{}, fmt.Errorf("rank top collections: %w", err)
	}
	if report.TopCollections == nil {
		report.TopCollections = []RankedCount{}
	}

	report.TopSpaces, err = ranked(ctx, db, `
select coalesce(nullif(s.name, ''), nullif(p.space_id, ''), 'default') as name, count(*) as total
from pages p
left join spaces s on s.id = p.space_id
where p.alive = 1 and coalesce(p.last_edited_time, p.created_time, 0) >= ?
group by coalesce(nullif(s.name, ''), nullif(p.space_id, ''), 'default')
order by total desc, name asc
limit ?
`, weekCutoff, rankSize)
	if err != nil {
		return ActivityReport{}, fmt.Errorf("rank top spaces: %w", err)
	}
	if report.TopSpaces == nil {
		report.TopSpaces = []RankedCount{}
	}

	report.RecentPages, err = recentPages(ctx, db, rankSize)
	if err != nil {
		return ActivityReport{}, err
	}
	if report.RecentPages == nil {
		report.RecentPages = []PageActivity{}
	}
	return report, nil
}
// scanTotals fills the total counters of report with a single query.
//
// Pages, blocks, and comments are counted only where alive = 1; spaces, users,
// collections, and raw records are counted in full. LatestEditedAt is set to
// the greatest edit (or creation) time among alive pages, read as Unix
// milliseconds and converted to UTC; it is left untouched when that value is
// NULL or not positive.
func scanTotals(ctx context.Context, db *sql.DB, report *ActivityReport) error {
	const totalsQuery = `
select
(select count(*) from spaces),
(select count(*) from users),
(select count(*) from pages where alive = 1),
(select count(*) from blocks where alive = 1),
(select count(*) from collections),
(select count(*) from comments where alive = 1),
(select count(*) from raw_records),
(select max(coalesce(last_edited_time, created_time, 0)) from pages where alive = 1)
`
	var newest sql.NullInt64
	row := db.QueryRowContext(ctx, totalsQuery)
	err := row.Scan(
		&report.TotalSpaces,
		&report.TotalUsers,
		&report.TotalPages,
		&report.TotalBlocks,
		&report.TotalDatabases,
		&report.TotalComments,
		&report.TotalRaw,
		&newest,
	)
	if err != nil {
		return fmt.Errorf("scan report totals: %w", err)
	}

	// No usable timestamp: leave LatestEditedAt as it was.
	if !newest.Valid || newest.Int64 <= 0 {
		return nil
	}
	edited := time.UnixMilli(newest.Int64).UTC()
	report.LatestEditedAt = &edited
	return nil
}
// scanWindow counts activity at or after since and returns it under label.
//
// A single query yields three numbers: alive pages whose edit (or creation)
// time is at or after the cutoff, the distinct non-empty collection IDs among
// those pages, and alive comments created at or after the cutoff. The cutoff
// is passed to the query as Unix milliseconds. On failure the zero WindowStats
// is returned with an error naming the window.
func scanWindow(ctx context.Context, db *sql.DB, label string, since time.Time) (WindowStats, error) {
	const windowQuery = `
select
(select count(*) from pages where alive = 1 and coalesce(last_edited_time, created_time, 0) >= ?),
(select count(distinct nullif(collection_id, '')) from pages where alive = 1 and coalesce(last_edited_time, created_time, 0) >= ?),
(select count(*) from comments where alive = 1 and coalesce(created_time, 0) >= ?)
`
	sinceMs := unixMilli(since)
	result := WindowStats{Label: label, Since: since.UTC()}

	// The same cutoff binds to each of the three sub-selects.
	row := db.QueryRowContext(ctx, windowQuery, sinceMs, sinceMs, sinceMs)
	err := row.Scan(&result.EditedPages, &result.ActiveCollections, &result.Comments)
	if err != nil {
		return WindowStats{}, fmt.Errorf("scan %s stats: %w", label, err)
	}
	return result, nil
}
// ranked runs query with args and collects each result row as a RankedCount.
//
// The query must yield exactly two columns: a name scannable into a string
// followed by a count scannable into an int. Rows are returned in the order
// the database produces them; a query with no rows yields a nil slice. Errors
// are returned as-is, without added context.
func ranked(ctx context.Context, db *sql.DB, query string, args ...any) ([]RankedCount, error) {
	rows, err := db.QueryContext(ctx, query, args...)
	if err != nil {
		return nil, err
	}
	defer rows.Close()

	var results []RankedCount
	for rows.Next() {
		var (
			name  string
			count int
		)
		if scanErr := rows.Scan(&name, &count); scanErr != nil {
			return nil, scanErr
		}
		results = append(results, RankedCount{Name: name, Count: count})
	}
	// rows.Err reports a failure that ended iteration early.
	return results, rows.Err()
}
// recentPages returns up to limit alive pages ordered by edit (or creation)
// time, newest first, with ties broken by title.
//
// Missing collection and space IDs come back as empty strings. EditedAt is
// populated, in UTC, only when the stored Unix-millisecond timestamp is
// positive. A query with no rows yields a nil slice. Any query, scan, or
// iteration failure is wrapped with context and returned with a nil slice.
func recentPages(ctx context.Context, db *sql.DB, limit int) ([]PageActivity, error) {
	rows, err := db.QueryContext(ctx, `
select id, title, coalesce(collection_id, ''), coalesce(space_id, ''), coalesce(last_edited_time, created_time, 0)
from pages
where alive = 1
order by coalesce(last_edited_time, created_time, 0) desc, title asc
limit ?
`, limit)
	if err != nil {
		return nil, fmt.Errorf("query recent pages: %w", err)
	}
	defer rows.Close()

	var out []PageActivity
	for rows.Next() {
		var row PageActivity
		var edited int64
		if err := rows.Scan(&row.ID, &row.Title, &row.CollectionID, &row.SpaceID, &edited); err != nil {
			return nil, fmt.Errorf("scan recent pages: %w", err)
		}
		// The query coalesces a missing timestamp to 0; treat that as unknown.
		if edited > 0 {
			t := time.UnixMilli(edited).UTC()
			row.EditedAt = &t
		}
		out = append(out, row)
	}
	if err := rows.Err(); err != nil {
		// Do not hand back a partial list alongside an error.
		return nil, fmt.Errorf("scan recent pages: %w", err)
	}
	return out, nil
}
// unixMilli converts t to milliseconds since the Unix epoch, the integer form
// used for timestamp comparisons in the report queries.
func unixMilli(t time.Time) int64 {
	utc := t.UTC()
	return utc.UnixMilli()
}

View File

@ -0,0 +1,67 @@
package report
import (
"context"
"path/filepath"
"testing"
"time"
"github.com/vincentkoc/notcrawl/internal/store"
)
// TestBuildReport seeds a temporary archive with one space, user, collection,
// block, and comment plus two pages (one edited an hour before "now", one
// 48 hours before) and checks the totals, latest edit time, 24-hour window,
// rankings, and recent-page ordering that Build reports.
func TestBuildReport(t *testing.T) {
	ctx := context.Background()
	dbPath := filepath.Join(t.TempDir(), "notcrawl.db")
	st, err := store.Open(dbPath)
	if err != nil {
		t.Fatal(err)
	}
	defer st.Close()

	now := time.Date(2026, 4, 23, 5, 0, 0, 0, time.UTC)
	latest := now.Add(-time.Hour).UnixMilli()
	older := now.Add(-48 * time.Hour).UnixMilli()

	// Fixture rows, inserted in order; the first failure aborts the test.
	seed := []func() error{
		func() error {
			return st.UpsertSpace(ctx, store.Space{ID: "space1", Name: "HQ", Source: "test", SyncedAt: latest})
		},
		func() error {
			return st.UpsertUser(ctx, store.User{ID: "user1", Name: "Ada", Source: "test", SyncedAt: latest})
		},
		func() error {
			return st.UpsertCollection(ctx, store.Collection{ID: "db1", Name: "Roadmap", Source: "test", SyncedAt: latest})
		},
		func() error {
			return st.UpsertPage(ctx, store.Page{ID: "page1", SpaceID: "space1", CollectionID: "db1", Title: "Launch", CreatedTime: older, LastEditedTime: latest, Alive: true, Source: "test", SyncedAt: latest})
		},
		func() error {
			return st.UpsertPage(ctx, store.Page{ID: "page2", SpaceID: "space1", Title: "Notes", CreatedTime: older, LastEditedTime: older, Alive: true, Source: "test", SyncedAt: older})
		},
		func() error {
			return st.UpsertBlock(ctx, store.Block{ID: "block1", PageID: "page1", Type: "paragraph", Text: "ship", CreatedTime: latest, Alive: true, Source: "test", SyncedAt: latest})
		},
		func() error {
			return st.UpsertComment(ctx, store.Comment{ID: "comment1", PageID: "page1", Text: "done", CreatedTime: latest, Alive: true, Source: "test", SyncedAt: latest})
		},
	}
	for _, insert := range seed {
		if err := insert(); err != nil {
			t.Fatal(err)
		}
	}

	got, err := Build(ctx, st, Options{Now: now})
	if err != nil {
		t.Fatal(err)
	}

	totalsOK := got.TotalSpaces == 1 && got.TotalUsers == 1 && got.TotalPages == 2 &&
		got.TotalBlocks == 1 && got.TotalDatabases == 1 && got.TotalComments == 1
	if !totalsOK {
		t.Fatalf("unexpected totals: %+v", got)
	}

	wantLatest := time.UnixMilli(latest).UTC()
	if got.LatestEditedAt == nil || !got.LatestEditedAt.Equal(wantLatest) {
		t.Fatalf("unexpected latest edit: %s", got.LatestEditedAt)
	}

	// Only page1 and comment1 fall inside the 24-hour window.
	windowsOK := len(got.Windows) == 3 && got.Windows[0].EditedPages == 1 && got.Windows[0].Comments == 1
	if !windowsOK {
		t.Fatalf("unexpected windows: %+v", got.Windows)
	}

	if len(got.TopCollections) == 0 || got.TopCollections[0].Name != "Roadmap" {
		t.Fatalf("unexpected top collections: %+v", got.TopCollections)
	}
	if len(got.TopSpaces) == 0 || got.TopSpaces[0].Name != "HQ" {
		t.Fatalf("unexpected top spaces: %+v", got.TopSpaces)
	}
	if len(got.RecentPages) != 2 || got.RecentPages[0].ID != "page1" {
		t.Fatalf("unexpected recent pages: %+v", got.RecentPages)
	}
}