fix(share): delta import git snapshots

This commit is contained in:
Peter Steinberger 2026-05-08 08:29:38 +01:00
parent 335a95bd66
commit 67c6f4655b
No known key found for this signature in database
15 changed files with 588 additions and 65 deletions

View File

@ -4,22 +4,35 @@
### Changes ### Changes
- Document the crawlkit-backed config/status/control, snapshot, mirror, - Added `discrawl tui`, a terminal archive browser for stored guild messages and local `@me` wiretap DMs using the shared crawlkit pane browser.
sync-state, output, and shared TUI surfaces now used on `main`. - Added crawlkit-backed `metadata --json`, `status --json`, and `doctor --json` control surfaces for launchers, automation, and CI checks.
- Clarify that Discord bot sync, desktop wiretap parsing, DM privacy filters, - Published the generated documentation site at `discrawl.sh`, including command pages, install/setup docs, configuration, security notes, guides, a contact page, and social cards.
schema ownership, FTS/ranking, embeddings, and analytics remain app-owned. - Moved the Go module and release metadata to `github.com/openclaw/discrawl`.
- Align terminal browser docs with the gitcrawl-style shared TUI model:
channel/person/thread groups, message rows, detail/thread panes, sorting, ### Fixes
mouse selection, right-click actions, and local/remote status chrome.
- Made the terminal browser more useful and accurate: default guild scoping, newest-message startup, compact panes, selected-message detail panes, count-header sorting, local/remote status labels, right-click actions, Discord message URLs, row labels, direct-message pane labels, mention rendering, inline mention resolution, attachment details, and reply-context hydration without broad thread scans.
- Kept `tui --help`, status, and terminal-browser reads safe for fresh or missing local databases without triggering Git snapshot auto-update.
- Kept local-only snapshot rows filtered during shared archive imports and forwarded snapshot import progress through the crawlkit import path.
- Made stale Git snapshot imports plan shard deltas from crawlkit file fingerprints or Git object identity, so routine shared-archive refreshes import changed message tail shards instead of rebuilding every table and FTS index.
- Included progress percentages in message-sync logs.
- Fixed GoReleaser version stamping after the module path move.
### Documentation
- Documented the crawlkit-backed config/status/control, snapshot, mirror, sync-state, output, and shared TUI surfaces now used on `main`.
- Clarified that Discord bot sync, desktop wiretap parsing, DM privacy filters, schema ownership, FTS/ranking, embeddings, and analytics remain app-owned.
- Aligned terminal-browser docs with the gitcrawl-style shared TUI model: channel/person/thread groups, message rows, detail/thread panes, sorting, mouse selection, right-click actions, and local/remote status chrome.
- Refreshed the repo-local `discrawl` agent skill for local Discord archive, freshness, query, boundary, TUI, verification, and read-only SQL workflows.
### Maintenance ### Maintenance
- Document the read-only `metadata --json`, `status --json`, and - Migrated runtime paths, SQLite opening, archive mirror/export/import helpers, output/status wiring, and TUI plumbing onto the shared `crawlkit` infrastructure.
`doctor --json` control surface for launchers, automation, and CI checks. - Updated crawlkit through `v0.4.2`, switched imports to `github.com/openclaw/crawlkit`, and added CI smoke coverage for the crawlkit control surface and merge behavior.
- Refresh the repo-local `discrawl` agent skill for local Discord archive, - Added CodeQL, verified secret scanning, protected automation owners, stale issue automation, `.editorconfig`, and `.gitattributes`.
freshness, query, boundary, TUI, and verification workflows. - Added release workflow automation that dispatches the Homebrew tap formula update after GoReleaser publishes a tag.
- Document `discrawl sql` read-only query examples in the repo-local agent
skill so agents can do exact archive counts and rankings safely. ## 0.6.6 - 2026-05-05
### Fixes ### Fixes
@ -45,24 +58,10 @@
- Refreshed dependency and CI tooling pins, including GoReleaser, `go-toml`, golangci-lint, and gosec. - Refreshed dependency and CI tooling pins, including GoReleaser, `go-toml`, golangci-lint, and gosec.
- Tightened CI compatibility with the latest linters and made signal-cancellation and sync fixture tests deterministic under the race detector. - Tightened CI compatibility with the latest linters and made signal-cancellation and sync fixture tests deterministic under the race detector.
### Fixes
- Label direct-message TUI panes as direct messages instead of raw `@me` guild rows, keeping DM channel/person context readable.
- Inherit shared crawlkit TUI improvements for newest-first startup, count-header sorting, selected-message-first chat detail panes, and gitcrawl-style metadata labels.
- Surface Discord attachment filenames and extracted text in TUI detail panes instead of only showing `attachments=true`.
## 0.6.3 - 2026-05-01 ## 0.6.3 - 2026-05-01
### Changes
- Add crawlkit control metadata/status surfaces with `metadata --json`, `status --json`, and `doctor --json`.
- Add `tap` and `cache-import` as public desktop-cache import names while keeping `wiretap` as a documented legacy alias.
- Add `discrawl tui`, a terminal archive browser for stored guild messages and local `@me` wiretap DMs using the shared `crawlkit/tui` package.
- Render TUI rows with compact panes and expose pinned, attachment, reply, channel, and author metadata in the detail pane.
### Fixes ### Fixes
- Keep status and TUI reads safe for fresh or missing local databases without triggering git-share auto-update.
- Added OS keyring fallback for Discord bot-token resolution, keeping env as the first source and documenting the default keyring item. (#17) - Added OS keyring fallback for Discord bot-token resolution, keeping env as the first source and documenting the default keyring item. (#17)
- Clarified and locked down FTS query normalization so operator-like search terms such as `AND`, `OR`, `NOT`, `NEAR`, and `*` stay parameterized and quoted before SQLite `MATCH`. Thanks @mvanhorn. - Clarified and locked down FTS query normalization so operator-like search terms such as `AND`, `OR`, `NOT`, `NEAR`, and `*` stay parameterized and quoted before SQLite `MATCH`. Thanks @mvanhorn.

View File

@ -177,7 +177,9 @@ The terminal browser uses the shared crawlkit explorer. The left pane groups
channels, people, or threads; the middle pane lists messages; the right pane channels, people, or threads; the middle pane lists messages; the right pane
shows the selected message, surrounding conversation, and thread detail. Mouse shows the selected message, surrounding conversation, and thread detail. Mouse
selection, right-click actions, sortable headers, and the local/remote footer selection, right-click actions, sortable headers, and the local/remote footer
follow the same interaction model as `gitcrawl tui`. follow the same interaction model as `gitcrawl tui`. See
[`docs/commands/tui.md`](docs/commands/tui.md) for flags and read-only/DM scope
notes.
### `init` ### `init`
@ -247,6 +249,7 @@ When `--channels` includes a forum channel id, `discrawl` expands that forum's t
Long runs now emit periodic progress logs to stderr so large backfills and Git snapshot imports do not look hung. Long runs now emit periodic progress logs to stderr so large backfills and Git snapshot imports do not look hung.
If in-flight channels stop completing for a while, `discrawl` now emits `message sync waiting` heartbeat logs with the oldest active channel, per-channel page activity, and skip/defer counters, and every run ends with a `message sync finished` summary. If in-flight channels stop completing for a while, `discrawl` now emits `message sync waiting` heartbeat logs with the oldest active channel, per-channel page activity, and skip/defer counters, and every run ends with a `message sync finished` summary.
Each channel crawl also has a bounded runtime budget, so a pathological channel is deferred and retried on the next sync instead of pinning a worker forever. Each channel crawl also has a bounded runtime budget, so a pathological channel is deferred and retried on the next sync instead of pinning a worker forever.
Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl, and marker cleanup is best-effort so one missing local sync-state row cannot crash the run.
Full sync member refresh is best-effort and currently gives up after five minutes without a caller-supplied deadline, so message sync completion is not held hostage by a slow guild member crawl. Full sync member refresh is best-effort and currently gives up after five minutes without a caller-supplied deadline, so message sync completion is not held hostage by a slow guild member crawl.
When the archive is already complete, `sync --full` now reuses the stored backlog markers and limits steady-state refresh to live top-level channels plus active threads instead of revisiting every stored archived thread. When the archive is already complete, `sync --full` now reuses the stored backlog markers and limits steady-state refresh to live top-level channels plus active threads instead of revisiting every stored archived thread.
If a guild already has a local member snapshot, routine syncs reuse it and skip another full member crawl until that snapshot ages out. If a guild already has a local member snapshot, routine syncs reuse it and skip another full member crawl until that snapshot ages out.
@ -482,9 +485,9 @@ discrawl subscribe --stale-after 15m https://github.com/example/discord-archive.
discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git
``` ```
Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`). `discrawl update` forces the same pull/import step manually. `discrawl sync` does not auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast. Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`). Imports are planned from crawlkit shard fingerprints, with a Git-object fallback for older manifests, so routine updates normally read only changed tail shards and preserve local FTS rows instead of rebuilding the whole archive. `discrawl update` forces the same pull/import step manually. `discrawl sync` does not auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast.
Hybrid mode is supported too: keep normal Discord credentials configured and set `share.remote`. `discrawl sync --update=auto` and `discrawl messages --sync` import the Git snapshot first, then use live Discord for latest-message deltas. Use `sync --all-channels` or `sync --full` when you intentionally want a broader live repair/backfill pass. Hybrid mode is supported too: keep normal Discord credentials configured and set `share.remote`. `discrawl sync --update=auto` and `discrawl messages --sync` import the Git snapshot first, usually as a changed-shard delta, then use live Discord for latest-message deltas. Use `sync --all-channels` or `sync --full` when you intentionally want a broader live repair/backfill pass.
Git snapshots publish non-DM archive tables by default. Embedding queue state stays local to each machine, and Git-only readers can use FTS immediately without an embedding provider. Git snapshots publish non-DM archive tables by default. Embedding queue state stays local to each machine, and Git-only readers can use FTS immediately without an embedding provider.

View File

@ -7,7 +7,7 @@ By default, `sync` runs both live/local sources and does **not** import the Git
- Discord bot-token sync for bot-visible guild data - Discord bot-token sync for bot-visible guild data
- local Discord Desktop cache import for classifiable cached messages and proven DMs - local Discord Desktop cache import for classifiable cached messages and proven DMs
Use [`update`](update.html) when you want to pull/import the shared Git snapshot. If you intentionally want a sync run to import the snapshot before live deltas, pass `--update=auto` (only when stale) or `--update=force` (always). `--no-update` is accepted as an explicit no-op alias for the default. Use [`update`](update.html) when you want to pull/import the shared Git snapshot. Snapshot imports normally use changed-shard deltas, but unsafe table changes fall back to a full import. If you intentionally want a sync run to import the snapshot before live deltas, pass `--update=auto` (only when stale) or `--update=force` (always). `--no-update` is accepted as an explicit no-op alias for the default.
Run one explicit `--full` pass when you want a complete historical guild archive. Use plain `sync` afterward for frequent latest-message and desktop-cache refreshes. Run one explicit `--full` pass when you want a complete historical guild archive. Use plain `sync` afterward for frequent latest-message and desktop-cache refreshes.
@ -70,6 +70,8 @@ discrawl sync --with-embeddings
- Heartbeat logs (`message sync waiting`) name the oldest active channel and per-channel page activity if in-flight channels stop completing for a while. - Heartbeat logs (`message sync waiting`) name the oldest active channel and per-channel page activity if in-flight channels stop completing for a while.
- Every run ends with a `message sync finished` summary. - Every run ends with a `message sync finished` summary.
- Each channel crawl has a bounded runtime budget; pathological channels are deferred and retried next sync. - Each channel crawl has a bounded runtime budget; pathological channels are deferred and retried next sync.
- Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl.
- Marker cleanup is best-effort, so one missing local sync-state row cannot crash the run.
- Full sync member refresh is best-effort and gives up after five minutes without a caller-supplied deadline. - Full sync member refresh is best-effort and gives up after five minutes without a caller-supplied deadline.
- When the archive is already complete, `sync --full` reuses backlog markers and limits steady-state refresh to live top-level channels plus active threads. - When the archive is already complete, `sync --full` reuses backlog markers and limits steady-state refresh to live top-level channels plus active threads.

View File

@ -2,6 +2,8 @@
Forces a Git snapshot pull and import. Forces a Git snapshot pull and import.
Routine imports are delta-planned from crawlkit shard fingerprints, with a Git-object fallback for older manifests. After a typical publish, only the changed tail shards are imported; unsafe table changes fall back to a full import.
## Usage ## Usage
```bash ```bash
@ -19,7 +21,7 @@ discrawl update --with-embeddings
## When to use it ## When to use it
- you have `share.remote` configured and want a fresh import before running a command that does not auto-update (`sync` does not auto-import unless `--update=auto` is passed) - you have `share.remote` configured and want a fresh shard-delta import before running a command that does not auto-update (`sync` does not auto-import unless `--update=auto` or `--update=force` is passed)
- you set `--no-auto-update` when subscribing and want to refresh on demand - you set `--no-auto-update` when subscribing and want to refresh on demand
- a CI job already imported the latest snapshot but read commands still consider it stale - a CI job already imported the latest snapshot but read commands still consider it stale

View File

@ -35,7 +35,7 @@ discrawl subscribe --stale-after 15m https://github.com/example/discord-archive.
discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git
``` ```
`discrawl update` forces the same pull/import step manually. `discrawl update` forces the same pull/import step manually. Snapshot imports are delta-planned from crawlkit shard fingerprints. Older manifests that lack fingerprints fall back to Git blob identity, so a typical publish only requires importing the changed message tail shard plus small cursor tables. Unsafe table-shape changes still fall back to a full import.
`discrawl sync` does **not** auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast. `discrawl sync` does **not** auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast.
@ -44,7 +44,7 @@ discrawl subscribe --no-auto-update https://github.com/example/discord-archive.g
Keep normal Discord credentials configured **and** set `share.remote`: Keep normal Discord credentials configured **and** set `share.remote`:
```bash ```bash
discrawl sync --update=auto # import snapshot first, then live deltas discrawl sync --update=auto # import snapshot delta first, then live deltas
discrawl messages --sync # blocking pre-query sync for matched scope discrawl messages --sync # blocking pre-query sync for matched scope
discrawl sync --all-channels # broader live repair discrawl sync --all-channels # broader live repair
discrawl sync --full # historical backfill discrawl sync --full # historical backfill

View File

@ -19,7 +19,7 @@ Sync modes control the Discord bot API side of a run. When `wiretap` is selected
| Command | Use when | Behavior | | Command | Use when | Behavior |
| --- | --- | --- | | --- | --- | --- |
| `discrawl sync` | routine refresh | skips member refreshes, checks live top-level channels plus active threads, only fetches new messages for channels with a stored latest cursor | | `discrawl sync` | routine refresh | skips member refreshes, checks live top-level channels plus active threads, only fetches new messages for channels with a stored latest cursor |
| `discrawl sync --update=auto` | hybrid Git/live refresh | imports a stale Git snapshot first, then runs the routine live refresh | | `discrawl sync --update=auto` | hybrid Git/live refresh | imports a stale Git snapshot first, usually as a changed-shard delta, then runs the routine live refresh |
| `discrawl sync --all-channels` | repair pass | broad incremental sweep across every stored channel/thread, including archived threads | | `discrawl sync --all-channels` | repair pass | broad incremental sweep across every stored channel/thread, including archived threads |
| `discrawl sync --full` | historical backfill | crawls older history until channels are complete; can take a long time on large servers | | `discrawl sync --full` | historical backfill | crawls older history until channels are complete; can take a long time on large servers |
@ -43,6 +43,8 @@ Run one explicit `--full` pass when you want a complete historical guild archive
- If in-flight channels stop completing for a while, `discrawl` emits `message sync waiting` heartbeat logs with the oldest active channel, per-channel page activity, and skip/defer counters. - If in-flight channels stop completing for a while, `discrawl` emits `message sync waiting` heartbeat logs with the oldest active channel, per-channel page activity, and skip/defer counters.
- Every run ends with a `message sync finished` summary. - Every run ends with a `message sync finished` summary.
- Each channel crawl has a bounded runtime budget; pathological channels are deferred and retried on the next sync. - Each channel crawl has a bounded runtime budget; pathological channels are deferred and retried on the next sync.
- Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl.
- Marker cleanup is best-effort, so one missing local sync-state row cannot crash the run.
- Full sync member refresh is best-effort and gives up after five minutes without a caller-supplied deadline, so message sync completion is not held hostage by a slow guild member crawl. - Full sync member refresh is best-effort and gives up after five minutes without a caller-supplied deadline, so message sync completion is not held hostage by a slow guild member crawl.
- When the archive is already complete, `sync --full` reuses backlog markers and limits steady-state refresh to live top-level channels plus active threads instead of revisiting every stored archived thread. - When the archive is already complete, `sync --full` reuses backlog markers and limits steady-state refresh to live top-level channels plus active threads instead of revisiting every stored archived thread.
- If a guild already has a local member snapshot, routine syncs reuse it and skip another full member crawl until that snapshot ages out. - If a guild already has a local member snapshot, routine syncs reuse it and skip another full member crawl until that snapshot ages out.

13
go.mod
View File

@ -13,9 +13,8 @@ require (
require ( require (
github.com/charmbracelet/bubbles v1.0.0 // indirect github.com/charmbracelet/bubbles v1.0.0 // indirect
github.com/clipperhouse/displaywidth v0.9.0 // indirect github.com/clipperhouse/displaywidth v0.11.0 // indirect
github.com/clipperhouse/stringish v0.1.1 // indirect github.com/clipperhouse/uax29/v2 v2.7.0 // indirect
github.com/clipperhouse/uax29/v2 v2.5.0 // indirect
github.com/pelletier/go-toml/v2 v2.3.1 // indirect github.com/pelletier/go-toml/v2 v2.3.1 // indirect
modernc.org/sqlite v1.50.0 // indirect modernc.org/sqlite v1.50.0 // indirect
) )
@ -25,7 +24,7 @@ require (
github.com/charmbracelet/bubbletea v1.3.10 // indirect github.com/charmbracelet/bubbletea v1.3.10 // indirect
github.com/charmbracelet/colorprofile v0.4.1 // indirect github.com/charmbracelet/colorprofile v0.4.1 // indirect
github.com/charmbracelet/lipgloss v1.1.0 // indirect github.com/charmbracelet/lipgloss v1.1.0 // indirect
github.com/charmbracelet/x/ansi v0.11.6 // indirect github.com/charmbracelet/x/ansi v0.11.7 // indirect
github.com/charmbracelet/x/cellbuf v0.0.15 // indirect github.com/charmbracelet/x/cellbuf v0.0.15 // indirect
github.com/charmbracelet/x/term v0.2.2 // indirect github.com/charmbracelet/x/term v0.2.2 // indirect
github.com/danieljoos/wincred v1.2.3 // indirect github.com/danieljoos/wincred v1.2.3 // indirect
@ -36,18 +35,18 @@ require (
github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 // indirect github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 // indirect
github.com/google/uuid v1.6.0 // indirect github.com/google/uuid v1.6.0 // indirect
github.com/kr/pretty v0.3.1 // indirect github.com/kr/pretty v0.3.1 // indirect
github.com/lucasb-eyer/go-colorful v1.3.0 // indirect github.com/lucasb-eyer/go-colorful v1.4.0 // indirect
github.com/mattn/go-isatty v0.0.22 // indirect github.com/mattn/go-isatty v0.0.22 // indirect
github.com/mattn/go-localereader v0.0.1 // indirect github.com/mattn/go-localereader v0.0.1 // indirect
github.com/mattn/go-runewidth v0.0.19 // indirect github.com/mattn/go-runewidth v0.0.23 // indirect
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
github.com/muesli/cancelreader v0.2.2 // indirect github.com/muesli/cancelreader v0.2.2 // indirect
github.com/muesli/termenv v0.16.0 // indirect github.com/muesli/termenv v0.16.0 // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/openclaw/crawlkit v0.4.2
github.com/pmezard/go-difflib v1.0.0 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/rivo/uniseg v0.4.7 // indirect github.com/rivo/uniseg v0.4.7 // indirect
github.com/vincentkoc/crawlkit v0.4.1
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
golang.org/x/crypto v0.50.0 // indirect golang.org/x/crypto v0.50.0 // indirect
golang.org/x/tools v0.44.0 // indirect golang.org/x/tools v0.44.0 // indirect

26
go.sum
View File

@ -10,18 +10,16 @@ github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco
github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk= github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk=
github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY= github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30= github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
github.com/charmbracelet/x/ansi v0.11.6 h1:GhV21SiDz/45W9AnV2R61xZMRri5NlLnl6CVF7ihZW8= github.com/charmbracelet/x/ansi v0.11.7 h1:kzv1kJvjg2S3r9KHo8hDdHFQLEqn4RBCb39dAYC84jI=
github.com/charmbracelet/x/ansi v0.11.6/go.mod h1:2JNYLgQUsyqaiLovhU2Rv/pb8r6ydXKS3NIttu3VGZQ= github.com/charmbracelet/x/ansi v0.11.7/go.mod h1:9qGpnAVYz+8ACONkZBUWPtL7lulP9No6p1epAihUZwQ=
github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI= github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI=
github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q= github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q=
github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk= github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI= github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
github.com/clipperhouse/displaywidth v0.9.0 h1:Qb4KOhYwRiN3viMv1v/3cTBlz3AcAZX3+y9OLhMtAtA= github.com/clipperhouse/displaywidth v0.11.0 h1:lBc6kY44VFw+TDx4I8opi/EtL9m20WSEFgwIwO+UVM8=
github.com/clipperhouse/displaywidth v0.9.0/go.mod h1:aCAAqTlh4GIVkhQnJpbL0T/WfcrJXHcj8C0yjYcjOZA= github.com/clipperhouse/displaywidth v0.11.0/go.mod h1:bkrFNkf81G8HyVqmKGxsPufD3JhNl3dSqnGhOoSD/o0=
github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs= github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk=
github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA= github.com/clipperhouse/uax29/v2 v2.7.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
github.com/clipperhouse/uax29/v2 v2.5.0 h1:x7T0T4eTHDONxFJsL94uKNKPHrclyFI0lm7+w94cO8U=
github.com/clipperhouse/uax29/v2 v2.5.0/go.mod h1:Wn1g7MK6OoeDT0vL+Q0SQLDz/KpfsVRgg6W7ihQeh4g=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/danieljoos/wincred v1.2.3 h1:v7dZC2x32Ut3nEfRH+vhoZGvN72+dQ/snVXo/vMFLdQ= github.com/danieljoos/wincred v1.2.3 h1:v7dZC2x32Ut3nEfRH+vhoZGvN72+dQ/snVXo/vMFLdQ=
github.com/danieljoos/wincred v1.2.3/go.mod h1:6qqX0WNrS4RzPZ1tnroDzq9kY3fu1KwE7MRLQK4X0bs= github.com/danieljoos/wincred v1.2.3/go.mod h1:6qqX0WNrS4RzPZ1tnroDzq9kY3fu1KwE7MRLQK4X0bs=
@ -49,14 +47,14 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag= github.com/lucasb-eyer/go-colorful v1.4.0 h1:UtrWVfLdarDgc44HcS7pYloGHJUjHV/4FwW4TvVgFr4=
github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= github.com/lucasb-eyer/go-colorful v1.4.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/mattn/go-isatty v0.0.22 h1:j8l17JJ9i6VGPUFUYoTUKPSgKe/83EYU2zBC7YNKMw4= github.com/mattn/go-isatty v0.0.22 h1:j8l17JJ9i6VGPUFUYoTUKPSgKe/83EYU2zBC7YNKMw4=
github.com/mattn/go-isatty v0.0.22/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= github.com/mattn/go-isatty v0.0.22/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4=
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4= github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88= github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw= github.com/mattn/go-runewidth v0.0.23 h1:7ykA0T0jkPpzSvMS5i9uoNn2Xy3R383f9HDx3RybWcw=
github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/mattn/go-runewidth v0.0.23/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo= github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA= github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
@ -65,6 +63,8 @@ github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc
github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk= github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w= github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls= github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/openclaw/crawlkit v0.4.2 h1:Lzzkd2/xSkQk+7KyboMEw+ZS2wmlYvDFLwAB2Z/FwBs=
github.com/openclaw/crawlkit v0.4.2/go.mod h1:/AI8o/DeRqXPZJPHq/9mGUjNzLPskm/wTjikRPxEdHY=
github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc= github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc=
github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA= github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
@ -80,8 +80,6 @@ github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/vincentkoc/crawlkit v0.4.1 h1:qDUF+Kk7nqADmpGMcnWTHEQMiX3bSD2DdFywKyT3kWs=
github.com/vincentkoc/crawlkit v0.4.1/go.mod h1:/ioLA/tyZ/927kAOGg0M8Mrqk7pnTZLpCKWfpul9zoE=
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM= github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
github.com/zalando/go-keyring v0.2.8 h1:6sD/Ucpl7jNq10rM2pgqTs0sZ9V3qMrqfIIy5YPccHs= github.com/zalando/go-keyring v0.2.8 h1:6sD/Ucpl7jNq10rM2pgqTs0sZ9V3qMrqfIIy5YPccHs=

View File

@ -8,9 +8,9 @@ import (
"os" "os"
"time" "time"
"github.com/openclaw/crawlkit/control"
"github.com/openclaw/discrawl/internal/config" "github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/store" "github.com/openclaw/discrawl/internal/store"
"github.com/vincentkoc/crawlkit/control"
) )
func (r *runtime) runMetadata(args []string) error { func (r *runtime) runMetadata(args []string) error {

View File

@ -7,7 +7,7 @@ import (
"fmt" "fmt"
"strings" "strings"
"github.com/vincentkoc/crawlkit/tui" "github.com/openclaw/crawlkit/tui"
"github.com/openclaw/discrawl/internal/store" "github.com/openclaw/discrawl/internal/store"
) )

View File

@ -9,7 +9,7 @@ import (
"strings" "strings"
"time" "time"
crawlconfig "github.com/vincentkoc/crawlkit/config" crawlconfig "github.com/openclaw/crawlkit/config"
) )
const ( const (

View File

@ -8,23 +8,26 @@ import (
"encoding/json" "encoding/json"
"errors" "errors"
"fmt" "fmt"
"hash/fnv"
"io" "io"
"os" "os"
"os/exec" "os/exec"
"path/filepath" "path/filepath"
"sort"
"strconv" "strconv"
"strings" "strings"
"time" "time"
"github.com/openclaw/crawlkit/mirror"
"github.com/openclaw/crawlkit/snapshot"
"github.com/openclaw/discrawl/internal/store" "github.com/openclaw/discrawl/internal/store"
"github.com/vincentkoc/crawlkit/mirror"
"github.com/vincentkoc/crawlkit/snapshot"
) )
const ( const (
ManifestName = "manifest.json" ManifestName = "manifest.json"
LastImportSyncScope = "share:last_import_at" LastImportSyncScope = "share:last_import_at"
LastImportManifestSyncScope = "share:last_import_manifest_generated_at" LastImportManifestSyncScope = "share:last_import_manifest_generated_at"
LastImportManifestJSONScope = "share:last_import_manifest_json"
directMessageGuildID = "@me" directMessageGuildID = "@me"
) )
@ -159,6 +162,7 @@ func Import(ctx context.Context, s *store.Store, opts Options) (Manifest, error)
if err != nil { if err != nil {
return Manifest{}, err return Manifest{}, err
} }
manifest = enrichManifestFromGit(ctx, opts.RepoPath, "HEAD", manifest)
opts.reportProgress(ImportProgress{Phase: "start", TotalRows: manifestRowCount(manifest)}) opts.reportProgress(ImportProgress{Phase: "start", TotalRows: manifestRowCount(manifest)})
restorePragmas, err := applyImportPragmas(ctx, s.DB()) restorePragmas, err := applyImportPragmas(ctx, s.DB())
if err != nil { if err != nil {
@ -262,6 +266,7 @@ func ImportIfChanged(ctx context.Context, s *store.Store, opts Options) (Manifes
if err != nil { if err != nil {
return Manifest{}, false, err return Manifest{}, false, err
} }
manifest = enrichManifestFromGit(ctx, opts.RepoPath, "HEAD", manifest)
if ManifestAlreadyImported(ctx, s, manifest) { if ManifestAlreadyImported(ctx, s, manifest) {
if opts.IncludeEmbeddings { if opts.IncludeEmbeddings {
if err := ImportEmbeddings(ctx, s, opts, manifest); err != nil { if err := ImportEmbeddings(ctx, s, opts, manifest); err != nil {
@ -273,6 +278,12 @@ func ImportIfChanged(ctx context.Context, s *store.Store, opts Options) (Manifes
} }
return manifest, false, nil return manifest, false, nil
} }
if previous, ok := PreviousImportedManifest(ctx, s, opts); ok {
imported, changed, err := ImportIncremental(ctx, s, opts, previous, manifest)
if err == nil || !errors.Is(err, errIncrementalUnsupported) {
return imported, changed, err
}
}
imported, err := Import(ctx, s, opts) imported, err := Import(ctx, s, opts)
if err != nil { if err != nil {
return Manifest{}, false, err return Manifest{}, false, err
@ -280,6 +291,81 @@ func ImportIfChanged(ctx context.Context, s *store.Store, opts Options) (Manifes
return imported, true, nil return imported, true, nil
} }
// errIncrementalUnsupported is returned by ImportIncremental when the computed
// plan cannot be applied as a delta. ImportIfChanged detects it with errors.Is
// and falls back to a full Import.
var errIncrementalUnsupported = errors.New("incremental share import unsupported")
// ImportIncremental applies the difference between the previously imported
// manifest and the current one instead of re-importing every table.
//
// It returns errIncrementalUnsupported when shareIncrementalPlan rejects the
// plan. When the plan reports no changes, the manifest is only recorded as
// imported and the returned bool is false. Otherwise the plan is executed via
// snapshot.ImportIncremental, the manifest is recorded, the import pragmas are
// restored, and the returned bool is true.
func ImportIncremental(ctx context.Context, s *store.Store, opts Options, previous, manifest Manifest) (Manifest, bool, error) {
plan := snapshot.PlanIncrementalImport(snapshotManifest(previous), snapshotManifest(manifest))
// Narrow the generic plan to the table/mode combinations this package accepts.
plan, supported := shareIncrementalPlan(plan)
if !supported {
return Manifest{}, false, errIncrementalUnsupported
}
if !plan.Changed() {
// Nothing to import: just remember that this manifest has been seen.
if err := MarkImported(ctx, s, manifest); err != nil {
return Manifest{}, false, err
}
return manifest, false, nil
}
opts.reportProgress(ImportProgress{Phase: "start", TotalRows: manifestRowCount(manifest)})
restorePragmas, err := applyImportPragmas(ctx, s.DB())
if err != nil {
return Manifest{}, false, err
}
// The deferred restore only runs on early-return paths; the success path
// restores explicitly below so that a restore failure can be reported.
pragmasRestored := false
defer func() {
if !pragmasRestored {
_ = restorePragmas(ctx)
}
}()
if _, _, err := snapshot.ImportIncremental(ctx, snapshot.IncrementalImportOptions{
DB: s.DB(),
RootDir: opts.RepoPath,
Current: snapshotManifest(manifest),
Plan: plan,
// Forward snapshot progress events to the caller's callback field by field.
Progress: func(progress snapshot.ImportProgress) {
opts.reportProgress(ImportProgress{
Phase: progress.Phase,
Table: progress.Table,
File: progress.File,
FileIndex: progress.FileIndex,
FileCount: progress.FileCount,
Rows: progress.Rows,
TotalRows: progress.TotalRows,
})
},
// Filter reports false for rows that isDirectMessageSnapshotRow flags.
Filter: func(table string, row map[string]any) (bool, error) {
return !isDirectMessageSnapshotRow(table, row), nil
},
// Tables are cleared with the statement built by snapshotDeleteQuery
// rather than an unconditional delete.
DeleteTable: func(ctx context.Context, tx *sql.Tx, table string) error {
query, args := snapshotDeleteQuery(table)
if _, err := tx.ExecContext(ctx, query, args...); err != nil {
return fmt.Errorf("clear %s: %w", table, err)
}
return nil
},
ImportRow: importIncrementalSnapshotRow,
// After the rows are in, repair guild IDs and, when requested, import
// embeddings using the same transaction.
AfterImport: func(ctx context.Context, tx *sql.Tx) error {
if err := repairImportedGuildIDs(ctx, tx); err != nil {
return err
}
if opts.IncludeEmbeddings {
return importEmbeddings(ctx, tx, opts, manifest.Embeddings)
}
return nil
},
}); err != nil {
return Manifest{}, false, err
}
if err := MarkImported(ctx, s, manifest); err != nil {
return Manifest{}, false, err
}
if err := restorePragmas(ctx); err != nil {
return Manifest{}, false, err
}
pragmasRestored = true
opts.reportProgress(ImportProgress{Phase: "done", TotalRows: manifestRowCount(manifest)})
return manifest, true, nil
}
func (opts Options) reportProgress(progress ImportProgress) { func (opts Options) reportProgress(progress ImportProgress) {
if opts.Progress != nil { if opts.Progress != nil {
opts.Progress(progress) opts.Progress(progress)
@ -340,7 +426,173 @@ func MarkImported(ctx context.Context, s *store.Store, manifest Manifest) error
if manifest.GeneratedAt.IsZero() { if manifest.GeneratedAt.IsZero() {
return nil return nil
} }
return s.SetSyncState(ctx, LastImportManifestSyncScope, manifest.GeneratedAt.Format(time.RFC3339Nano)) if err := s.SetSyncState(ctx, LastImportManifestSyncScope, manifest.GeneratedAt.Format(time.RFC3339Nano)); err != nil {
return err
}
body, err := json.Marshal(manifest)
if err != nil {
return fmt.Errorf("marshal imported manifest state: %w", err)
}
return s.SetSyncState(ctx, LastImportManifestJSONScope, string(body))
}
// PreviousImportedManifest recovers the manifest recorded by the last import.
//
// The JSON copy stored under LastImportManifestJSONScope is preferred. When it
// is missing, blank, undecodable, or carries a zero GeneratedAt, the timestamp
// stored under LastImportManifestSyncScope is parsed and the matching manifest
// is looked up in the git history of opts.RepoPath. The boolean result is
// false when neither source yields a manifest.
func PreviousImportedManifest(ctx context.Context, s *store.Store, opts Options) (Manifest, bool) {
	if raw, err := s.GetSyncState(ctx, LastImportManifestJSONScope); err == nil && strings.TrimSpace(raw) != "" {
		var stored Manifest
		decodeErr := json.Unmarshal([]byte(raw), &stored)
		if decodeErr == nil && !stored.GeneratedAt.IsZero() {
			return stored, true
		}
	}

	// Fall back to the timestamp-only state and search git history for it.
	stamp, err := s.GetSyncState(ctx, LastImportManifestSyncScope)
	switch {
	case err != nil, strings.TrimSpace(stamp) == "":
		return Manifest{}, false
	}
	when, err := time.Parse(time.RFC3339Nano, stamp)
	if err != nil {
		return Manifest{}, false
	}
	recovered, err := manifestFromGitHistory(ctx, opts.RepoPath, when)
	if err != nil {
		return Manifest{}, false
	}
	return recovered, true
}
// manifestFromGitHistory walks up to 500 commits that touched ManifestName and
// returns the first committed manifest whose GeneratedAt equals generatedAt,
// enriched with file information from that same commit. Commits whose manifest
// cannot be read or decoded are skipped. An error is returned when the git log
// fails or no commit matches.
func manifestFromGitHistory(ctx context.Context, repoPath string, generatedAt time.Time) (Manifest, error) {
	revList, err := output(ctx, repoPath, "git", "log", "--format=%H", "--max-count=500", "--", ManifestName)
	if err != nil {
		return Manifest{}, err
	}
	for _, commit := range strings.Fields(revList) {
		raw, showErr := output(ctx, repoPath, "git", "show", commit+":"+ManifestName)
		if showErr != nil {
			continue
		}
		var candidate Manifest
		if json.Unmarshal([]byte(raw), &candidate) != nil || !candidate.GeneratedAt.Equal(generatedAt) {
			continue
		}
		return enrichManifestFromGit(ctx, repoPath, commit, candidate), nil
	}
	return Manifest{}, fmt.Errorf("imported manifest %s not found in git history", generatedAt.Format(time.RFC3339Nano))
}
// enrichManifestFromGit fills in missing per-file manifests for tables by
// reading blob sizes and object IDs from the git tree at rev.
//
// The manifest is returned untouched when repoPath is blank, when every table
// that lists files already has file manifests, or when the tree cannot be
// listed. For a table with a listed file that is absent from the tree, the
// file manifests are left nil. Row counts are only known for single-file
// tables; multi-file tables get zero rows per file. The synthetic SHA256 value
// is "git:" followed by the blob object ID.
//
// Note that table entries are updated through the Tables slice, which the
// caller's manifest value shares.
func enrichManifestFromGit(ctx context.Context, repoPath, rev string, manifest Manifest) Manifest {
	if strings.TrimSpace(repoPath) == "" || manifestHasFileManifests(manifest) {
		return manifest
	}
	tree, err := gitTreeFiles(ctx, repoPath, rev)
	if err != nil {
		return manifest
	}
	for idx := range manifest.Tables {
		entry := &manifest.Tables[idx]
		if len(entry.FileManifests) != 0 {
			continue
		}
		names := entry.Files
		if len(names) == 0 && strings.TrimSpace(entry.File) != "" {
			names = []string{entry.File}
		}
		singleFile := len(names) == 1
		inferred := make([]snapshot.FileManifest, 0, len(names))
		for _, name := range names {
			blob, found := tree[name]
			if !found {
				// A partial file list would be misleading; drop it entirely.
				inferred = nil
				break
			}
			fileManifest := snapshot.FileManifest{
				Path:   name,
				Size:   blob.size,
				SHA256: "git:" + blob.object,
			}
			if singleFile {
				fileManifest.Rows = entry.Rows
			}
			inferred = append(inferred, fileManifest)
		}
		entry.FileManifests = inferred
	}
	return manifest
}
// manifestHasFileManifests reports whether every table that names at least one
// file (via Files or File) also carries file manifests. A manifest with no
// tables trivially satisfies this.
func manifestHasFileManifests(manifest Manifest) bool {
	for i := range manifest.Tables {
		entry := &manifest.Tables[i]
		listsFiles := len(entry.Files) > 0 || strings.TrimSpace(entry.File) != ""
		if listsFiles && len(entry.FileManifests) == 0 {
			return false
		}
	}
	return true
}
// gitTreeFile describes one entry parsed from `git ls-tree -r -l` output.
type gitTreeFile struct {
// object is the object ID column of the entry.
object string
// size is the size column of the entry; it stays zero when the column
// cannot be parsed as an integer.
size int64
}
// gitTreeFiles lists the entries below "tables" in the git tree at rev (HEAD
// when rev is blank) and returns them keyed by repository-relative path.
//
// Each `git ls-tree -r -l` record has the form
// "<mode> <type> <object> <size>\t<path>". The path is taken verbatim from
// after the tab, so file names containing spaces are kept whole instead of
// being cut at the first space. Records without a tab, with an empty path, or
// with fewer than four metadata columns are ignored.
//
// NOTE(review): git may quote paths containing unusual bytes unless -z is
// used; such quoted paths are stored as printed — confirm whether that matters
// for the snapshot file names in use.
func gitTreeFiles(ctx context.Context, repoPath, rev string) (map[string]gitTreeFile, error) {
	if strings.TrimSpace(rev) == "" {
		rev = "HEAD"
	}
	out, err := output(ctx, repoPath, "git", "ls-tree", "-r", "-l", rev, "--", "tables")
	if err != nil {
		return nil, err
	}
	files := map[string]gitTreeFile{}
	for _, line := range strings.Split(out, "\n") {
		meta, path, ok := strings.Cut(line, "\t")
		if !ok {
			continue
		}
		// Tolerate CRLF line endings without trimming meaningful spaces.
		path = strings.TrimSuffix(path, "\r")
		if path == "" {
			continue
		}
		fields := strings.Fields(meta)
		if len(fields) < 4 {
			continue
		}
		// The size column is "-" for non-blob entries; those are recorded
		// with a zero size rather than rejected.
		size, parseErr := strconv.ParseInt(fields[3], 10, 64)
		if parseErr != nil {
			size = 0
		}
		files[path] = gitTreeFile{object: fields[2], size: size}
	}
	return files, nil
}
// snapshotManifest converts a share Manifest into the snapshot package's
// manifest type by copying the version, generation time, tables, and files.
// Other Manifest fields are not carried over.
func snapshotManifest(manifest Manifest) snapshot.Manifest {
	var converted snapshot.Manifest
	converted.Version = manifest.Version
	converted.GeneratedAt = manifest.GeneratedAt
	converted.Tables = manifest.Tables
	converted.Files = manifest.Files
	return converted
}
func shareIncrementalPlan(plan snapshot.ImportPlan) (snapshot.ImportPlan, bool) {
if plan.Full {
return plan, false
}
out := snapshot.ImportPlan{Tables: make([]snapshot.TableImportPlan, 0, len(plan.Tables))}
for _, tablePlan := range plan.Tables {
switch tablePlan.Mode {
case snapshot.TableImportSkip:
out.Tables = append(out.Tables, tablePlan)
case snapshot.TableImportFiles:
switch tablePlan.Table.Name {
case "messages":
out.Tables = append(out.Tables, tablePlan)
case "sync_state":
tablePlan.Mode = snapshot.TableImportReplace
tablePlan.Files = nil
tablePlan.Reason = "replace sync_state to avoid stale cursors"
out.Tables = append(out.Tables, tablePlan)
default:
return plan, false
}
case snapshot.TableImportReplace:
if tablePlan.Table.Name != "sync_state" {
return plan, false
}
out.Tables = append(out.Tables, tablePlan)
default:
return plan, false
}
}
return out, true
} }
func ReadManifest(repoPath string) (Manifest, error) { func ReadManifest(repoPath string) (Manifest, error) {
@ -874,6 +1126,112 @@ func importValue(value any) any {
} }
} }
// importIncrementalSnapshotRow writes one snapshot row with insert-or-replace
// semantics and, for the "messages" table, refreshes the matching message_fts
// entry.
//
// For "message_events" and "mention_events" the "event_id" key is removed from
// row before inserting; this modifies the caller's map. Message rows whose
// "id" has no string value skip the FTS refresh.
func importIncrementalSnapshotRow(ctx context.Context, tx *sql.Tx, table string, row map[string]any) error {
	switch table {
	case "message_events", "mention_events":
		delete(row, "event_id")
	}
	if err := insertOrReplaceSnapshotRow(ctx, tx, table, row); err != nil {
		return err
	}
	if table == "messages" {
		if id := stringValue(row["id"]); id != "" {
			return upsertMessageFTSRow(ctx, tx, id)
		}
	}
	return nil
}
// insertOrReplaceSnapshotRow issues an "insert or replace" for row into table.
// Column names are sorted so the generated statement is deterministic, each
// identifier goes through quoteIdent, and each value through importValue.
// Execution errors are wrapped as "insert <table>: ...".
func insertOrReplaceSnapshotRow(ctx context.Context, tx *sql.Tx, table string, row map[string]any) error {
	names := make([]string, 0, len(row))
	for name := range row {
		names = append(names, name)
	}
	sort.Strings(names)

	idents := make([]string, len(names))
	marks := make([]string, len(names))
	values := make([]any, len(names))
	for i, name := range names {
		idents[i] = quoteIdent(name)
		marks[i] = "?"
		values[i] = importValue(row[name])
	}

	query := fmt.Sprintf(
		"insert or replace into %s(%s) values(%s)",
		quoteIdent(table),
		strings.Join(idents, ","),
		strings.Join(marks, ","),
	)
	if _, err := tx.ExecContext(ctx, query, values...); err != nil {
		return fmt.Errorf("insert %s: %w", table, err)
	}
	return nil
}
// upsertMessageFTSRow rebuilds the message_fts entry for a single message
// inside tx. It does nothing when messageFTSRowID yields no row ID.
//
// The existing FTS row is deleted first and then re-created from the current
// contents of the messages table (left-joined with channels for the channel
// name). If the message cannot be read back, the error is returned after the
// delete has already been issued in tx.
func upsertMessageFTSRow(ctx context.Context, tx *sql.Tx, messageID string) error {
rowID, ok := messageFTSRowID(messageID)
if !ok {
return nil
}
// Remove any previous entry stored under the same rowid.
if _, err := tx.ExecContext(ctx, `delete from message_fts where rowid = ?`, rowID); err != nil {
return fmt.Errorf("delete message_fts %s: %w", messageID, err)
}
var (
guildID string
channelID string
authorID string
authorName string
channelName string
content string
)
// The author name is the first non-null of member.nick, author.global_name
// and author.username from raw_json, falling back to the empty string.
if err := tx.QueryRowContext(ctx, `
select
m.guild_id,
m.channel_id,
coalesce(m.author_id, ''),
coalesce(
json_extract(m.raw_json, '$.member.nick'),
json_extract(m.raw_json, '$.author.global_name'),
json_extract(m.raw_json, '$.author.username'),
''
),
coalesce(c.name, ''),
m.normalized_content
from messages m
left join channels c on c.id = m.channel_id
where m.id = ?
`, messageID).Scan(&guildID, &channelID, &authorID, &authorName, &channelName, &content); err != nil {
return fmt.Errorf("query message_fts %s: %w", messageID, err)
}
// An empty author ID is written as NULL via nullIfEmpty.
if _, err := tx.ExecContext(ctx, `
insert into message_fts(rowid, message_id, guild_id, channel_id, author_id, author_name, channel_name, content)
values(?, ?, ?, ?, ?, ?, ?, ?)
`, rowID, messageID, guildID, channelID, nullIfEmpty(authorID), authorName, channelName, content); err != nil {
return fmt.Errorf("insert message_fts %s: %w", messageID, err)
}
return nil
}
// messageFTSRowID maps a message ID to a positive rowid.
//
// An empty ID yields (0, false). An ID that parses as a positive base-10
// int64 is used directly. Anything else is hashed with 64-bit FNV-1a, masked
// to 63 bits so the result is non-negative, and bumped to 1 if the masked hash
// happens to be zero.
func messageFTSRowID(messageID string) (int64, bool) {
	if messageID == "" {
		return 0, false
	}
	if numeric, err := strconv.ParseInt(messageID, 10, 64); err == nil && numeric > 0 {
		return numeric, true
	}

	const positiveMask = uint64(1)<<63 - 1
	digest := fnv.New64a()
	// hash.Hash documents that Write never returns an error.
	_, _ = digest.Write([]byte(messageID))
	hashed := int64(digest.Sum64() & positiveMask)
	if hashed == 0 {
		hashed = 1
	}
	return hashed, true
}
// nullIfEmpty returns value unchanged when it is non-empty and an untyped nil
// otherwise, so an empty string can be passed to a SQL statement as NULL.
func nullIfEmpty(value string) any {
	if value != "" {
		return value
	}
	return nil
}
func stringValue(value any) string { func stringValue(value any) string {
switch v := value.(type) { switch v := value.(type) {
case string: case string:

View File

@ -14,6 +14,8 @@ import (
"testing" "testing"
"time" "time"
"github.com/openclaw/crawlkit/mirror"
"github.com/openclaw/crawlkit/snapshot"
"github.com/stretchr/testify/require" "github.com/stretchr/testify/require"
"github.com/openclaw/discrawl/internal/store" "github.com/openclaw/discrawl/internal/store"
@ -73,6 +75,127 @@ func TestExportImportRoundTrip(t *testing.T) {
require.Equal(t, manifest.GeneratedAt, imported.GeneratedAt) require.Equal(t, manifest.GeneratedAt, imported.GeneratedAt)
} }
// TestImportIfChangedUsesIncrementalTailImport exports a seeded store, imports
// it into a fresh store, appends one message to the source, re-exports, and
// checks that the second ImportIfChanged reports a change without emitting a
// "rebuild_fts" progress phase while the new message is still searchable.
func TestImportIfChangedUsesIncrementalTailImport(t *testing.T) {
ctx := context.Background()
src := seedStore(t, filepath.Join(t.TempDir(), "src.db"))
defer func() { _ = src.Close() }()
repo := filepath.Join(t.TempDir(), "share")
manifest, err := Export(ctx, src, Options{RepoPath: repo, Branch: "main"})
require.NoError(t, err)
// The exported manifest must already describe individual files.
require.NotEmpty(t, tableEntry(t, manifest, "messages").FileManifests)
dst, err := store.Open(ctx, filepath.Join(t.TempDir(), "dst.db"))
require.NoError(t, err)
defer func() { _ = dst.Close() }()
// First import into the empty destination.
_, changed, err := ImportIfChanged(ctx, dst, Options{RepoPath: repo, Branch: "main"})
require.NoError(t, err)
require.True(t, changed)
// Add a single new message to the source and export again.
now := time.Now().UTC().Format(time.RFC3339Nano)
require.NoError(t, src.UpsertMessages(ctx, []store.MessageMutation{{
Record: store.MessageRecord{
ID: "m2",
GuildID: "g1",
ChannelID: "c1",
ChannelName: "general",
AuthorID: "u1",
AuthorName: "Peter",
MessageType: 0,
CreatedAt: now,
Content: "delta landed fast",
NormalizedContent: "delta landed fast",
RawJSON: `{"author":{"username":"Peter"}}`,
},
}}))
updated, err := Export(ctx, src, Options{RepoPath: repo, Branch: "main"})
require.NoError(t, err)
require.NotEqual(t, manifest.GeneratedAt, updated.GeneratedAt)
// Second import: record progress phases to inspect which path ran.
var progress []ImportProgress
imported, changed, err := ImportIfChanged(ctx, dst, Options{
RepoPath: repo,
Branch: "main",
Progress: func(p ImportProgress) { progress = append(progress, p) },
})
require.NoError(t, err)
require.True(t, changed)
require.Equal(t, updated.GeneratedAt, imported.GeneratedAt)
require.Contains(t, progressPhases(progress), "table_start")
require.NotContains(t, progressPhases(progress), "rebuild_fts")
// The new message must be findable through search.
results, err := dst.SearchMessages(ctx, store.SearchOptions{Query: "delta landed", Limit: 10})
require.NoError(t, err)
require.Len(t, results, 1)
require.Equal(t, "m2", results[0].MessageID)
// The stored manifest JSON must include the file manifests.
state, err := dst.GetSyncState(ctx, LastImportManifestJSONScope)
require.NoError(t, err)
require.Contains(t, state, `"file_manifests"`)
}
// TestImportIfChangedInfersLegacyManifestFilesFromGit commits manifests that
// have had their file manifests stripped and checks that a follow-up import
// is still planned as a supported, changed incremental plan, emits no
// "rebuild_fts" progress phase, and leaves the new message searchable.
func TestImportIfChangedInfersLegacyManifestFilesFromGit(t *testing.T) {
ctx := context.Background()
src := seedStore(t, filepath.Join(t.TempDir(), "src.db"))
defer func() { _ = src.Close() }()
repo := filepath.Join(t.TempDir(), "share")
require.NoError(t, exec.CommandContext(ctx, "git", "init", repo).Run())
configureGitUser(t, repo)
manifest, err := Export(ctx, src, Options{RepoPath: repo, Branch: "main"})
require.NoError(t, err)
// Overwrite the manifest with a copy that has no file manifests, then commit.
writeShareManifest(t, repo, stripFileManifests(manifest))
require.NoError(t, exec.CommandContext(ctx, "git", "-C", repo, "add", ".").Run())
require.NoError(t, exec.CommandContext(ctx, "git", "-C", repo, "commit", "-m", "initial snapshot").Run())
dst, err := store.Open(ctx, filepath.Join(t.TempDir(), "dst.db"))
require.NoError(t, err)
defer func() { _ = dst.Close() }()
// First import into the empty destination.
_, changed, err := ImportIfChanged(ctx, dst, Options{RepoPath: repo, Branch: "main"})
require.NoError(t, err)
require.True(t, changed)
// Add a single new message to the source, export, strip, and commit again.
now := time.Now().UTC().Format(time.RFC3339Nano)
require.NoError(t, src.UpsertMessages(ctx, []store.MessageMutation{{
Record: store.MessageRecord{
ID: "m2",
GuildID: "g1",
ChannelID: "c1",
ChannelName: "general",
AuthorID: "u1",
AuthorName: "Peter",
MessageType: 0,
CreatedAt: now,
Content: "legacy git delta",
NormalizedContent: "legacy git delta",
RawJSON: `{"author":{"username":"Peter"}}`,
},
}}))
updated, err := Export(ctx, src, Options{RepoPath: repo, Branch: "main"})
require.NoError(t, err)
writeShareManifest(t, repo, stripFileManifests(updated))
require.NoError(t, exec.CommandContext(ctx, "git", "-C", repo, "add", ".").Run())
require.NoError(t, exec.CommandContext(ctx, "git", "-C", repo, "commit", "-m", "tail snapshot").Run())
// The previous manifest must be recoverable, and the plan built from the
// git-enriched manifests must be both supported and non-empty.
previous, ok := PreviousImportedManifest(ctx, dst, Options{RepoPath: repo, Branch: "main"})
require.True(t, ok)
planned, supported := shareIncrementalPlan(snapshot.PlanIncrementalImport(snapshotManifest(previous), snapshotManifest(enrichManifestFromGit(ctx, repo, "HEAD", stripFileManifests(updated)))))
require.True(t, supported, "%+v", planned)
require.True(t, planned.Changed(), "%+v", planned)
// Second import: record progress phases to inspect which path ran.
var progress []ImportProgress
_, changed, err = ImportIfChanged(ctx, dst, Options{
RepoPath: repo,
Branch: "main",
Progress: func(p ImportProgress) { progress = append(progress, p) },
})
require.NoError(t, err)
require.True(t, changed)
require.NotContains(t, progressPhases(progress), "rebuild_fts")
results, err := dst.SearchMessages(ctx, store.SearchOptions{Query: "legacy git delta", Limit: 10})
require.NoError(t, err)
require.Len(t, results, 1)
}
func TestApplyImportPragmasKeepCrashRecoveryEnabled(t *testing.T) { func TestApplyImportPragmasKeepCrashRecoveryEnabled(t *testing.T) {
ctx := context.Background() ctx := context.Background()
s := seedStore(t, filepath.Join(t.TempDir(), "dst.db")) s := seedStore(t, filepath.Join(t.TempDir(), "dst.db"))
@ -652,6 +775,10 @@ func TestShareSmallHelpersAndValidation(t *testing.T) {
require.Equal(t, `insert into "messages"("id","weird""column") values(?,?)`, insertSQL("messages", []string{"id", `weird"column`})) require.Equal(t, `insert into "messages"("id","weird""column") values(?,?)`, insertSQL("messages", []string{"id", `weird"column`}))
require.Equal(t, "blob", exportValue([]byte("blob"))) require.Equal(t, "blob", exportValue([]byte("blob")))
require.Equal(t, "plain", exportValue("plain")) require.Equal(t, "plain", exportValue("plain"))
require.Equal(t, int64(42), importValue(json.Number("42")))
require.Equal(t, 3.5, importValue(json.Number("3.5")))
require.Equal(t, "nope", importValue(json.Number("nope")))
require.Equal(t, "plain", importValue("plain"))
require.Equal(t, "plain", stringValue("plain")) require.Equal(t, "plain", stringValue("plain"))
require.Equal(t, "42", stringValue(json.Number("42"))) require.Equal(t, "42", stringValue(json.Number("42")))
require.Empty(t, stringValue(42)) require.Empty(t, stringValue(42))
@ -662,6 +789,9 @@ func TestShareSmallHelpersAndValidation(t *testing.T) {
query, args := snapshotExportQuery("messages") query, args := snapshotExportQuery("messages")
require.Equal(t, "select * from messages where guild_id != ?", query) require.Equal(t, "select * from messages where guild_id != ?", query)
require.Equal(t, []any{directMessageGuildID}, args) require.Equal(t, []any{directMessageGuildID}, args)
query, args = snapshotExportQuery("guilds")
require.Equal(t, "select * from guilds where id != ?", query)
require.Equal(t, []any{directMessageGuildID}, args)
query, args = snapshotExportQuery("sync_state") query, args = snapshotExportQuery("sync_state")
require.Equal(t, "select * from sync_state where scope not like 'wiretap:%'", query) require.Equal(t, "select * from sync_state where scope not like 'wiretap:%'", query)
require.Nil(t, args) require.Nil(t, args)
@ -672,9 +802,15 @@ func TestShareSmallHelpersAndValidation(t *testing.T) {
query, args = snapshotDeleteQuery("channels") query, args = snapshotDeleteQuery("channels")
require.Equal(t, "delete from channels where guild_id != ?", query) require.Equal(t, "delete from channels where guild_id != ?", query)
require.Equal(t, []any{directMessageGuildID}, args) require.Equal(t, []any{directMessageGuildID}, args)
query, args = snapshotDeleteQuery("guilds")
require.Equal(t, "delete from guilds where id != ?", query)
require.Equal(t, []any{directMessageGuildID}, args)
query, args = snapshotDeleteQuery("message_events") query, args = snapshotDeleteQuery("message_events")
require.Equal(t, "delete from message_events where guild_id != ?", query) require.Equal(t, "delete from message_events where guild_id != ?", query)
require.Equal(t, []any{directMessageGuildID}, args) require.Equal(t, []any{directMessageGuildID}, args)
query, args = snapshotDeleteQuery("sync_state")
require.Equal(t, "delete from sync_state where scope not like 'wiretap:%'", query)
require.Nil(t, args)
query, args = snapshotDeleteQuery("custom") query, args = snapshotDeleteQuery("custom")
require.Equal(t, "delete from custom", query) require.Equal(t, "delete from custom", query)
require.Nil(t, args) require.Nil(t, args)
@ -684,6 +820,20 @@ func TestShareSmallHelpersAndValidation(t *testing.T) {
require.True(t, isDirectMessageSnapshotRow("sync_state", map[string]any{"scope": "wiretap:last_import"})) require.True(t, isDirectMessageSnapshotRow("sync_state", map[string]any{"scope": "wiretap:last_import"}))
require.False(t, isDirectMessageSnapshotRow("sync_state", map[string]any{"scope": "share:last_import"})) require.False(t, isDirectMessageSnapshotRow("sync_state", map[string]any{"scope": "share:last_import"}))
require.False(t, isDirectMessageSnapshotRow("custom", map[string]any{"guild_id": directMessageGuildID})) require.False(t, isDirectMessageSnapshotRow("custom", map[string]any{"guild_id": directMessageGuildID}))
require.True(t, isLocalOnlyGuildID(directMessageGuildID))
require.False(t, isLocalOnlyGuildID("g1"))
require.Equal(t, []string{"message_id", "guild_id"}, importColumns(TableManifest{Name: "message_events", Columns: []string{"event_id", "message_id", "guild_id"}}))
require.Equal(t, []string{"event_id", "message_id"}, importColumns(TableManifest{Name: "messages", Columns: []string{"event_id", "message_id"}}))
require.Equal(t, 7, manifestRowCount(Manifest{
Tables: []TableManifest{{Rows: 2}, {Rows: 3}},
Embeddings: []EmbeddingManifest{{Rows: 2}},
}))
var seen []ImportProgress
Options{Progress: func(progress ImportProgress) { seen = append(seen, progress) }}.reportProgress(ImportProgress{Phase: "phase"})
require.Equal(t, []ImportProgress{{Phase: "phase"}}, seen)
Options{}.reportProgress(ImportProgress{Phase: "ignored"})
require.Equal(t, mirror.Options{RepoPath: "repo", Remote: "origin", Branch: "main"}, mirrorOptions(Options{RepoPath: "repo", Remote: "origin", Branch: "main"}))
var buf bytes.Buffer var buf bytes.Buffer
cw := &countingWriter{w: &buf} cw := &countingWriter{w: &buf}
@ -853,6 +1003,13 @@ func writeShareManifest(t *testing.T, repo string, manifest Manifest) {
require.NoError(t, os.WriteFile(filepath.Join(repo, ManifestName), append(body, '\n'), 0o600)) require.NoError(t, os.WriteFile(filepath.Join(repo, ManifestName), append(body, '\n'), 0o600))
} }
// stripFileManifests returns a copy of manifest whose tables carry no file
// manifests, mimicking a manifest written before per-file metadata existed.
//
// Manifest is passed by value, but its Tables slice shares a backing array
// with the caller's copy, so the slice is cloned before clearing to leave the
// caller's manifest untouched.
func stripFileManifests(manifest Manifest) Manifest {
	// Appending to a zero-capacity slice forces a fresh backing array.
	manifest.Tables = append(manifest.Tables[:0:0], manifest.Tables...)
	for i := range manifest.Tables {
		manifest.Tables[i].FileManifests = nil
	}
	return manifest
}
func snapshotTableText(t *testing.T, repo string, table TableManifest) string { func snapshotTableText(t *testing.T, repo string, table TableManifest) string {
t.Helper() t.Helper()
return snapshotFilesText(t, repo, table.Files) return snapshotFilesText(t, repo, table.Files)

View File

@ -8,7 +8,7 @@ import (
"strconv" "strconv"
"time" "time"
crawlstore "github.com/vincentkoc/crawlkit/store" crawlstore "github.com/openclaw/crawlkit/store"
) )
const ( const (

View File

@ -7,7 +7,7 @@ import (
"time" "time"
"github.com/bwmarrin/discordgo" "github.com/bwmarrin/discordgo"
"github.com/vincentkoc/crawlkit/progress" "github.com/openclaw/crawlkit/progress"
"github.com/openclaw/discrawl/internal/store" "github.com/openclaw/discrawl/internal/store"
) )
@ -187,7 +187,7 @@ func (s *Syncer) syncMessageChannelsConcurrent(
} }
func (s *Syncer) clearUnavailableChannel(ctx context.Context, channelID string) error { func (s *Syncer) clearUnavailableChannel(ctx context.Context, channelID string) error {
if s.store == nil || channelID == "" { if s == nil || s.store == nil || channelID == "" {
return nil return nil
} }
return s.store.DeleteSyncState(ctx, "channel:"+channelID+":unavailable") return s.store.DeleteSyncState(ctx, "channel:"+channelID+":unavailable")
@ -616,6 +616,9 @@ func (p *messageSyncProgress) record(channel *discordgo.Channel, count int) {
} }
func (p *messageSyncProgress) recordSkip(channel *discordgo.Channel, err error) { func (p *messageSyncProgress) recordSkip(channel *discordgo.Channel, err error) {
if p == nil {
return
}
outcome := syncErrorOutcome(err) outcome := syncErrorOutcome(err)
p.mu.Lock() p.mu.Lock()
switch outcome { switch outcome {