Compare commits

..

87 Commits
v0.6.4 ... main

Author SHA1 Message Date
Peter Steinberger
a1def2c98f
fix: accept trailing search flags
Some checks failed
ci / lint (push) Has been cancelled
ci / test (push) Has been cancelled
ci / deps (push) Has been cancelled
ci / release-check (push) Has been cancelled
ci / secrets (push) Has been cancelled
CodeQL / analyze (push) Has been cancelled
Pages / Deploy docs (push) Has been cancelled
Security Gate: Secret Scanning / Scan for Verified Secrets (push) Has been cancelled
2026-05-08 15:13:00 +01:00
Peter Steinberger
be98cde23f
chore: release v0.7.0 2026-05-08 15:00:32 +01:00
Peter Steinberger
b52eefaa40
chore: satisfy Go 1.26 lint checks 2026-05-08 10:04:39 +01:00
Peter Steinberger
733714a5e7
build: bump Go toolchain to 1.26.3 2026-05-08 10:01:08 +01:00
Peter Steinberger
40c787c54a
refactor: consume crawlkit embedding primitives 2026-05-08 09:58:34 +01:00
Peter Steinberger
40317aa538
fix: keep read commands available during tail 2026-05-08 09:49:13 +01:00
Peter Steinberger
fb969672e0
test: cover cli and archive helper edges 2026-05-08 08:37:27 +01:00
Peter Steinberger
67c6f4655b
fix(share): delta import git snapshots 2026-05-08 08:29:38 +01:00
Peter Steinberger
335a95bd66
ci: update homebrew tap on release
Some checks failed
Security Gate: Secret Scanning / Scan for Verified Secrets (push) Has been cancelled
ci / lint (push) Has been cancelled
ci / test (push) Has been cancelled
ci / deps (push) Has been cancelled
ci / release-check (push) Has been cancelled
ci / secrets (push) Has been cancelled
CodeQL / analyze (push) Has been cancelled
Pages / Deploy docs (push) Has been cancelled
2026-05-07 03:56:51 +01:00
Vincent Koc
d8c8778f19
build(deps): bump crawlkit to v0.4.1 (#58) 2026-05-06 14:52:59 -07:00
Vincent Koc
eeb10dcd30
Merge pull request #57 from openclaw/ci-security-baseline
chore(ci): add crawl security baseline
2026-05-06 01:55:22 -07:00
Vincent Koc
016e849e3c
docs: document SQL archive queries 2026-05-06 01:54:34 -07:00
Vincent Koc
89d35a67a4
docs: refresh discrawl agent skill 2026-05-06 01:47:05 -07:00
Vincent Koc
0da02de393
chore(security): add verified secret scanning 2026-05-06 01:37:04 -07:00
Vincent Koc
14dd5478f4
chore: add Go repository hygiene files 2026-05-06 01:37:03 -07:00
Vincent Koc
4b4303556a
chore(ci): add stale issue automation 2026-05-06 00:30:20 -07:00
Vincent Koc
abcb77e6fc
chore(ci): add CodeQL analysis 2026-05-06 00:30:19 -07:00
Vincent Koc
f328cfba2f
chore(security): add protected automation owners 2026-05-06 00:30:17 -07:00
Vincent Koc
f32dae98cc
docs: document crawlkit archive surfaces 2026-05-05 19:16:52 -07:00
Vincent Koc
98be6a9c11
fix(ci): restore crawlkit merge checks 2026-05-05 18:48:48 -07:00
Vincent Koc
ebb41dabfd
merge: use crawlkit infrastructure
* feat/use-crawlkit: (50 commits)
  fix(release): update version ldflag module path
  chore(deps): use crawlkit v0.4.0
  fix(tui): hydrate discord roots without thread scans
  fix(tui): limit discord thread hydration
  fix(tui): hydrate discord reply context
  fix(share): forward snapshot import progress
  fix(tui): browse newest discord messages
  fix(tui): show discord attachment details
  feat(tui): refresh discord archive rows
  fix(tui): resolve discord inline mentions
  fix(tui): render discord mention names
  docs: note shared tui polish
  fix(tui): document shared controls
  fix(tui): expose discord message details
  fix(tui): add Discord message URLs
  docs(tui): note dm pane labels
  fix(tui): label discord direct message panes
  fix(tui): use compact-pane crawlkit
  fix(tui): pick up shared detail renderer
  fix(sync): include progress percentages
  ...
2026-05-05 18:20:49 -07:00
Vincent Koc
8751f10779
fix(release): update version ldflag module path 2026-05-05 15:02:21 -07:00
Vincent Koc
5daa12f12c
chore(deps): use crawlkit v0.4.0 2026-05-05 15:00:24 -07:00
Vincent Koc
af78c7124f
fix(tui): hydrate discord roots without thread scans 2026-05-05 15:00:24 -07:00
Vincent Koc
acc470311a
fix(tui): limit discord thread hydration 2026-05-05 15:00:24 -07:00
Vincent Koc
18d4aba76a
fix(tui): hydrate discord reply context 2026-05-05 15:00:24 -07:00
Vincent Koc
c8118d9dcc
fix(share): forward snapshot import progress 2026-05-05 15:00:24 -07:00
Vincent Koc
c758a33753
fix(tui): browse newest discord messages 2026-05-05 15:00:24 -07:00
Vincent Koc
9e822ad7d7
fix(tui): show discord attachment details 2026-05-05 15:00:23 -07:00
Vincent Koc
01b1053809
feat(tui): refresh discord archive rows 2026-05-05 15:00:23 -07:00
Vincent Koc
058eb0699e
fix(tui): resolve discord inline mentions 2026-05-05 15:00:23 -07:00
Vincent Koc
a4f5d3fdb4
fix(tui): render discord mention names 2026-05-05 15:00:23 -07:00
Vincent Koc
39906edc3d
docs: note shared tui polish 2026-05-05 15:00:23 -07:00
Vincent Koc
6d31b368fc
fix(tui): document shared controls 2026-05-05 15:00:23 -07:00
Vincent Koc
e67f3e059b
fix(tui): expose discord message details 2026-05-05 15:00:23 -07:00
Vincent Koc
c6d969f998
fix(tui): add Discord message URLs 2026-05-05 15:00:22 -07:00
Vincent Koc
082d384792
docs(tui): note dm pane labels 2026-05-05 15:00:22 -07:00
Vincent Koc
7e3df6e1aa
fix(tui): label discord direct message panes 2026-05-05 15:00:22 -07:00
Vincent Koc
7a510faad5
fix(tui): use compact-pane crawlkit 2026-05-05 15:00:21 -07:00
Vincent Koc
39996a085e
fix(tui): pick up shared detail renderer 2026-05-05 15:00:21 -07:00
Vincent Koc
2b69518169
fix(sync): include progress percentages 2026-05-05 15:00:21 -07:00
Vincent Koc
87da9945b6
fix(tui): apply default guild scope 2026-05-05 14:59:15 -07:00
Vincent Koc
4725873906
fix(tui): label discord archive rows 2026-05-05 14:59:14 -07:00
Vincent Koc
6feb197446
chore(deps): bump crawlkit to v0.3.13 2026-05-05 14:59:14 -07:00
Vincent Koc
e918bf494a
fix(share): filter local-only snapshot imports 2026-05-05 14:59:14 -07:00
Vincent Koc
87fdb1f49c
chore(deps): update crawlkit to v0.3.11 2026-05-05 14:59:14 -07:00
Vincent Koc
dff96610cc
fix(cli): show tui help without opening the archive 2026-05-05 14:59:14 -07:00
Vincent Koc
f3f2496e08
chore(deps): update crawlkit to v0.3.10 2026-05-05 14:59:14 -07:00
Vincent Koc
3fa4af5e2c
chore(deps): update crawlkit to v0.3.9 2026-05-05 14:59:14 -07:00
Vincent Koc
ac5fb8233e
chore(deps): update crawlkit to v0.3.8 2026-05-05 14:59:14 -07:00
Vincent Koc
13f08f5955
docs(changelog): note TUI pane polish 2026-05-05 14:59:13 -07:00
Vincent Koc
300c2f1cfe
chore(deps): update crawlkit to v0.3.7 2026-05-05 14:59:13 -07:00
Vincent Koc
36c9a173e4
fix(tui): enrich Discord archive rows 2026-05-05 14:59:13 -07:00
Vincent Koc
43411bacf2
feat(tui): use shared pane browser 2026-05-05 14:59:13 -07:00
Vincent Koc
9cdb40181a
chore(deps): update crawlkit to v0.3.5 2026-05-05 14:59:13 -07:00
Vincent Koc
0eca15aaf6
fix(tui): use crawlkit empty-json fix 2026-05-05 14:59:13 -07:00
Vincent Koc
8fc6f1f789
fix(tui): use crawlkit safe renderer 2026-05-05 14:59:12 -07:00
Vincent Koc
ab39e1bde2
ci: smoke crawlkit control surface 2026-05-05 14:59:12 -07:00
Vincent Koc
c4be70e521
feat(cli): add crawlkit control surface 2026-05-05 14:59:11 -07:00
Vincent Koc
5e5c401531
feat(tui): use universal archive rows 2026-05-05 14:57:10 -07:00
Vincent Koc
f7db36c7fd
chore: tidy crawlkit module sums 2026-05-05 14:57:10 -07:00
Vincent Koc
7fcb7bb599
refactor: use crawlkit package nouns 2026-05-05 14:57:10 -07:00
Vincent Koc
a13447fd47
chore: use crawlkit v0.2.0 2026-05-05 14:56:30 -07:00
Vincent Koc
88d43dd77b
docs(tui): document discord browser workflow 2026-05-05 14:56:29 -07:00
Vincent Koc
f0752b7e2f
feat(tui): add discord archive browser 2026-05-05 14:56:29 -07:00
Vincent Koc
65672636de
chore: use crawlkit v0.1.1 2026-05-05 14:56:29 -07:00
Vincent Koc
ddf769d09b
chore: use crawlkit v0.1.0 2026-05-05 14:56:29 -07:00
Vincent Koc
5406ae59b9
refactor(share): use crawlkit archive helpers 2026-05-05 14:56:29 -07:00
Vincent Koc
6202e6bf55
refactor(store): use crawlkit sqlite opener 2026-05-05 14:55:48 -07:00
Vincent Koc
4e9b9bee86
refactor(config): use crawlkit runtime paths 2026-05-05 14:55:48 -07:00
Vincent Koc
4ee6379494
chore: add crawlkit module dependency 2026-05-05 14:55:48 -07:00
Peter Steinberger
8999ff5fd3
chore: start 0.7.0 development
Some checks failed
ci / lint (push) Has been cancelled
ci / test (push) Has been cancelled
ci / deps (push) Has been cancelled
ci / release-check (push) Has been cancelled
ci / secrets (push) Has been cancelled
2026-05-05 10:27:11 +01:00
Peter Steinberger
e351d0ecdc
docs: polish discrawl contact links 2026-05-05 10:20:15 +01:00
Peter Steinberger
3b76ba7973
chore: move module to openclaw/discrawl 2026-05-05 10:07:56 +01:00
Peter Steinberger
f3aaf284f2
docs: add discrawl social card 2026-05-05 03:58:21 +01:00
Peter Steinberger
70817284fa
docs: make discrawl.sh canonical 2026-05-05 03:49:43 +01:00
Peter Steinberger
4f15765afb
docs: add generated docs site 2026-05-05 03:34:49 +01:00
Peter Steinberger
8c64819033
docs: keep contact email plain 2026-05-05 02:23:39 +01:00
Peter Steinberger
3566b74d89
docs: add discrawl contact page 2026-05-05 02:05:09 +01:00
Peter Steinberger
d1f4d378f7
release: v0.6.6 2026-05-05 01:47:34 +01:00
Peter Steinberger
6ea543b4c6
test: cover wiretap cache checkpoint helpers 2026-05-05 01:43:13 +01:00
Peter Steinberger
78fcca8204
fix: stabilize wiretap cache checkpointing 2026-05-05 01:22:48 +01:00
Peter Steinberger
68b49c90a5
ci: skip warm discord backup imports 2026-05-03 19:41:40 +01:00
Peter Steinberger
86502b251c
chore: bump discrawl to 0.6.5 2026-05-03 15:36:02 +01:00
Peter Steinberger
dc72ac200e
docs: add 0.6.5 unreleased changelog 2026-05-03 15:31:51 +01:00
Peter Steinberger
c934c579b0
fix: keep discrawl sync update explicit 2026-05-03 15:03:41 +01:00
Peter Steinberger
45f0133b62
docs: update 0.6.4 changelog 2026-05-03 12:15:51 +01:00
127 changed files with 7370 additions and 1452 deletions

View File

@ -1,36 +1,59 @@
---
name: discrawl
description: Use for local Discord archive search, sync freshness, DMs, channel summaries, and Discrawl repo/release work.
description: Use for local Discord archive search, sync freshness, DMs, channel summaries, desktop/API/git-share sources, TUI browsing, and Discrawl repo/release work.
---
# Discrawl
Use local archive data first for Discord questions. Browse or hit live APIs only when the local archive is stale or the user asks for current external context.
Use local Discord archive data first for Discord questions. Hit Discord APIs
only when the archive is stale, missing the requested scope, or the user asks
for current external context.
## Sources
- DB: `~/.discrawl/discrawl.db`
- Config: `~/.discrawl/config.toml`
- Repo: `~/Projects/discrawl`
- Preferred CLI: `discrawl`; fallback to repo binary if installed binary is stale
- Cache: `~/.discrawl/cache`
- Logs: `~/.discrawl/logs`
- Git share repo: `~/.discrawl/share`
- Repo: `openclaw/discrawl`. Use `~/GIT/_Perso/discrawl` only after verifying that
its remote targets `openclaw/discrawl`; otherwise, use a fresh checkout
- Preferred CLI: `discrawl`; fall back to `go run ./cmd/discrawl` from the repo if the installed binary is stale
## Freshness
For recent/current questions, check freshness before analysis:
```bash
discrawl status --json
```
For precise freshness from the default database:
```bash
sqlite3 ~/.discrawl/discrawl.db \
"select coalesce(max(updated_at),'') from sync_state where scope like 'channel:%';"
```
Routine refresh:
Routine diagnostics:
```bash
discrawl doctor
```
Desktop-local refresh:
```bash
discrawl sync --source wiretap
```
Bot API latest refresh, when credentials are available:
```bash
discrawl sync
```
Historical/backfill refresh:
Use `--full` only for deliberate historical backfills:
```bash
discrawl sync --full
@ -42,7 +65,7 @@ If SQLite reports busy/locked, check for stray `discrawl` processes before retry
1. Resolve scope: guild, channel, DM, author, keyword, date range.
2. Check freshness for recent/current requests.
3. Use CLI for normal reads; use SQL for precise counts/rankings.
3. Prefer CLI search/messages for slices; use read-only SQL for exact counts.
4. Report absolute date spans, counts, channel/DM names, and known gaps.
Common commands:
@ -50,26 +73,52 @@ Common commands:
```bash
discrawl search "query"
discrawl messages --channel '#maintainers' --days 7 --all
discrawl --json sql "select count(*) from messages;"
discrawl dms --last 20
discrawl tui --dm
discrawl sql "select count(*) from messages;"
```
When the installed CLI lacks a new feature, build or run from `~/Projects/discrawl` before concluding the feature is missing.
## SQL
## Discord DMs
Use `discrawl sql` for exact counts, joins, and ranking queries when normal
CLI reads are too coarse. The command is read-only by default, accepts SQL as
args or stdin, and supports `--json` for agent parsing.
Wiretap/Desktop cache DMs are local-only. Do not imply they are in the published Git snapshot. For missing recent DMs, refresh first; stale archive is a common cause.
Useful examples:
```bash
discrawl --json sql "select count(*) as messages from messages;"
discrawl --json sql "select coalesce(nullif(c.name, ''), m.channel_id) as channel, count(*) as messages from messages m left join channels c on c.id = m.channel_id group by m.channel_id order by messages desc limit 20;"
discrawl --json sql "select coalesce(nullif(mm.display_name, ''), nullif(mm.global_name, ''), nullif(mm.username, ''), m.author_id) as author, count(*) as messages from messages m left join members mm on mm.guild_id = m.guild_id and mm.user_id = m.author_id group by m.guild_id, m.author_id order by messages desc limit 20;"
```
Never use `--unsafe --confirm` unless the user explicitly asks for a database
mutation and the write has been reviewed.
When the installed CLI lacks a new feature, build or run from a verified
`openclaw/discrawl` checkout before concluding the feature is missing.
## Discord Boundaries
Bot API sync requires configured Discord bot credentials; do not invent token
availability. Desktop wiretap mode reads local Discord Desktop artifacts and
must not extract credentials, use user tokens, call Discord as the user, or
write to Discord application storage. Wiretap/Desktop cache DMs are local-only
and must not be described as part of the published Git snapshot. Git-share
snapshots must not include secrets or `@me` DM rows.
## Verification
For repo edits, prefer existing Go gates:
```bash
go test ./...
GOWORK=off go test ./...
```
Then run targeted CLI smoke for the touched surface, for example:
```bash
discrawl doctor
discrawl status --json
discrawl search "test" --limit 5
```

12
.editorconfig Normal file
View File

@ -0,0 +1,12 @@
root = true
[*]
charset = utf-8
end_of_line = lf
insert_final_newline = true
indent_style = tab
indent_size = 4
[*.{md,yml,yaml,json,toml}]
indent_style = space
indent_size = 2

6
.gitattributes vendored Normal file
View File

@ -0,0 +1,6 @@
* text=auto
*.go text eol=lf
*.md text eol=lf
*.toml text eol=lf
*.yml text eol=lf
*.yaml text eol=lf

12
.github/CODEOWNERS vendored Normal file
View File

@ -0,0 +1,12 @@
# Protect ownership and automation rules.
/.github/CODEOWNERS @openclaw/openclaw-secops
/.github/dependabot.yml @openclaw/openclaw-secops
/.github/workflows/ @openclaw/openclaw-secops
# Release, backup, and package integrity surfaces.
/.goreleaser.yaml @openclaw/openclaw-secops
/go.mod @openclaw/openclaw-secops
/go.sum @openclaw/openclaw-secops
/scripts/*backup* @openclaw/openclaw-secops
/scripts/*release* @openclaw/openclaw-secops
/scripts/*publish* @openclaw/openclaw-secops

View File

@ -91,7 +91,19 @@ jobs:
}'
- name: Build
run: go build ./cmd/discrawl
run: go build -o bin/discrawl ./cmd/discrawl
- name: Smoke test CLI control surface
run: |
set -euo pipefail
output="$(./bin/discrawl help)"
printf '%s\n' "$output"
printf '%s' "$output" | grep -q "metadata"
printf '%s' "$output" | grep -q "tui"
test -n "$(./bin/discrawl --version)"
./bin/discrawl metadata --json | grep -q '"schema_version"'
./bin/discrawl status --json | grep -q '"databases"'
./bin/discrawl tui --json | grep -q '^\['
deps:
runs-on: ubuntu-latest

37
.github/workflows/codeql.yml vendored Normal file
View File

@ -0,0 +1,37 @@
name: CodeQL
on:
pull_request:
push:
branches:
- main
schedule:
- cron: "29 4 * * 1"
workflow_dispatch:
permissions:
actions: read
contents: read
security-events: write
jobs:
analyze:
name: analyze
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v6
- name: Setup Go
uses: actions/setup-go@v6
with:
go-version-file: go.mod
cache: true
- name: Initialize CodeQL
uses: github/codeql-action/init@v4
with:
languages: go
- name: Perform CodeQL Analysis
uses: github/codeql-action/analyze@v4

52
.github/workflows/pages.yml vendored Normal file
View File

@ -0,0 +1,52 @@
name: Pages
on:
push:
branches:
- main
paths:
- "docs/**"
- "scripts/build-docs-site.mjs"
- ".github/workflows/pages.yml"
workflow_dispatch:
permissions:
contents: read
pages: write
id-token: write
concurrency:
group: pages
cancel-in-progress: false
jobs:
deploy:
name: Deploy docs
runs-on: ubuntu-latest
timeout-minutes: 10
environment:
name: github-pages
url: ${{ steps.deployment.outputs.page_url }}
steps:
- name: Check out
uses: actions/checkout@v6
- name: Set up Node
uses: actions/setup-node@v6
with:
node-version: 24
- name: Build site
run: node scripts/build-docs-site.mjs
- name: Configure Pages
uses: actions/configure-pages@v6
- name: Upload artifact
uses: actions/upload-pages-artifact@v5
with:
path: dist/docs-site
- name: Deploy
id: deployment
uses: actions/deploy-pages@v5

View File

@ -76,7 +76,12 @@ jobs:
git clone "$BACKUP_REMOTE" "$BACKUP_REPO"
go run ./cmd/discrawl --config "$CONFIG" init --db "$DB" --guild "$DISCRAWL_GUILD_ID"
if [ -f "$BACKUP_REPO/manifest.json" ]; then
go run ./cmd/discrawl --config "$CONFIG" update --repo "$BACKUP_REPO" --remote "$BACKUP_REMOTE"
if [ -s "$DB" ]; then
echo "Restored Discord DB cache at $DB; skipping pre-sync snapshot import."
else
echo "Discord DB cache missing; importing latest published snapshot before latest-only sync."
go run ./cmd/discrawl --config "$CONFIG" update --repo "$BACKUP_REPO" --remote "$BACKUP_REMOTE"
fi
fi
go run ./cmd/discrawl --config "$CONFIG" sync --guild "$DISCRAWL_GUILD_ID" --skip-members --latest-only
git -C "$BACKUP_REPO" pull --ff-only origin main

View File

@ -44,3 +44,62 @@ jobs:
args: release --clean --config /tmp/.goreleaser.yaml
env:
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
update-homebrew-tap:
runs-on: ubuntu-latest
needs: goreleaser
steps:
- name: Resolve release tag
run: |
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
echo "RELEASE_TAG=${{ inputs.tag }}" >> "$GITHUB_ENV"
else
echo "RELEASE_TAG=${{ github.ref_name }}" >> "$GITHUB_ENV"
fi
- name: Dispatch tap formula update
env:
GH_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }}
run: |
if [ -z "$GH_TOKEN" ]; then
echo "::error::Set HOMEBREW_TAP_TOKEN with workflow access to steipete/homebrew-tap"
exit 1
fi
request_id="discrawl-${RELEASE_TAG}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
expected_title="Update discrawl for ${RELEASE_TAG} (${request_id})"
gh workflow run update-formula.yml \
--repo steipete/homebrew-tap \
--ref main \
-f formula=discrawl \
-f tag="$RELEASE_TAG" \
-f repository=openclaw/discrawl \
-f artifact_template="{formula}_{version}_{target}.tar.gz" \
-f request_id="$request_id"
run_id=""
for _ in {1..30}; do
run_id=$(gh run list \
--repo steipete/homebrew-tap \
--workflow update-formula.yml \
--branch main \
--event workflow_dispatch \
--limit 20 \
--json databaseId,displayTitle \
--jq ".[] | select(.displayTitle == \"$expected_title\") | .databaseId" | head -n1)
if [ -n "$run_id" ]; then
break
fi
sleep 5
done
if [ -z "$run_id" ]; then
echo "::error::Could not find tap workflow run with title: $expected_title"
exit 1
fi
gh run watch "$run_id" \
--repo steipete/homebrew-tap \
--exit-status \
--interval 10

63
.github/workflows/secret-scan.yml vendored Normal file
View File

@ -0,0 +1,63 @@
name: "Security Gate: Secret Scanning"
on:
push:
branches: ["**"]
pull_request:
branches: [main, master]
permissions: {}
jobs:
trufflehog:
name: Scan for Verified Secrets
runs-on: ubuntu-latest
permissions:
contents: read
steps:
- name: Checkout code
uses: actions/checkout@v6
with:
fetch-depth: 0
- name: Resolve scan range
id: scan_range
env:
EVENT_NAME: ${{ github.event_name }}
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
PUSH_BASE_SHA: ${{ github.event.before }}
PUSH_HEAD_SHA: ${{ github.sha }}
DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
run: |
set -euo pipefail
zero_sha="0000000000000000000000000000000000000000"
if [[ "$EVENT_NAME" == "pull_request" ]]; then
base="$PR_BASE_SHA"
head="$PR_HEAD_SHA"
else
base="$PUSH_BASE_SHA"
head="$PUSH_HEAD_SHA"
if [[ -z "$base" || "$base" == "$zero_sha" ]]; then
base="origin/$DEFAULT_BRANCH"
fi
fi
echo "base=$base" >> "$GITHUB_OUTPUT"
echo "head=$head" >> "$GITHUB_OUTPUT"
- name: TruffleHog OSS
id: trufflehog
uses: trufflesecurity/trufflehog@v3.95.2
with:
path: ./
base: ${{ steps.scan_range.outputs.base }}
head: ${{ steps.scan_range.outputs.head }}
extra_args: --only-verified --debug
- name: Notify on failure
if: steps.trufflehog.outcome == 'failure'
run: |
echo "::error::Verified secrets found. Rotate the credential before merging."
exit 1

86
.github/workflows/stale.yml vendored Normal file
View File

@ -0,0 +1,86 @@
name: Stale
on:
schedule:
- cron: "25 4 * * *"
workflow_dispatch:
permissions: {}
jobs:
stale:
permissions:
issues: write
pull-requests: write
runs-on: ubuntu-latest
steps:
- name: Mark stale unassigned issues and pull requests
uses: actions/stale@v10
with:
days-before-issue-stale: 14
days-before-issue-close: 7
days-before-pr-stale: 14
days-before-pr-close: 7
stale-issue-label: stale
stale-pr-label: stale
exempt-issue-labels: enhancement,maintainer,pinned,security,no-stale
exempt-pr-labels: maintainer,no-stale
operations-per-run: 1000
ascending: true
exempt-all-assignees: true
remove-stale-when-updated: true
stale-issue-message: |
This issue has been automatically marked as stale due to inactivity.
Please add updated discrawl details or it will be closed.
stale-pr-message: |
This pull request has been automatically marked as stale due to inactivity.
Please update it or it will be closed.
close-issue-message: |
Closing due to inactivity.
If this still affects discrawl, open a new issue with current reproduction details.
close-issue-reason: not_planned
close-pr-message: |
Closing due to inactivity.
If this PR should be revived, reopen it with current context and validation.
- name: Mark stale assigned issues
uses: actions/stale@v10
with:
days-before-issue-stale: 30
days-before-issue-close: 10
days-before-pr-stale: -1
days-before-pr-close: -1
stale-issue-label: stale
exempt-issue-labels: enhancement,maintainer,pinned,security,no-stale
operations-per-run: 1000
ascending: true
include-only-assigned: true
remove-stale-when-updated: true
stale-issue-message: |
This assigned issue has been automatically marked as stale after 30 days of inactivity.
Please add an update or it will be closed.
close-issue-message: |
Closing due to inactivity.
If this still affects discrawl, reopen or file a new issue with current evidence.
close-issue-reason: not_planned
- name: Mark stale assigned pull requests
uses: actions/stale@v10
with:
days-before-issue-stale: -1
days-before-issue-close: -1
days-before-pr-stale: 27
days-before-pr-close: 7
stale-pr-label: stale
exempt-pr-labels: maintainer,no-stale
operations-per-run: 1000
ascending: true
include-only-assigned: true
ignore-pr-updates: true
remove-stale-when-updated: true
stale-pr-message: |
This assigned pull request has been automatically marked as stale after being open for 27 days.
Please add an update or it will be closed.
close-pr-message: |
Closing due to inactivity.
If this PR should be revived, reopen it with current context and validation.

View File

@ -12,7 +12,7 @@ builds:
env:
- CGO_ENABLED=0
ldflags:
- -s -w -X github.com/steipete/discrawl/internal/cli.version={{ .Version }}
- -s -w -X github.com/openclaw/discrawl/internal/cli.version={{ .Version }}
targets:
- darwin_amd64
- darwin_arm64

View File

@ -1,6 +1,54 @@
# Changelog
All notable changes to `discrawl` will be documented in this file.
## 0.7.0 - 2026-05-08
### Changes
- Added `discrawl tui`, a terminal archive browser for stored guild messages and local `@me` wiretap DMs using the shared crawlkit pane browser.
- Added crawlkit-backed `metadata --json`, `status --json`, and `doctor --json` control surfaces for launchers, automation, and CI checks.
- Published the generated documentation site at `discrawl.sh`, including command pages, install/setup docs, configuration, security notes, guides, a contact page, and social cards.
- Moved the Go module and release metadata to `github.com/openclaw/discrawl`.
### Fixes
- Kept documented command-local search flags working after the query, such as `discrawl search "term" --limit 5`. Thanks @PrinceOfEgypt.
- Made the terminal browser more useful and accurate: default guild scoping, newest-message startup, compact panes, selected-message detail panes, count-header sorting, local/remote status labels, right-click actions, Discord message URLs, row labels, direct-message pane labels, mention rendering, inline mention resolution, attachment details, and reply-context hydration without broad thread scans.
- Kept read-only commands such as `search`, `messages`, and safe `sql` usable while `tail` or another writer holds the sync lock. Thanks @PrinceOfEgypt.
- Kept `tui --help`, status, and terminal-browser reads safe for fresh or missing local databases without triggering Git snapshot auto-update.
- Kept local-only snapshot rows filtered during shared archive imports and forwarded snapshot import progress through the crawlkit import path.
- Made stale Git snapshot imports plan shard deltas from crawlkit file fingerprints or Git object identity, so routine shared-archive refreshes import changed message tail shards instead of rebuilding every table and FTS index.
- Included progress percentages in message-sync logs.
- Fixed GoReleaser version stamping after the module path move.
### Documentation
- Documented the crawlkit-backed config/status/control, snapshot, mirror, sync-state, output, and shared TUI surfaces now used on `main`.
- Clarified that Discord bot sync, desktop wiretap parsing, DM privacy filters, schema ownership, FTS/ranking, embeddings, and analytics remain app-owned.
- Aligned terminal-browser docs with the gitcrawl-style shared TUI model: channel/person/thread groups, message rows, detail/thread panes, sorting, mouse selection, right-click actions, and local/remote status chrome.
- Refreshed the repo-local `discrawl` agent skill for local Discord archive, freshness, query, boundary, TUI, verification, and read-only SQL workflows.
### Maintenance
- Migrated runtime paths, SQLite opening, archive mirror/export/import helpers, output/status wiring, and TUI plumbing onto the shared `crawlkit` infrastructure.
- Moved reusable embedding providers and vector helpers onto `crawlkit` while keeping Discrawl-owned storage, FTS, queueing, and privacy filters local.
- Updated crawlkit through `v0.4.1`, switched imports to `github.com/openclaw/crawlkit`, and added CI smoke coverage for the crawlkit control surface and merge behavior.
- Added CodeQL, verified secret scanning, protected automation owners, stale issue automation, `.editorconfig`, and `.gitattributes`.
- Added release workflow automation that dispatches the Homebrew tap formula update after GoReleaser publishes a tag.
## 0.6.6 - 2026-05-05
### Fixes
- `wiretap` now uses a fast default path for Discord Chromium cache imports: it scans cheap context files plus route-bearing HTTP cache entries, checkpoints file progress in batches, and leaves exhaustive historical cache archaeology behind `--full-cache` / `desktop.full_cache`.
## 0.6.5 - 2026-05-03
### Fixes
- Scheduled Discord backup publishing now skips redundant pre-sync snapshot imports when the workflow DB cache is warm, keeping fresh Git snapshots from getting delayed by a full archive reimport.
- `discrawl sync` now keeps Git snapshot refreshes explicit by default; use `--update=auto` or `--update=force` when you want a sync run to pull/import the shared snapshot before live Discord or desktop-cache deltas.
- Snapshot imports now emit phase/table/file progress and keep the sync lock file updated with the active phase, making long update/import runs diagnosable instead of looking hung.
- Recent-message scans are backed by a plain `messages(created_at, id)` index so archive freshness and short-window analysis queries avoid full-table scans.
## 0.6.4 - 2026-05-03
@ -8,6 +56,11 @@ All notable changes to `discrawl` will be documented in this file.
- `discrawl` now handles SIGINT/SIGTERM by canceling active sync/import contexts so large SQLite and FTS writes can roll back and close cleanly instead of being terminated mid-transaction.
### Maintenance
- Refreshed dependency and CI tooling pins, including GoReleaser, `go-toml`, golangci-lint, and gosec.
- Tightened CI compatibility with the latest linters and made signal-cancellation and sync fixture tests deterministic under the race detector.
## 0.6.3 - 2026-05-01
### Fixes

View File

@ -22,6 +22,9 @@ Wiretap DMs stay local and are never exported to the Git-backed snapshot mirror.
- tails Gateway events for live updates, with periodic repair syncs
- imports classifiable Discord Desktop cache messages with `wiretap`, including proven DMs under `@me`
- publishes and imports private Git-backed archive snapshots for org-wide read access
- browses stored messages and local DMs in a terminal archive UI
- exposes `metadata --json`, `status --json`, and `doctor --json` for local
launchers, automation, and CI
- supports Git-only read mode with no Discord credentials on reader machines
- generates backup README activity reports, with optional AI-written field notes
- exposes read-only SQL for ad hoc analysis
@ -114,7 +117,7 @@ discrawl --version
Build from source:
```bash
git clone https://github.com/steipete/discrawl.git
git clone https://github.com/openclaw/discrawl.git
cd discrawl
go build -o bin/discrawl ./cmd/discrawl
./bin/discrawl --version
@ -159,6 +162,25 @@ discrawl messages --channel general --hours 24
## Commands
### `tui`
Opens the local terminal archive browser for stored messages.
```bash
discrawl tui
discrawl tui --guild 123456789012345678 --channel general
discrawl tui --dm
discrawl --json tui --limit 50
```
The terminal browser uses the shared crawlkit explorer. The left pane groups
channels, people, or threads; the middle pane lists messages; the right pane
shows the selected message, surrounding conversation, and thread detail. Mouse
selection, right-click actions, sortable headers, and the local/remote footer
follow the same interaction model as `gitcrawl tui`. See
[`docs/commands/tui.md`](docs/commands/tui.md) for flags and read-only/DM scope
notes.
### `init`
Creates the local config and discovers accessible guilds.
@ -173,15 +195,20 @@ discrawl init --db ~/data/discrawl.db
Refreshes SQLite from one or both archive sources.
By default, `sync` runs both sources:
By default, `sync` runs both live/local sources and does not import the Git snapshot first:
- Discord bot-token sync for bot-visible guild data
- local Discord Desktop cache import for classifiable cached messages and proven DMs
Use `discrawl update` when you want to pull/import the shared Git snapshot. If you intentionally want a sync run to import the snapshot before live deltas, pass `--update=auto` to import only when stale or `--update=force` to pull/import before syncing. `--no-update` is accepted as an explicit no-op alias for the default.
Run one explicit `--full` pass when you want a complete historical guild archive. Use plain `sync` afterward for frequent latest-message and desktop-cache refreshes.
```bash
discrawl sync
discrawl sync --update=auto
discrawl sync --update=force
discrawl sync --no-update
discrawl sync --full
discrawl sync --full --all
discrawl sync --guild 123456789012345678
@ -207,7 +234,8 @@ Bot sync modes:
| Command | Use when | Behavior |
| --- | --- | --- |
| `discrawl sync` | routine refresh | imports any stale Git snapshot first, skips member refreshes, checks live top-level channels plus active threads, and only fetches new messages for channels with a stored latest cursor |
| `discrawl sync` | routine refresh | skips member refreshes, checks live top-level channels plus active threads, and only fetches new messages for channels with a stored latest cursor |
| `discrawl sync --update=auto` | hybrid Git/live refresh | imports a stale Git snapshot first, then runs the routine live refresh |
| `discrawl sync --all-channels` | repair pass | broad incremental sweep across every stored channel/thread, including archived threads |
| `discrawl sync --full` | historical backfill | crawls older history until channels are complete; can take a long time on large servers |
@ -218,9 +246,10 @@ Bot sync modes:
`--latest-only` is still accepted for explicit latest-only runs; it is now the default for untargeted `sync`. Use `--all-channels` to opt out of the fast default without doing a full historical crawl.
When `--channels` includes a forum channel id, `discrawl` expands that forum's threads and syncs their messages as part of the targeted run.
`--since` limits initial history/bootstrap and full-history backfill to messages at or after the given RFC3339 timestamp. It does not mark older history as complete, so a later `sync --full` without `--since` can continue the backfill.
Long runs now emit periodic progress logs to stderr so large backfills do not look hung.
Long runs now emit periodic progress logs to stderr so large backfills and Git snapshot imports do not look hung.
If in-flight channels stop completing for a while, `discrawl` now emits `message sync waiting` heartbeat logs with the oldest active channel, per-channel page activity, and skip/defer counters, and every run ends with a `message sync finished` summary.
Each channel crawl also has a bounded runtime budget, so a pathological channel is deferred and retried on the next sync instead of pinning a worker forever.
Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl, and marker cleanup is best-effort so one missing local sync-state row cannot crash the run.
Full sync member refresh is best-effort: when the caller does not supply a deadline, it currently gives up after five minutes, so message sync completion is not held hostage by a slow guild member crawl.
When the archive is already complete, `sync --full` now reuses the stored backlog markers and limits steady-state refresh to live top-level channels plus active threads instead of revisiting every stored archived thread.
If a guild already has a local member snapshot, routine syncs reuse it and skip another full member crawl until that snapshot ages out.
@ -247,6 +276,7 @@ This is the path for searchable DMs because bot tokens cannot read personal dire
discrawl wiretap
discrawl wiretap --path "$HOME/Library/Application Support/discord"
discrawl wiretap --dry-run
discrawl wiretap --full-cache
discrawl wiretap --watch-every 2m
```
@ -258,7 +288,8 @@ Notes:
- preserves existing local `@me` guilds, channels, messages, and attachments when importing a Git snapshot, so a shared guild mirror refresh does not wipe local wiretap DM search
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs; dropped rows are counted as `skipped_messages`
- imports what Discord Desktop has cached locally, not complete live DM history
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON, plus route-bearing Chromium HTTP cache entries by default
- use `--full-cache` or `desktop.full_cache = true` for exhaustive Chromium cache import when you want slower historical guild-cache archaeology
- does not extract, store, or print Discord auth tokens
- `--max-file-bytes` skips unusually large files; default is 64 MiB
@ -454,9 +485,9 @@ discrawl subscribe --stale-after 15m https://github.com/example/discord-archive.
discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git
```
Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`). `discrawl update` forces the same pull/import step manually.
Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`). Imports are planned from crawlkit shard fingerprints, with a Git-object fallback for older manifests, so routine updates normally read only changed tail shards and preserve local FTS rows instead of rebuilding the whole archive. `discrawl update` forces the same pull/import step manually. `discrawl sync` does not auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast.
Hybrid mode is supported too: keep normal Discord credentials configured and set `share.remote`. `discrawl sync` and `discrawl messages --sync` import the Git snapshot first, then use live Discord for latest-message deltas. Use `sync --all-channels` or `sync --full` when you intentionally want a broader live repair/backfill pass.
Hybrid mode is supported too: keep normal Discord credentials configured and set `share.remote`. `discrawl sync --update=auto` and `discrawl messages --sync` import the Git snapshot first, usually as a changed-shard delta, then use live Discord for latest-message deltas. Use `sync --all-channels` or `sync --full` when you intentionally want a broader live repair/backfill pass.
Git snapshots publish non-DM archive tables by default. Embedding queue state stays local to each machine, and Git-only readers can use FTS immediately without an embedding provider.
@ -487,7 +518,7 @@ discrawl report --readme path/to/discord-backup/README.md
Every scheduled snapshot publish updates deterministic README stats: latest update time, latest archived message, archive totals, and day/week/month activity.
The backup workflows restore and save `.discrawl-ci/discrawl.db` with `actions/cache`. On a warm runner cache, `discrawl update` compares the cached DB's last imported snapshot timestamp with `manifest.json` and skips the full sharded import when they match. Cache misses and newer backup manifests still take the normal pull/import path.
The backup workflows restore and save `.discrawl-ci/discrawl.db` with `actions/cache`. On a warm runner cache, scheduled publishers skip the pre-sync snapshot import and go straight to the live latest-message delta before publishing. Cache misses still import the latest published snapshot first so `--latest-only` has channel cursors to resume from.
### `digest`
@ -567,6 +598,7 @@ attachment_text = true
[desktop]
path = "~/.config/discord" # macOS default: "~/Library/Application Support/discord"
max_file_bytes = 67108864
full_cache = false
[search]
default_mode = "fts"
@ -674,6 +706,7 @@ go run github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.11.1 run
go test ./... -coverprofile=/tmp/discrawl.cover
go tool cover -func=/tmp/discrawl.cover | tail -n 1
go build ./cmd/discrawl
go run ./cmd/discrawl help | grep tui
```
Target coverage is `>= 85%`.

View File

@ -465,12 +465,14 @@ Expected flags:
- `--dry-run`
- `--watch-every <duration>`
- `--max-file-bytes <bytes>`
- `--full-cache`
Requirements:
- never use Discord user tokens
- never extract or persist auth tokens from desktop cache
- scan bounded local files only
- default to route-bearing HTTP cache entries; exhaustive Chromium cache scans require explicit full-cache mode
- store sanitized raw metadata, not full arbitrary cache blobs
### `search`

View File

@ -7,7 +7,7 @@ import (
"os/signal"
"syscall"
"github.com/steipete/discrawl/internal/cli"
"github.com/openclaw/discrawl/internal/cli"
)
func main() {

View File

@ -11,8 +11,8 @@ import (
"testing"
"time"
"github.com/steipete/discrawl/internal/config"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/store"
)
func TestMainHelpAndVersion(t *testing.T) {

1
docs/CNAME Normal file
View File

@ -0,0 +1 @@
discrawl.sh

46
docs/README.md Normal file
View File

@ -0,0 +1,46 @@
# Discrawl
Mirror Discord guilds into local SQLite. Search server history without depending on Discord search. Bring a bot token, or read everything offline from a Git snapshot.
## What it does
- discovers every guild a bot can access and syncs channels, threads, members, and message history into SQLite
- maintains FTS5 indexes for fast literal search; optional embeddings for semantic and hybrid recall
- imports classifiable Discord Desktop cache messages with `wiretap`, including proven DMs under `@me`
- tails the Gateway for live updates with periodic repair sweeps
- publishes the archive as sharded NDJSON snapshots in a private Git repo so readers can search offline with no Discord credentials
- exposes read-only SQL, channel/member directories, mention queries, digests, and trend analytics
## Pick your path
- **New here?** Read [Install](install.html) and run `discrawl init`.
- **Already have a bot?** Jump to [`sync`](commands/sync.html) and [`search`](commands/search.html).
- **Just want to read a shared archive?** Use [`subscribe`](commands/subscribe.html) - no token needed.
- **Need DM search?** [`wiretap`](commands/wiretap.html) imports local Discord Desktop cache.
- **Want semantic search?** Configure [Embeddings](guides/embeddings.html), then run [`embed`](commands/embed.html).
- **Wiring an agent or launcher?** `discrawl metadata --json`, `discrawl status --json`, and `discrawl doctor --json` expose the read-only crawlkit control surface.
## At a glance
```bash
export DISCORD_BOT_TOKEN="..."
discrawl init
discrawl doctor
discrawl sync --full
discrawl search "panic: nil pointer"
discrawl tail
```
[`discrawl tui`](commands/tui.html) uses the shared crawlkit terminal explorer:
channel/person/thread groups on the left, message rows in the middle, and
readable message/thread detail on the right.
## Sections
- **[Start](install.html)** - install, configure, set up the Discord bot, security notes, contact
- **[Guides](guides/)** - sync sources, wiretap internals, search modes, embeddings, Git snapshots, data layout
- **[Commands](commands/)** - one page per CLI command
## Where to file issues
File issues at `https://github.com/openclaw/discrawl/issues`. See [contact](contact.html) for project links.

View File

@ -7,7 +7,7 @@ summary: "Release checklist for discrawl (GitHub release binaries via GoReleaser
Always do all steps below. No partial releases.
Assumptions:
- Repo: `steipete/discrawl`
- Repo: `openclaw/discrawl`
- Binary: `discrawl`
- GoReleaser config: `.goreleaser.yaml`
- Homebrew tap repo: `~/Projects/homebrew-tap`
@ -82,7 +82,7 @@ After tagging a real release:
Useful commands:
```sh
curl -L -o /tmp/discrawl-darwin-arm64.tgz https://github.com/steipete/discrawl/releases/download/vX.Y.Z/discrawl_X.Y.Z_darwin_arm64.tar.gz
curl -L -o /tmp/discrawl-darwin-arm64.tgz https://github.com/openclaw/discrawl/releases/download/vX.Y.Z/discrawl_X.Y.Z_darwin_arm64.tar.gz
shasum -a 256 /tmp/discrawl-darwin-arm64.tgz
brew uninstall discrawl || true
brew install steipete/tap/discrawl
@ -92,7 +92,7 @@ brew info steipete/tap/discrawl
## Notes
- Build-time version stamping comes from `-X github.com/steipete/discrawl/internal/cli.version={{ .Version }}`
- Build-time version stamping comes from `-X github.com/openclaw/discrawl/internal/cli.version={{ .Version }}`
- If release workflow needs a rerun:
```sh

63
docs/bot-setup.md Normal file
View File

@ -0,0 +1,63 @@
# Discord bot setup
Discrawl needs a real Discord bot token, not a user token, to run `sync` or `tail`. The desktop `wiretap` import does not need any token.
## Minimum setup
1. Create or reuse a Discord application in the [Discord developer portal](https://discord.com/developers/applications).
2. Add a bot user to that application.
3. Invite the bot to the target guilds.
4. Enable these intents for the bot:
- **Server Members Intent**
- **Message Content Intent**
5. Ensure the bot can at least:
- view channels
- read message history
Without those intents/permissions, `sync`, `tail`, member snapshots, and message content archiving will be partial or fail outright.
## Provide the token
### Environment variable
```bash
export DISCORD_BOT_TOKEN="your-bot-token"
discrawl doctor
```
If you keep shell secrets in `~/.profile`, add the export there and reload your shell.
### OS keyring
If you prefer the OS keyring, keep the token out of config and store it in the default keyring item:
```bash
# macOS Keychain
security add-generic-password -U -s discrawl -a discord_bot_token -w "$DISCORD_BOT_TOKEN"
# Linux Secret Service / libsecret
printf %s "$DISCORD_BOT_TOKEN" | secret-tool store --label="discrawl Discord bot token" service discrawl username discord_bot_token
# Windows Credential Manager
cmdkey /generic:discrawl:discord_bot_token /user:discord_bot_token /pass:%DISCORD_BOT_TOKEN%
```
Set `discord.token_source = "keyring"` if you want to require the keyring and skip env entirely.
## Verify
```bash
discrawl doctor
```
`doctor` reports the token source (env or keyring), confirms bot auth, lists how many guilds the bot can access, and verifies the local DB plus FTS wiring. It does not print the token contents.
## Wiretap-only setup
If you only want to import local Discord Desktop cache messages and not run a bot, skip everything above and run:
```bash
discrawl sync --source wiretap
```
Or run `discrawl wiretap` directly. See the [wiretap guide](guides/wiretap.html).

View File

@ -0,0 +1,37 @@
# `analytics`
Groups activity-style queries under one namespace.
## Usage
```bash
discrawl analytics
discrawl analytics quiet --since 30d
discrawl analytics quiet --guild 123456789012345678
discrawl analytics trends --weeks 8
discrawl analytics trends --weeks 12 --channel general
discrawl --json analytics quiet --since 60d
discrawl --json analytics trends --weeks 4
```
## Subcommands
### `quiet`
Lists top-level text/announcement channels with no messages in the lookback window, including never-active channels.
- `--since <duration>` - lookback window (e.g. `30d`, `60d`)
- `--guild <id>` - scope to one guild; when omitted, `default_guild_id` is used if configured
### `trends`
Reports weekly message counts per message-capable channel, using UTC weeks that start on Monday.
- `--weeks <n>` - number of weeks to include
- `--channel <id|name>` - scope to one channel
- `--guild <id>` - scope to one guild
## See also
- [`digest`](digest.html)
- [`status`](status.html)

25
docs/commands/channels.md Normal file
View File

@ -0,0 +1,25 @@
# `channels`
Browse the offline channel directory.
## Usage
```bash
discrawl channels list
discrawl channels show 123456789012345678
```
## Subcommands
- `list` - dump every channel and thread in the local archive
- `show <id>` - show metadata for one channel/thread
## Notes
- threads are stored as channels because that matches the Discord model
- archived threads are part of the sync surface and appear here too
## See also
- [`members`](members.html)
- [Data layout](../guides/data-storage.html)

29
docs/commands/digest.md Normal file
View File

@ -0,0 +1,29 @@
# `digest`
Summarizes per-channel activity for a lookback window.
## Usage
```bash
discrawl digest
discrawl digest --since 30d
discrawl digest --guild 123456789012345678
discrawl digest --channel general
discrawl --json digest --since 7d --top-n 5
```
## Flags
- `--since <duration>` - Go durations (`72h`, `30m`) and `Nd` shorthand (`7d`, `30d`)
- `--guild <id>` - scope to one guild; when omitted, `default_guild_id` is used if configured
- `--channel <id|name>` - scope to one channel
- `--top-n <n>` - how many top posters and mention targets per channel
## Output
For each channel in scope: message count, top posters, top mention targets, first/last activity in window.
## See also
- [`analytics`](analytics.html)
- [`mentions`](mentions.html)

39
docs/commands/dms.md Normal file
View File

@ -0,0 +1,39 @@
# `dms`
Lists local wiretap DM conversations or reads one DM thread. Convenience layer over the synthetic `@me` guild id.
## Usage
```bash
discrawl dms
discrawl dms --with Molty --last 20
discrawl dms --with 1456464433768300635 --all
discrawl dms --search "launch checklist"
discrawl dms --with Molty --search "invoice"
```
## Default output
`discrawl dms` (no flags) shows one row per local DM channel with:
- message count
- author count
- first/last cached message times
## Flags
- `--with <name|id>` - switches to message output for that DM conversation (unless `--list` is also set)
- `--list` - keep the channel-summary listing even when `--with` is set
- `--search <query>` - search only local DM messages
- `--last <n>` / `--all` / `--limit <n>` - same slicing as [`messages`](messages.html)
## Notes
- only sees data imported by [`wiretap`](wiretap.html) - Discord Desktop cache, not live DM history
- skips Git snapshot auto-update because DMs are never imported from the shared mirror
- DMs are local-only and never published
## See also
- [Wiretap guide](../guides/wiretap.html)
- [`messages --dm`](messages.html)

35
docs/commands/doctor.md Normal file
View File

@ -0,0 +1,35 @@
# `doctor`
Checks config, auth, DB, and FTS wiring. The fastest sanity check.
## Usage
```bash
discrawl doctor
```
## What it verifies
- config loads from the expected path
- where the bot token was resolved from (env var or OS keyring)
- bot auth succeeds against Discord
- how many guilds the bot can access
- local SQLite database exists and the schema version matches the binary
- FTS5 index is wired up
## What it does not do
- does not print the token contents
- does not run a sync; it only checks readiness
## Common outputs
- "token from env (DISCORD_BOT_TOKEN)" or "token from keyring (discrawl/discord_bot_token)"
- "0 guilds visible" - bot is not invited to any guild yet, or intents/permissions are missing
- "schema newer than binary" - update `discrawl` to a build that supports the local DB schema
## See also
- [Bot setup](../bot-setup.html)
- [Configuration](../configuration.html)
- [`status`](status.html)

42
docs/commands/embed.md Normal file
View File

@ -0,0 +1,42 @@
# `embed`
Drains pending `embedding_jobs` rows by calling the configured embedding provider and writing vectors to `message_embeddings`.
## Usage
```bash
discrawl embed
discrawl embed --limit 1000
discrawl embed --rebuild --limit 1000
```
## Flags
- `--limit <n>` - cap how many jobs this run drains
- `--batch-size <n>` - provider request batch size
- `--rebuild` - regenerate vectors for the existing archive after a provider/model/input-version change
## Behavior
- claims jobs with a short lock so overlapping runs do not process the same batch
- rate limits requeue the batch and stop that drain run cleanly
- provider or validation failures retry up to three attempts before the job is marked failed
- messages with no normalized text are marked done and any stale vector for that message is removed
## Identity
Provider, model, and input version are stored on each job and vector. Changing any of them retargets pending jobs to the new identity and resets prior attempts. Existing vectors for another identity remain in SQLite but are not used by semantic search.
## When to use `--rebuild`
Use `--rebuild` after changing the `[search.embeddings]` provider, model, or any input setting, when you want to regenerate vectors for messages already in the archive.
## Pairing with `sync`
`sync --with-embeddings` enqueues; `embed` drains. The two phases are intentionally separate so a slow provider does not block the hot sync path.
## See also
- [Embeddings guide](../guides/embeddings.html)
- [Search modes](../guides/search-modes.html)
- [`search`](search.html)

31
docs/commands/init.md Normal file
View File

@ -0,0 +1,31 @@
# `init`
Creates the local config and discovers accessible guilds.
## Usage
```bash
discrawl init
discrawl init --guild 123456789012345678
discrawl init --db ~/data/discrawl.db
discrawl init --with-embeddings
```
## What it does
- writes `~/.discrawl/config.toml` (or whatever `--config` / `DISCRAWL_CONFIG` points to)
- discovers guilds the configured bot can access
- if exactly one guild is available, sets it as `default_guild_id` automatically
- creates the SQLite database at `db_path`
## Flags
- `--guild <id>` - set a specific default guild instead of auto-picking
- `--db <path>` - override `db_path`
- `--with-embeddings` - enable `[search.embeddings]` in the generated config
## See also
- [Configuration](../configuration.html)
- [Bot setup](../bot-setup.html)
- [`doctor`](doctor.html)

72
docs/commands/members.md Normal file
View File

@ -0,0 +1,72 @@
# `members`
Browse the offline member directory built from archived profile payloads.
## Usage
```bash
discrawl members list
discrawl members show 123456789012345678
discrawl members show --messages 10 steipete
discrawl members search "peter"
discrawl members search "github"
discrawl members search "https://github.com/steipete"
```
## Subcommands
- `list` - dump the local member directory
- `show <id|query>` - show one member; if the query resolves to one match, also show recent messages
- `search <query>` - match names plus any offline profile fields present in the archived member payload
## Flags
- `show --messages <n>` - include the most recent `n` messages from that member
## Profile fields
Extracted from the archived Discord member/user payload. May include:
- `bio`
- `pronouns`
- `location`
- `website`
- `x`
- `github`
- discovered URLs
If the bot cannot see a field from Discord, `discrawl` cannot invent it. This is strictly archive-based offline data.
## Typical workflow
```bash
discrawl sync --full
discrawl members search "design engineer"
discrawl members search "github"
discrawl members show --messages 25 steipete
discrawl messages --author steipete --days 30 --all
```
## Typical `members show` output
```text
guild=1456350064065904867
user=37658261826043904
username=steipete
display=Peter Steinberger
joined=2026-03-08T16:03:14Z
bot=false
x=steipete
github=steipete
website=https://steipete.me
bio=Builds native apps and tooling.
urls=https://steipete.me, https://github.com/steipete
message_count=1284
first_message=2026-02-01T09:00:00Z
last_message=2026-03-08T15:59:58Z
```
## See also
- [`channels`](channels.html)
- [Data layout](../guides/data-storage.html)

27
docs/commands/mentions.md Normal file
View File

@ -0,0 +1,27 @@
# `mentions`
Lists structured user and role mentions extracted during sync.
## Usage
```bash
discrawl mentions --channel maintainers --days 7
discrawl mentions --target steipete --type user --limit 50
discrawl mentions --target 1456406468898197625
discrawl --json mentions --type role --days 1
```
## Flags
- `--target <id|name>` - user or role id, exact name, or partial match
- `--type <user|role>` - filter by mention type
- `--channel <id|name>` - same channel matching as [`messages`](messages.html)
- `--guild <id>` / `--guilds <id,id>` - restrict the guild scope
- `--days <n>` / `--since <RFC3339>` / `--before <RFC3339>` - time filters
- `--limit <n>` - cap result count
## Notes
- mentions are recorded structurally during sync, so this is a direct row read - no FTS parsing
- combine with `--type role` to find every mention of a role
- combine with `--target steipete` to find everywhere your account got pinged

41
docs/commands/messages.md Normal file
View File

@ -0,0 +1,41 @@
# `messages`
Lists exact message slices by channel, author, and time range. Unlike [`search`](search.html), this does not query the FTS index - it pulls a slice of rows.
## Usage
```bash
discrawl messages --channel maintainers --days 7 --all
discrawl messages --channel maintainers --hours 6 --all
discrawl messages --channel "#maintainers" --since 2026-03-01T00:00:00Z
discrawl messages --channel 1456744319972282449 --author steipete --limit 50
discrawl messages --channel maintainers --last 100 --sync
discrawl messages --dm --channel Molty --last 20
discrawl messages --channel maintainers --days 7 --all --include-empty
discrawl --json messages --channel maintainers --days 3
```
## Flags
- `--channel <id|name|#name>` - id, exact name, `#name`, or partial name match
- `--guild <id>` / `--guilds <id,id>` / `--dm` - restrict the guild scope (`--dm` is shorthand for `--guild @me`)
- `--author <name>` - restrict to one author
- `--hours <n>` - shorthand for "since now minus N hours"
- `--days <n>` - shorthand for "since now minus N days"
- `--since <RFC3339>` - explicit start timestamp
- `--last <n>` - return the newest `N` matching messages, then print oldest-to-newest
- `--limit <n>` - safety limit (default 200; `--all` removes it)
- `--all` - removes the safety limit
- `--sync` - blocking pre-query sync for the matching channel or guild scope
- `--include-empty` - include rows with no displayable/searchable content
## Notes
- at least one filter is required
- `--dm` skips Git snapshot auto-update because DMs are never imported from the shared mirror
- use either `--last` for the newest matching rows or `--all` for an uncapped oldest-to-newest slice
## See also
- [`search`](search.html)
- [`dms`](dms.html)

42
docs/commands/publish.md Normal file
View File

@ -0,0 +1,42 @@
# `publish`
Publishes the local SQLite archive as sharded, compressed NDJSON snapshots in a private Git repo.
## Usage
```bash
discrawl publish --remote https://github.com/example/discord-archive.git --push
discrawl publish --readme path/to/discord-backup/README.md --push
discrawl publish --message "sync: discord archive" --push
discrawl publish --with-embeddings --push
```
## Flags
- `--repo <path>` - local snapshot repo path (defaults to `[share].repo_path`)
- `--remote <url>` - target Git remote (defaults to `[share].remote`)
- `--branch <name>` - snapshot branch (defaults to `[share].branch`)
- `--message <text>` - commit message (default: `sync: discord archive`)
- `--no-commit` - write/export files without creating a Git commit
- `--push` - push the snapshot commit after writing it
- `--readme <path>` - update the activity block in this README file too
- `--with-embeddings` - also export stored `message_embeddings` rows
## What is published
- non-DM archive tables (DM `@me` rows are always excluded)
- README activity block (latest update, latest message, totals, day/week/month activity)
- with `--with-embeddings`: vectors for the configured `[search.embeddings]` provider/model/input version, plus identity manifest
## What is not published
- `@me` DM guilds, channels, messages, events, attachments, mentions, wiretap sync state
- `embedding_jobs`
- raw bot tokens or any local secret
## See also
- [Git snapshots guide](../guides/git-snapshots.html)
- [`subscribe`](subscribe.html)
- [`update`](update.html)
- [`report`](report.html)

35
docs/commands/report.md Normal file
View File

@ -0,0 +1,35 @@
# `report`
Generates the Markdown activity block used by the shared backup repo README.
## Usage
```bash
discrawl report
discrawl report --readme path/to/discord-backup/README.md
```
## Flags
- `--readme <path>` - update the activity block in the given README file in place
## What gets rendered
Deterministic README stats:
- latest update time
- latest archived message
- archive totals
- day / week / month activity
Every scheduled snapshot publish updates this block.
## CI integration
The backup workflows restore and save `.discrawl-ci/discrawl.db` with `actions/cache`. On a warm runner cache, scheduled publishers skip the pre-sync snapshot import and go straight to the live latest-message delta before publishing. Cache misses still import the latest published snapshot first so `--latest-only` has channel cursors to resume from.
## See also
- [`publish`](publish.html)
- [Git snapshots](../guides/git-snapshots.html)
- [`status`](status.html)

51
docs/commands/search.md Normal file
View File

@ -0,0 +1,51 @@
# `search`
Searches archived messages. FTS is the default mode and works without embeddings.
## Usage
```bash
discrawl search "panic: nil pointer"
discrawl search --mode fts "panic: nil pointer"
discrawl search --mode semantic "missing launch checklist"
discrawl search --mode hybrid "database timeout"
discrawl search --guild 123456789012345678 "payment failed"
discrawl search --dm "launch checklist"
discrawl search --channel billing --author steipete --limit 50 "invoice"
discrawl search --include-empty "GitHub"
discrawl --json search "websocket closed"
```
## Modes
- `fts` (default) - SQLite FTS5 with `unicode61` tokenizer; newest matches first
- `semantic` - embeds the query, scores against locally stored vectors; errors out if embeddings are disabled or no compatible vectors exist
- `hybrid` - runs both, deduplicates by message id, falls back to FTS when semantic is unavailable
## Flags
- `--mode <fts|semantic|hybrid>` - search mode
- `--guild <id>` / `--guilds <id,id>` - restrict the guild scope
- `--dm` - shorthand for `--guild @me`
- `--channel <id|name|#name>` - restrict to one channel (id, exact name, `#name`, or partial match)
- `--author <name>` - restrict to one author
- `--limit <n>` - cap result count
- `--include-empty` - include rows with no searchable content (attachment text/filenames, embeds, and replies still count as content)
## FTS behavior
User query terms are parameterized and quoted before `MATCH`, so tokens like `AND`, `OR`, `NOT`, `NEAR`, and `*` are searched as input terms instead of FTS operators. Punctuation still follows FTS5 tokenization rules.
## Semantic prerequisites
- `[search.embeddings]` configured in `~/.discrawl/config.toml`
- local `message_embeddings` rows for the configured provider, model, and input version
- input version is currently `message_normalized_v1`
Run `discrawl sync --with-embeddings` to enqueue, then `discrawl embed` to generate vectors.
## See also
- [Search modes](../guides/search-modes.html)
- [Embeddings](../guides/embeddings.html)
- [`messages`](messages.html) - exact slices, not search

25
docs/commands/sql.md Normal file
View File

@ -0,0 +1,25 @@
# `sql`
Runs read-only SQL against the local database.
## Usage
```bash
discrawl sql 'select count(*) as messages from messages'
echo 'select guild_id, count(*) from messages group by guild_id' | discrawl sql -
```
`-` reads SQL from stdin.
## Notes
- read-only - writes are blocked at the connection level
- `--unsafe --confirm` opens the escape hatch for deliberate write/admin SQL
- the schema is multi-guild ready; threads are stored as channels because that matches the Discord model
- proven DMs use the synthetic guild id `@me`
- SQLite schema migrations are versioned with `PRAGMA user_version`; startup fails fast when a local DB schema is newer than the supported binary
## See also
- [Data layout](../guides/data-storage.html) - what tables exist
- [`status`](status.html) - high-level archive numbers without raw SQL

24
docs/commands/status.md Normal file
View File

@ -0,0 +1,24 @@
# `status`
Shows local archive status.
## Usage
```bash
discrawl status
```
## Reports
- where the local database lives
- guild count and per-guild totals
- channel and thread counts
- message totals
- latest archived message time
- whether the Git share is configured and how stale the local import is
- embeddings status if `[search.embeddings]` is enabled
## See also
- [`doctor`](doctor.html) - liveness check (config, auth, DB, FTS wiring)
- [`report`](report.html) - Markdown activity block for the shared backup README

View File

@ -0,0 +1,48 @@
# `subscribe`
Subscribes to a Git-backed snapshot repo. This is the Git-only setup path: no Discord bot token is required.
## Usage
```bash
discrawl subscribe https://github.com/example/discord-archive.git
discrawl subscribe --repo ~/.discrawl/share https://github.com/example/discord-archive.git
discrawl subscribe --branch main https://github.com/example/discord-archive.git
discrawl subscribe --stale-after 15m https://github.com/example/discord-archive.git
discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git
discrawl subscribe --no-import https://github.com/example/discord-archive.git
discrawl subscribe --with-embeddings https://github.com/example/discord-archive.git
```
## What it does
- writes a config with `discord.token_source = "none"` (so no bot token is required)
- imports the latest snapshot into the local SQLite archive
- enables auto-refresh: read commands fetch and import when the local share import is older than `share.stale_after` (default `15m`)
## Flags
- `--repo <path>` - local snapshot repo path
- `--branch <name>` - snapshot branch (default: `main`)
- `--stale-after <duration>` - how stale the local import can get before read commands auto-refresh
- `--no-auto-update` - disable auto-refresh (use [`update`](update.html) manually)
- `--no-import` - write config only; skip the initial pull/import
- `--with-embeddings` - import vectors that match your local `[search.embeddings]` identity
## Disabled in this mode
`sync` and `tail` are disabled when `discord.token_source = "none"` because they need live Discord access. Switch to a token-equipped config to re-enable them.
## After subscribing
```bash
discrawl search "launch checklist"
discrawl messages --channel general --hours 24
discrawl status
```
## See also
- [Git snapshots guide](../guides/git-snapshots.html)
- [`publish`](publish.html)
- [`update`](update.html)

82
docs/commands/sync.md Normal file
View File

@ -0,0 +1,82 @@
# `sync`
Refreshes SQLite from one or both archive sources.
By default, `sync` runs both live/local sources and does **not** import the Git snapshot first:
- Discord bot-token sync for bot-visible guild data
- local Discord Desktop cache import for classifiable cached messages and proven DMs
Use [`update`](update.html) when you want to pull/import the shared Git snapshot. Snapshot imports normally use changed-shard deltas, but unsafe table changes fall back to a full import. If you intentionally want a sync run to import the snapshot before live deltas, pass `--update=auto` (only when stale) or `--update=force` (always). `--no-update` is accepted as an explicit no-op alias for the default.
Run one explicit `--full` pass when you want a complete historical guild archive. Use plain `sync` afterward for frequent latest-message and desktop-cache refreshes.
## Usage
```bash
discrawl sync
discrawl sync --update=auto
discrawl sync --update=force
discrawl sync --no-update
discrawl sync --full
discrawl sync --full --all
discrawl sync --guild 123456789012345678
discrawl sync --guilds 123,456 --concurrency 8
discrawl sync --source both # default: bot API + desktop cache
discrawl sync --source discord # bot API only; aliases: key, bot, api
discrawl sync --source wiretap # desktop cache only; aliases: desktop, cache
discrawl sync --guild 123456789012345678 --all-channels
discrawl sync --channels 111,222 --since 2026-03-01T00:00:00Z
discrawl sync --with-embeddings
```
## Sources
| Source | Reads from | Stores |
| --- | --- | --- |
| `both` | Discord bot API and local Discord Desktop cache | bot-visible guild data plus classifiable cached desktop messages |
| `discord` / `key` / `bot` / `api` | Discord bot API | guilds, channels, threads, members, and messages the bot can access |
| `wiretap` / `desktop` / `cache` | local Discord Desktop cache files | classifiable cached messages; proven DMs are stored under `@me` |
## Bot sync modes
| Command | Use when | Behavior |
| --- | --- | --- |
| `discrawl sync` | routine refresh | skips member refreshes, checks live top-level channels plus active threads, only fetches new messages for channels with a stored cursor |
| `discrawl sync --update=auto` | hybrid Git/live refresh | imports a stale Git snapshot first, then runs the routine live refresh |
| `discrawl sync --all-channels` | repair pass | broad incremental sweep across every stored channel/thread, including archived threads |
| `discrawl sync --full` | historical backfill | crawls older history until channels are complete |
## Flags
- `--source <both|discord|wiretap>` - which archive sources to read
- `--update <auto|force|none>` - whether to import the Git snapshot before live deltas
- `--full` - historical backfill (slow on large guilds)
- `--all-channels` - broader incremental sweep across every stored channel/thread
- `--latest-only` - explicit latest-only run (also the default for untargeted `sync`)
- `--all` - ignore `default_guild_id` and fan out across every discovered guild
- `--guild <id>` / `--guilds <id,id>` - target specific guilds
- `--channels <id,id>` - target specific channels (forum ids expand to threads)
- `--since <RFC3339>` - limit initial history and `--full` backfill to messages at or after this timestamp
- `--concurrency <n>` - override worker count (default auto-sized: floor 8, cap 32)
- `--skip-members` - refresh guild/channel/message data without crawling members
- `--with-embeddings` - also enqueue changed messages into `embedding_jobs`
## Notes
- `--latest-only` is the default for untargeted `sync`. Use `--all-channels` to opt out without doing a full historical crawl.
- `--since` does not mark older history as complete, so a later `sync --full` without `--since` can continue the backfill.
- Long runs emit periodic progress logs to stderr.
- If in-flight channels stop completing for a while, heartbeat logs (`message sync waiting`) name the oldest active channel and per-channel page activity.
- Every run ends with a `message sync finished` summary.
- Each channel crawl has a bounded runtime budget; pathological channels are deferred and retried next sync.
- Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl.
- Marker cleanup is best-effort, so one missing local sync-state row cannot crash the run.
- Full sync member refresh is best-effort: when the caller supplies no deadline, it gives up after five minutes.
- When the archive is already complete, `sync --full` reuses backlog markers and limits steady-state refresh to live top-level channels plus active threads.
## See also
- [Sync sources](../guides/sync-sources.html)
- [`tail`](tail.html)
- [`update`](update.html)

33
docs/commands/tail.md Normal file
View File

@ -0,0 +1,33 @@
# `tail`
Runs the live Discord Gateway tail and a periodic repair loop.
## Usage
```bash
discrawl tail
discrawl tail --guild 123456789012345678
discrawl tail --repair-every 30m
```
## What it does
- connects to the Discord Gateway with the configured bot token
- writes new messages, edits, and deletes into the local archive as they arrive
- periodically runs a repair pass to catch anything the live stream missed
## Flags
- `--guild <id>` / `--guilds <id,id>` - tail a specific guild scope (default: `default_guild_id`, or all discovered guilds if unset)
- `--repair-every <duration>` - interval between repair sweeps
## Notes
- requires a working Discord bot token
- not available in Git-only mode (`discord.token_source = "none"`)
- terminates cleanly on SIGINT / SIGTERM and treats cancellation as normal exit
## See also
- [`sync`](sync.html)
- [Bot setup](../bot-setup.html)

47
docs/commands/tui.md Normal file
View File

@ -0,0 +1,47 @@
# `tui`
Opens the local terminal archive browser for stored messages.
## Usage
```bash
discrawl tui
discrawl tui --guild 123456789012345678 --channel general
discrawl tui --guilds 123,456 --author 1456464433768300635
discrawl tui --dm
discrawl --json tui --limit 50
```
## What it shows
The browser uses the shared crawlkit explorer:
- left pane: channel, person, or thread groups
- middle pane: newest matching message rows
- right pane: selected message detail, attachments, replies, and thread context
- footer: local DB or remote Git snapshot source
Mouse selection, right-click actions, sortable headers, refresh, and chat layout match the other crawlkit-backed archive tools.
## Flags
- `--guild <id>` / `--guilds <id,id>` - restrict the guild scope
- `--dm` - browse local direct messages under the synthetic `@me` guild
- `--channel <id|name|#name>` - restrict to one channel or DM conversation
- `--author <id|name>` - restrict to one author
- `--limit <n>` - newest rows to load (default 200)
- `--include-empty` - include rows with no displayable/searchable content
- `--json` - print crawlkit browser rows as JSON instead of opening the TUI
## Notes
- `tui` is read-only.
- without `--guild`, `--guilds`, or `--dm`, it uses `default_guild_id` when configured; otherwise it can browse all stored guild rows
- `--dm` only shows messages imported from the local Discord Desktop cache by [`wiretap`](wiretap.html)
- `--json` is useful for launchers and agents that want the same row shape without an interactive terminal
## See also
- [`messages`](messages.html)
- [`dms`](dms.html)
- [`wiretap`](wiretap.html)

36
docs/commands/update.md Normal file
View File

@ -0,0 +1,36 @@
# `update`
Forces a Git snapshot pull and import.
Routine imports are delta-planned from crawlkit shard fingerprints, with a Git-object fallback for older manifests. After a typical publish, only the changed tail shards are imported; unsafe table changes fall back to a full import.
## Usage
```bash
discrawl update
discrawl update --repo ~/.discrawl/share --remote https://github.com/example/discord-archive.git
discrawl update --with-embeddings
```
## Flags
- `--repo <path>` - local snapshot repo path (defaults to `[share].repo_path`)
- `--remote <url>` - target Git remote (defaults to `[share].remote`)
- `--branch <name>` - snapshot branch (defaults to `[share].branch`)
- `--with-embeddings` - also import vectors that match your local `[search.embeddings]` identity
## When to use it
- you have `share.remote` configured and want a fresh shard-delta import before running a command that does not auto-update (`sync` does not auto-import unless `--update=auto` is passed)
- you set `--no-auto-update` when subscribing and want to refresh on demand
- a CI job already imported the latest snapshot but read commands still consider it stale
## How `sync` interacts
`discrawl sync` does **not** auto-import the share unless `--update=auto` (only when stale) or `--update=force` (always). Routine live refreshes stay fast; explicit imports happen via `update`.
## See also
- [Git snapshots guide](../guides/git-snapshots.html)
- [`subscribe`](subscribe.html)
- [`sync`](sync.html)

47
docs/commands/wiretap.md Normal file
View File

@ -0,0 +1,47 @@
# `wiretap`
Imports classifiable Discord Desktop message payloads into the same local SQLite archive.
This is the path for searchable DMs because bot tokens cannot read personal direct messages.
`wiretap` is also available through `discrawl sync --source wiretap` and is included in the default `discrawl sync --source both` path.
## Usage
```bash
discrawl wiretap
discrawl wiretap --path "$HOME/Library/Application Support/discord"
discrawl wiretap --dry-run
discrawl wiretap --full-cache
discrawl wiretap --watch-every 2m
```
## Flags
- `--path <dir>` - override the desktop data directory (default: platform-specific Discord cache path)
- `--dry-run` - report what would be imported without writing anything
- `--full-cache` - exhaustive Chromium HTTP cache import for historical guild-cache archaeology (slower)
- `--watch-every <duration>` - keep importing on a periodic loop
- `--max-file-bytes <n>` - skip unusually large files (default 64 MiB)
## Notes
- stores classifiable cache messages in the same `guilds`, `channels`, and `messages` tables used by bot sync
- stores proven DMs under the synthetic guild id `@me`
- `@me` rows stay local-only: they are never included in `publish` output, Git snapshot imports, or embedding snapshots
- preserves existing local `@me` rows when importing a Git snapshot
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs; dropped rows are counted as `skipped_messages`
- imports what Discord Desktop has cached locally, not complete live DM history
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON, plus route-bearing Chromium HTTP cache entries by default
- does not extract, store, or print Discord auth tokens
## Default desktop paths
- macOS: `~/Library/Application Support/discord`
- Linux: `~/.config/discord`
## See also
- [Wiretap guide](../guides/wiretap.html)
- [`dms`](dms.html)
- [`sync`](sync.html)

77
docs/configuration.md Normal file
View File

@ -0,0 +1,77 @@
# Configuration
`discrawl init` writes a complete config so most users do not hand-edit anything initially. This page documents the full shape and override rules for when you do.
## File layout
```toml
version = 1
default_guild_id = ""
guild_ids = []
db_path = "~/.discrawl/discrawl.db"
cache_dir = "~/.discrawl/cache"
log_dir = "~/.discrawl/logs"
[discord]
token_source = "env" # use "none" for Git-only read access
token_env = "DISCORD_BOT_TOKEN"
token_keyring_service = "discrawl"
token_keyring_account = "discord_bot_token"
[sync]
source = "both" # "discord" for bot-only sync, "wiretap" for desktop-cache-only import
concurrency = 16
repair_every = "6h"
full_history = true
attachment_text = true
[desktop]
path = "~/.config/discord" # macOS default: "~/Library/Application Support/discord"
max_file_bytes = 67108864
full_cache = false
[search]
default_mode = "fts"
[search.embeddings]
enabled = false
provider = "openai"
model = "text-embedding-3-small"
api_key_env = "OPENAI_API_KEY"
batch_size = 64
[share]
remote = ""
repo_path = "~/.discrawl/share"
branch = "main"
auto_update = true
stale_after = "15m"
```
`concurrency` is auto-sized at `init` to `min(32, max(8, GOMAXPROCS*2))`.
## Token resolution
In order:
1. `DISCORD_BOT_TOKEN`, or the env var named in `discord.token_env`
2. OS keyring item `discrawl` / `discord_bot_token`, or the configured keyring service/account
`discrawl` accepts either raw token text or a value prefixed with `Bot `. Normalization is automatic.
Set `discord.token_source = "keyring"` if you want to require keyring lookup and skip env entirely. Set it to `"none"` for a Git-only reader.
## Override rules
- `--config <path>` beats everything
- `DISCRAWL_CONFIG=<path>` overrides the default config path
- `discord.token_source = "none"` disables live Discord access for Git-only readers
- `discord.token_source = "keyring"` skips env lookup
- `DISCRAWL_NO_AUTO_UPDATE=1` disables Git snapshot auto-update for read commands in one process
## Notes
- `default_guild_id` is the implicit scope for `sync`, `tail`, `digest`, and `analytics` when `--guild` is not passed
- `guild_ids` is reserved for explicit multi-guild fan-out; usually you do not set this directly
- changing `[search.embeddings]` provider/model/input version retargets pending jobs and resets prior attempts; existing vectors for another identity remain in SQLite but are not used for semantic search
- changing `db_path` does not migrate existing data; copy the file yourself if you want to keep history

6
docs/contact.md Normal file
View File

@ -0,0 +1,6 @@
# Contact
Discord archive search and analysis tooling.
- Source: [github.com/openclaw/discrawl](https://github.com/openclaw/discrawl)
- Issues: [github.com/openclaw/discrawl/issues](https://github.com/openclaw/discrawl/issues)

View File

@ -0,0 +1,51 @@
# Data layout
Everything lives in one local SQLite file. Default path: `~/.discrawl/discrawl.db`.
## What is stored
- guild metadata
- channels and threads in one table (Discord models threads as channels)
- current member snapshot
- canonical message rows
- append-only message event records
- FTS5 index rows
- optional local embedding queue metadata and vectors
Messages imported from Discord Desktop use the same message, attachment, mention, and FTS paths as bot-synced messages.
## DMs
Proven DMs use the synthetic guild id `@me`. Unclassifiable desktop-cache payloads are skipped instead of being stored as unknown synthetic data.
## Attachments
Attachment binaries are not stored in SQLite. Only attachment metadata, filenames, and (optionally) extracted text are stored.
Set `sync.attachment_text = false` if you want to keep attachment metadata and filenames but disable attachment body fetches for text indexing.
## Multi-guild ready
The schema is multi-guild ready even when the common UX stays single-guild simple. Threads are stored as channels because that matches the Discord model. Archived threads are part of the sync surface.
## Schema migrations
SQLite schema migrations are versioned with `PRAGMA user_version`. Startup fails fast when the local DB schema is newer than the running binary supports - that means your binary is older than the database.
## Querying directly
Anything you want, with read-only SQL:
```bash
discrawl sql 'select count(*) as messages from messages'
echo 'select guild_id, count(*) from messages group by guild_id' | discrawl sql -
```
See [`sql`](../commands/sql.html).
## See also
- [`status`](../commands/status.html) - high-level archive status
- [`channels`](../commands/channels.html) - channel directory
- [`members`](../commands/members.html) - member directory
- [Security](../security.html)

68
docs/guides/embeddings.md Normal file
View File

@ -0,0 +1,68 @@
# Embeddings
Embeddings are optional. FTS is the default search path and the primary verification target. Embeddings enrich recall in background batches; they do not block the hot sync path.
## Quick path
```bash
export OPENAI_API_KEY="..."
discrawl init --with-embeddings
discrawl sync --with-embeddings
discrawl embed --limit 1000
discrawl search --mode semantic "launch checklist"
discrawl search --mode hybrid "launch checklist"
```
## Two-phase pipeline
1. **Queue** - `sync --with-embeddings` writes `embedding_jobs` rows for new messages, changed normalized text, and messages without an existing job. The embedding provider is **not** called in this phase.
2. **Drain** - `discrawl embed` claims pending jobs with a short lock so overlapping runs do not process the same batch. It calls the configured provider, writes vectors to `message_embeddings` with provider, model, input version, dimensions, and binary vector data.
Behavior during drain:
- rate limits requeue the batch and stop that drain run cleanly
- provider or validation failures retry up to three attempts before marking the job failed
- messages with no normalized text are marked done and any stale vector for that message is removed
## Identity (provider, model, input version)
The identity is stored on each job and vector. If you change provider or model:
- pending jobs are retargeted to the new identity
- prior attempts are reset
- existing vectors for another identity remain in SQLite but are not used for semantic search
Use `--rebuild` when you want to regenerate vectors for the existing archive after a config change:
```bash
discrawl embed --rebuild --limit 1000
```
## Local provider example
```toml
[search.embeddings]
enabled = true
provider = "ollama"
model = "nomic-embed-text"
```
With local providers, message and query embedding both happen on the same machine. With remote providers, message text is sent during `discrawl embed`, and search query text is sent during `--mode semantic` or `--mode hybrid` calls.
## Git snapshot interaction
By default, `publish` does not export embeddings. Use `--with-embeddings`:
```bash
discrawl publish --with-embeddings --push
discrawl subscribe --with-embeddings https://github.com/example/discord-archive.git
discrawl update --with-embeddings
```
The snapshot stores vectors under `embeddings/<provider>/<model>/<input_version>/...` and records that identity in `manifest.json`. Only vectors for non-DM messages are exported. Import only restores matching embedding manifests, so an Ollama/nomic subscriber does not accidentally import OpenAI/text-embedding vectors. `embedding_jobs` is never exported; subscribers that want fresh local vectors run `discrawl embed --rebuild`. Publishing without `--with-embeddings` omits embedding manifests instead of carrying forward an older bundle.
## See also
- [Search modes](search-modes.html)
- [`embed`](../commands/embed.html)
- [Configuration](../configuration.html)

View File

@ -0,0 +1,84 @@
# Git-backed snapshots
Discrawl can publish the SQLite archive as sharded, compressed NDJSON snapshots in a private Git repo, then auto-import that repo before local read commands. This gives readers org memory without Discord credentials.
Snapshot packing/import and git mirror mechanics are shared through
`crawlkit`. Discrawl still owns Discord-specific privacy policy: `@me` direct
messages, wiretap sync state, and local-only desktop rows are excluded from
published snapshots and are preserved locally on import.
## Publisher
```bash
discrawl publish --remote https://github.com/example/discord-archive.git --push
discrawl publish --readme path/to/discord-backup/README.md --push
```
The publisher uses your existing bot-synced archive. It exports non-DM tables only.
## Subscriber
```bash
discrawl subscribe https://github.com/example/discord-archive.git
discrawl search "launch checklist"
discrawl messages --channel general --hours 24
```
`subscribe` is the Git-only setup path. It writes a config with `discord.token_source = "none"`, imports the snapshot, and does not require a Discord bot token. `sync` and `tail` remain disabled in this mode because they need live Discord access.
## Auto-update
Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`):
```bash
discrawl subscribe --stale-after 15m https://github.com/example/discord-archive.git
discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git
```
`discrawl update` forces the same pull/import step manually. Snapshot imports are delta-planned from crawlkit shard fingerprints. Older manifests without those fields fall back to Git blob identity, so after a typical publish only the changed message tail shard plus small cursor tables are imported. Unsafe table-shape changes still fall back to a full import.
`discrawl sync` does **not** auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast.
## Hybrid mode
Keep normal Discord credentials configured **and** set `share.remote`:
```bash
discrawl sync --update=auto # import snapshot delta first, then live deltas
discrawl messages --sync # blocking pre-query sync for matched scope
discrawl sync --all-channels # broader live repair
discrawl sync --full # historical backfill
```
## What is published
- non-DM archive tables (DM `@me` rows are always excluded)
- README activity block - latest update time, latest archived message, archive totals, day/week/month activity
- `embedding_jobs` is never exported
## Backing up vectors
```bash
discrawl publish --with-embeddings --push
discrawl subscribe --with-embeddings https://github.com/example/discord-archive.git
discrawl update --with-embeddings
```
Vectors are stored under `embeddings/<provider>/<model>/<input_version>/...`. Import only restores matching identities; Ollama/nomic subscribers do not accidentally pick up OpenAI/text-embedding vectors. Publishing without `--with-embeddings` omits embedding manifests instead of carrying forward an older bundle.
## CI
The Docker smoke test installs `discrawl` in a clean Go container, subscribes to a Git snapshot repo, then checks `search`, `messages`, `sql`, and `report`:
```bash
DISCRAWL_DOCKER_TEST=1 go test ./internal/cli -run TestDockerGitSourceSmoke -count=1
```
The backup workflows restore and save `.discrawl-ci/discrawl.db` with `actions/cache`. On a warm runner cache, scheduled publishers skip the pre-sync snapshot import and go straight to the live latest-message delta before publishing. Cache misses still import the latest published snapshot first so `--latest-only` has channel cursors to resume from.
## See also
- [`publish`](../commands/publish.html)
- [`subscribe`](../commands/subscribe.html)
- [`update`](../commands/update.html)
- [`report`](../commands/report.html)

View File

@ -0,0 +1,57 @@
# Search modes
`discrawl search` has three modes. FTS is the default and works with no embeddings.
## Modes
- **`fts`** (default) - searches the local SQLite FTS5 index, returns newest matching messages first
- **`semantic`** - embeds the query, scores against locally stored message vectors; errors out cleanly if embeddings are disabled or no compatible vectors exist
- **`hybrid`** - runs FTS and semantic, deduplicates by message id, falls back to FTS when semantic is unavailable
## FTS details
- backed by SQLite FTS5 with the default `unicode61` tokenizer
- user query terms are parameterized and quoted before `MATCH`, so tokens like `AND`, `OR`, `NOT`, `NEAR`, and `*` are searched as input terms instead of FTS operators
- punctuation still follows FTS5 tokenization rules
- by default, `search` skips rows with no searchable content (attachment text, attachment filenames, embeds, and replies still count as content); use `--include-empty` to opt back in
## Semantic and hybrid prerequisites
- `[search.embeddings]` configured in `~/.discrawl/config.toml`
- local `message_embeddings` rows for the configured provider, model, and input version
- input version is currently `message_normalized_v1`, so vectors are tied to normalized message text rather than raw Discord payloads
Two-phase embedding creation:
1. `discrawl sync --with-embeddings` queues changed messages by writing `embedding_jobs` rows. New messages, changed normalized text, and messages without an existing job are queued. This phase does not call the embedding provider.
2. `discrawl embed` drains pending jobs in bounded batches, calls the configured provider, and writes vectors to `message_embeddings`.
## Provider/model identity
The provider/model/input-version identity is stored on each job and vector. If you change provider or model, pending jobs are retargeted to the new identity and prior attempts are reset. Existing vectors for another identity remain in SQLite, but semantic search only reads vectors compatible with the current config.
Use `discrawl embed --rebuild` when changing provider, model, or input settings and you want to regenerate vectors for the existing archive.
## Local vs remote providers
Local providers like Ollama keep both message and query embedding on the same machine. With remote providers (OpenAI, etc.), message text is sent during `discrawl embed`, and search query text is sent when using `--mode semantic` or `--mode hybrid`. Stored message text is not sent during local vector scoring.
## Examples
```bash
discrawl search "panic: nil pointer"
discrawl search --mode fts "panic: nil pointer"
discrawl search --mode semantic "missing launch checklist"
discrawl search --mode hybrid "database timeout"
discrawl search --guild 123456789012345678 "payment failed"
discrawl search --dm "launch checklist"
discrawl search --channel billing --author steipete --limit 50 "invoice"
discrawl search --include-empty "GitHub"
discrawl --json search "websocket closed"
```
## See also
- [`search`](../commands/search.html)
- [`embed`](../commands/embed.html)
- [Embeddings](embeddings.html)

View File

@ -0,0 +1,57 @@
# Sync sources
Discrawl reads from two archive sources: the live Discord bot API and the local Discord Desktop cache. Either or both can run in a single `sync`.
## Sources
| Source | Reads from | Stores |
| --- | --- | --- |
| `both` | Discord bot API and local Discord Desktop cache | bot-visible guild data plus classifiable cached desktop messages |
| `discord` / `key` / `bot` / `api` | Discord bot API | guilds, channels, threads, members, and messages the bot can access |
| `wiretap` / `desktop` / `cache` | local Discord Desktop cache files | classifiable cached messages; proven DMs are stored under `@me` |
The default is `both`. Pick one with `--source` or by setting `[sync].source` in config.
## Bot sync modes
Sync modes control the Discord bot API side of a run. When `wiretap` is selected, the desktop cache import runs once alongside the chosen bot sync mode.
| Command | Use when | Behavior |
| --- | --- | --- |
| `discrawl sync` | routine refresh | skips member refreshes, checks live top-level channels plus active threads, only fetches new messages for channels with a stored latest cursor |
| `discrawl sync --update=auto` | hybrid Git/live refresh | imports a stale Git snapshot first, usually as a changed-shard delta, then runs the routine live refresh |
| `discrawl sync --all-channels` | repair pass | broad incremental sweep across every stored channel/thread, including archived threads |
| `discrawl sync --full` | historical backfill | crawls older history until channels are complete; can take a long time on large servers |
Run one explicit `--full` pass when you want a complete historical guild archive. Use plain `sync` afterward for frequent latest-message and desktop-cache refreshes.
## Concurrency
`sync` already uses parallel channel workers for bot API message crawling. The default is auto-sized from `GOMAXPROCS` with a floor of `8` and a cap of `32`. Override with `--concurrency`.
## Targeting
- `--guild <id>` runs only that guild
- `--guilds 123,456` runs an explicit set
- `--all` ignores `default_guild_id` and fans out across every discovered guild
- `--channels 111,222` targets specific channels (forum ids expand to their threads)
- `--since <RFC3339>` limits initial history and `--full` backfill to messages at or after the timestamp; older history is not marked complete, so a later `sync --full` without `--since` can continue the backfill
## Performance and resilience
- Long runs emit periodic progress logs to stderr.
- If in-flight channels stop completing for a while, `discrawl` emits `message sync waiting` heartbeat logs with the oldest active channel, per-channel page activity, and skip/defer counters.
- Every run ends with a `message sync finished` summary.
- Each channel crawl has a bounded runtime budget; pathological channels are deferred and retried on the next sync.
- Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl.
- Marker cleanup is best-effort, so one missing local sync-state row cannot crash the run.
- Full sync member refresh is best-effort: when the caller supplies no deadline, it gives up after five minutes, so message sync completion is not held hostage by a slow guild member crawl.
- When the archive is already complete, `sync --full` reuses backlog markers and limits steady-state refresh to live top-level channels plus active threads instead of revisiting every stored archived thread.
- If a guild already has a local member snapshot, routine syncs reuse it and skip another full member crawl until that snapshot ages out.
## See also
- [`sync`](../commands/sync.html)
- [`tail`](../commands/tail.html)
- [Wiretap](wiretap.html)
- [Git snapshots](git-snapshots.html)

61
docs/guides/wiretap.md Normal file
View File

@ -0,0 +1,61 @@
# Desktop wiretap
`wiretap` imports classifiable Discord Desktop message payloads into the same local SQLite archive used by bot sync. It is the path for searchable DMs because bot tokens cannot read personal direct messages.
`wiretap` is also available through `discrawl sync --source wiretap` and is included in the default `discrawl sync --source both` path.
## What it does
- stores classifiable cache messages in the same `guilds`, `channels`, and `messages` tables used by bot sync
- stores proven DMs under the synthetic guild id `@me`
- preserves existing local `@me` guilds, channels, messages, and attachments when importing a Git snapshot, so a shared guild mirror refresh does not wipe local wiretap DM search
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs; dropped rows are counted as `skipped_messages`
- imports what Discord Desktop has cached locally - not complete live DM history
## What it does not do
- does not extract, store, or print Discord auth tokens
- does not use a user token
- does not call the Discord API as your user
- does not run as a selfbot
## DM privacy: `@me` stays local
`@me` rows are local-only. They are excluded from:
- `publish` (Git snapshot output)
- `subscribe` / Git snapshot import
- `--with-embeddings` snapshot export
Excluded categories: DM guilds, channels, messages, events, attachments, mentions, wiretap sync state, and vectors for DM messages.
## What gets scanned
- local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON
- route-bearing Chromium HTTP cache entries by default
- `--full-cache` (or `desktop.full_cache = true`) enables exhaustive Chromium cache import for slower historical guild-cache archaeology
- `--max-file-bytes` skips unusually large files (default 64 MiB)
## Flags
```bash
discrawl wiretap
discrawl wiretap --path "$HOME/Library/Application Support/discord"
discrawl wiretap --dry-run
discrawl wiretap --full-cache
discrawl wiretap --watch-every 2m
```
`--watch-every` keeps the import running on a periodic loop. `--dry-run` reports what would be imported without writing anything.
## Default desktop paths
- macOS: `~/Library/Application Support/discord`
- Linux: `~/.config/discord`
- override via `--path` or `[desktop].path`
## See also
- [`wiretap`](../commands/wiretap.html)
- [`dms`](../commands/dms.html) - convenience layer over `@me`
- [Sync sources](sync-sources.html)

13
docs/index.html Normal file
View File

@ -0,0 +1,13 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta http-equiv="refresh" content="0; url=README.html">
<link rel="canonical" href="README.html">
<title>Discrawl docs</title>
</head>
<body>
<p><a href="README.html">Discrawl docs</a></p>
</body>
</html>

66
docs/install.md Normal file
View File

@ -0,0 +1,66 @@
# Install
Discrawl is a single Go binary. Install via Homebrew or build from source.
## Homebrew
```bash
brew install steipete/tap/discrawl
discrawl --version
```
Homebrew taps `steipete/tap` automatically when you install using the fully qualified formula name.
## From source
Requires Go `1.26+`.
```bash
git clone https://github.com/openclaw/discrawl.git
cd discrawl
go build -o bin/discrawl ./cmd/discrawl
./bin/discrawl --version
```
If you do not put `discrawl` on `PATH`, replace `discrawl` with `./bin/discrawl` in any example below.
## Quick start (with bot token)
```bash
export DISCORD_BOT_TOKEN="your-bot-token"
discrawl init
discrawl doctor
discrawl sync --full
discrawl sync
discrawl search "panic: nil pointer"
discrawl tail
```
`init` discovers accessible guilds and writes `~/.discrawl/config.toml`. If exactly one guild is available, it becomes the default automatically.
`doctor` verifies the config loads, the token resolves, the bot can reach the Gateway, and the local DB and FTS index are wired up.
## Quick start (Git-only reader)
No Discord credentials are required. You read a private Git snapshot that another machine published.
```bash
discrawl subscribe https://github.com/example/discord-archive.git
discrawl search "launch checklist"
discrawl messages --channel general --hours 24
```
`subscribe` writes a token-free config (`discord.token_source = "none"`) and imports the snapshot. Read commands auto-refresh when the local snapshot is older than `15m`.
## Default runtime paths
- config: `~/.discrawl/config.toml`
- database: `~/.discrawl/discrawl.db`
- cache: `~/.discrawl/cache/`
- logs: `~/.discrawl/logs/`
## Next steps
- [Bot setup](bot-setup.html) - intents, permissions, token sources
- [Configuration](configuration.html) - the full TOML shape and override rules
- [`sync`](commands/sync.html) - the main archive command

49
docs/security.md Normal file
View File

@ -0,0 +1,49 @@
# Security
## Tokens and credentials
- Do not commit bot tokens or API keys.
- Default config lives in your home directory, not inside the repo.
- Prefer env vars or the OS keyring for bot tokens.
- `discrawl doctor` reports the token source (env or keyring), not token contents.
## Wiretap is local-only
`wiretap` reads local Discord Desktop cache files only. It does not:
- extract, store, or print Discord auth tokens
- use a user token
- call the Discord API as your user
- run as a selfbot
Wiretap DM messages stay local. They are stored under the synthetic guild id `@me` and are never included in:
- `publish` (Git snapshot output)
- `subscribe` / Git snapshot import
- the optional `--with-embeddings` snapshot export
A shared guild mirror refresh does not wipe local wiretap DM search either - import preserves existing local `@me` guilds, channels, messages, and attachments.
## CI
CI runs secret scanning with `gitleaks` against git history and the working tree.
## What is stored locally
- guild metadata
- channels and threads (one table)
- current member snapshot
- canonical message rows
- append-only message event records
- FTS index rows
- optional local embedding queue metadata and vectors
Attachment binaries are not stored in SQLite; only attachment metadata and (optionally) extracted text are stored.
Set `sync.attachment_text = false` if you want to keep attachment metadata and filenames but disable attachment body fetches for text indexing.
## What is sent over the wire
With remote embedding providers, message text is sent during `discrawl embed`, and search query text is sent when using `--mode semantic` or `--mode hybrid`. Stored message text is not sent during local vector scoring.
Local providers like Ollama keep both message and query embedding on the same machine.

BIN
docs/social-card.png Normal file

Binary file not shown.

After

Width:  |  Height:  |  Size: 146 KiB

79
docs/social-card.svg Normal file
View File

@ -0,0 +1,79 @@
<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="630" viewBox="0 0 1200 630" role="img" aria-labelledby="title desc">
<title id="title">Discrawl social card</title>
<desc id="desc">Discrawl mirrors Discord into SQLite for local search and analysis.</desc>
<defs>
<linearGradient id="bg" x1="0" y1="0" x2="1" y2="1">
<stop offset="0" stop-color="#0b0f16"/>
<stop offset="0.58" stop-color="#111723"/>
<stop offset="1" stop-color="#151827"/>
</linearGradient>
<linearGradient id="accent" x1="0" y1="0" x2="1" y2="0">
<stop offset="0" stop-color="#5fe3d4"/>
<stop offset="0.56" stop-color="#a594ff"/>
<stop offset="1" stop-color="#f364a2"/>
</linearGradient>
<linearGradient id="terminal" x1="0" y1="0" x2="0" y2="1">
<stop offset="0" stop-color="#161d2a"/>
<stop offset="1" stop-color="#0f141d"/>
</linearGradient>
<filter id="shadow" x="-10%" y="-15%" width="120%" height="130%">
<feDropShadow dx="0" dy="22" stdDeviation="22" flood-color="#000000" flood-opacity="0.45"/>
</filter>
<filter id="softGlow" x="-40%" y="-40%" width="180%" height="180%">
<feGaussianBlur stdDeviation="36"/>
</filter>
</defs>
<rect width="1200" height="630" fill="url(#bg)"/>
<circle cx="1030" cy="92" r="210" fill="#5fe3d4" opacity="0.11" filter="url(#softGlow)"/>
<circle cx="104" cy="568" r="240" fill="#f364a2" opacity="0.10" filter="url(#softGlow)"/>
<path d="M0 515 C190 438 330 548 512 472 S874 330 1200 410 L1200 630 L0 630 Z" fill="#0a0d13" opacity="0.55"/>
<path d="M0 534 C206 456 338 570 520 492 S884 360 1200 438" fill="none" stroke="url(#accent)" stroke-width="3" opacity="0.44"/>
<g transform="translate(72 70)">
<rect x="0" y="0" width="112" height="112" rx="22" fill="#0c0f14" stroke="#253244" stroke-width="2"/>
<rect x="23" y="28" width="66" height="47" rx="5" fill="none" stroke="#5fe3d4" stroke-width="4"/>
<line x1="23" y1="43" x2="89" y2="43" stroke="#5fe3d4" stroke-width="3"/>
<circle cx="33" cy="36" r="2.8" fill="#f364a2"/>
<circle cx="43" cy="36" r="2.8" fill="#f7c177"/>
<circle cx="53" cy="36" r="2.8" fill="#5fe3d4"/>
<text x="31" y="59" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="10" font-weight="800" fill="#5fe3d4">SELECT</text>
<text x="31" y="71" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="10" font-weight="800" fill="#aab3c1">msgs</text>
<rect x="23" y="84" width="66" height="6" rx="3" fill="#161b24"/>
<rect x="23" y="84" width="42" height="6" rx="3" fill="#5fe3d4"/>
</g>
<text x="205" y="126" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="28" font-weight="800" letter-spacing="2" fill="#5fe3d4">discrawl.sh</text>
<text x="72" y="248" font-family="Inter, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="96" font-weight="800" letter-spacing="-3" fill="#edf4fb">Discord history,</text>
<text x="72" y="346" font-family="Inter, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="96" font-weight="800" letter-spacing="-3" fill="#edf4fb">local answers.</text>
<text x="74" y="410" font-family="Inter, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="30" font-weight="560" fill="#aab3c1">Mirror Discord into SQLite.</text>
<text x="74" y="450" font-family="Inter, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="30" font-weight="560" fill="#aab3c1">Search, query, tail, and analyze locally.</text>
<g transform="translate(72 505)">
<rect x="0" y="0" width="210" height="54" rx="10" fill="#5fe3d4"/>
<text x="28" y="35" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="20" font-weight="900" fill="#081016">discrawl sync</text>
<rect x="230" y="0" width="228" height="54" rx="10" fill="#151d29" stroke="#263448" stroke-width="2"/>
<text x="258" y="35" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="20" font-weight="800" fill="#f364a2">discrawl search</text>
</g>
<g transform="translate(742 135)" filter="url(#shadow)">
<rect x="0" y="0" width="386" height="330" rx="20" fill="url(#terminal)" stroke="#263448" stroke-width="2"/>
<rect x="0" y="0" width="386" height="54" rx="20" fill="#121925"/>
<path d="M0 34 Q0 0 34 0 H352 Q386 0 386 34 V54 H0 Z" fill="#121925"/>
<circle cx="30" cy="27" r="7" fill="#f364a2"/>
<circle cx="54" cy="27" r="7" fill="#f7c177"/>
<circle cx="78" cy="27" r="7" fill="#5fe3d4"/>
<text x="112" y="34" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="16" font-weight="800" fill="#657287">sqlite archive</text>
<text x="28" y="95" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="20" font-weight="800" fill="#5fe3d4">$ discrawl wiretap</text>
<text x="28" y="132" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="650" fill="#6f7b8d">dm cache imported: 814</text>
<text x="28" y="180" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="20" font-weight="800" fill="#5fe3d4">$ discrawl sql</text>
<text x="28" y="218" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="650" fill="#edf4fb">312k messages</text>
<text x="28" y="255" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="650" fill="#edf4fb">14k attachments</text>
<text x="28" y="292" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="650" fill="#edf4fb">FTS5 ready</text>
<rect x="286" y="260" width="72" height="10" rx="5" fill="#263448"/>
<rect x="312" y="282" width="46" height="10" rx="5" fill="#263448"/>
<rect x="298" y="304" width="60" height="10" rx="5" fill="#263448"/>
</g>
<text x="72" y="600" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="800" fill="#657287">bot sync + desktop wiretap + FTS5 + semantic search</text>
</svg>

After

Width:  |  Height:  |  Size: 5.9 KiB

31
go.mod
View File

@ -1,30 +1,53 @@
module github.com/steipete/discrawl
module github.com/openclaw/discrawl
go 1.26.2
go 1.26.3
require (
github.com/bwmarrin/discordgo v0.29.0
github.com/gorilla/websocket v1.5.3
github.com/pelletier/go-toml/v2 v2.3.1
github.com/stretchr/testify v1.11.1
github.com/zalando/go-keyring v0.2.8
golang.org/x/sys v0.43.0
golang.org/x/text v0.36.0
modernc.org/sqlite v1.50.0
)
require (
github.com/charmbracelet/bubbles v1.0.0 // indirect
github.com/clipperhouse/displaywidth v0.11.0 // indirect
github.com/clipperhouse/uax29/v2 v2.7.0 // indirect
github.com/pelletier/go-toml/v2 v2.3.1 // indirect
modernc.org/sqlite v1.50.0 // indirect
)
require (
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
github.com/charmbracelet/bubbletea v1.3.10 // indirect
github.com/charmbracelet/colorprofile v0.4.1 // indirect
github.com/charmbracelet/lipgloss v1.1.0 // indirect
github.com/charmbracelet/x/ansi v0.11.7 // indirect
github.com/charmbracelet/x/cellbuf v0.0.15 // indirect
github.com/charmbracelet/x/term v0.2.2 // indirect
github.com/danieljoos/wincred v1.2.3 // indirect
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/dustin/go-humanize v1.0.1 // indirect
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
github.com/godbus/dbus/v5 v5.2.2 // indirect
github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 // indirect
github.com/google/uuid v1.6.0 // indirect
github.com/kr/pretty v0.3.1 // indirect
github.com/lucasb-eyer/go-colorful v1.4.0 // indirect
github.com/mattn/go-isatty v0.0.22 // indirect
github.com/mattn/go-localereader v0.0.1 // indirect
github.com/mattn/go-runewidth v0.0.23 // indirect
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
github.com/muesli/cancelreader v0.2.2 // indirect
github.com/muesli/termenv v0.16.0 // indirect
github.com/ncruces/go-strftime v1.0.0 // indirect
github.com/openclaw/crawlkit v0.5.0
github.com/pmezard/go-difflib v1.0.0 // indirect
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
github.com/rivo/uniseg v0.4.7 // indirect
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
golang.org/x/crypto v0.50.0 // indirect
golang.org/x/tools v0.44.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect

43
go.sum
View File

@ -1,5 +1,25 @@
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
github.com/bwmarrin/discordgo v0.29.0 h1:FmWeXFaKUwrcL3Cx65c20bTRW+vOb6k8AnaP+EgjDno=
github.com/bwmarrin/discordgo v0.29.0/go.mod h1:NJZpH+1AfhIcyQsPeuBKsUtYrRnjkyu0kIVMCHkZtRY=
github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc=
github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E=
github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk=
github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk=
github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
github.com/charmbracelet/x/ansi v0.11.7 h1:kzv1kJvjg2S3r9KHo8hDdHFQLEqn4RBCb39dAYC84jI=
github.com/charmbracelet/x/ansi v0.11.7/go.mod h1:9qGpnAVYz+8ACONkZBUWPtL7lulP9No6p1epAihUZwQ=
github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI=
github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q=
github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
github.com/clipperhouse/displaywidth v0.11.0 h1:lBc6kY44VFw+TDx4I8opi/EtL9m20WSEFgwIwO+UVM8=
github.com/clipperhouse/displaywidth v0.11.0/go.mod h1:bkrFNkf81G8HyVqmKGxsPufD3JhNl3dSqnGhOoSD/o0=
github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk=
github.com/clipperhouse/uax29/v2 v2.7.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
github.com/danieljoos/wincred v1.2.3 h1:v7dZC2x32Ut3nEfRH+vhoZGvN72+dQ/snVXo/vMFLdQ=
github.com/danieljoos/wincred v1.2.3/go.mod h1:6qqX0WNrS4RzPZ1tnroDzq9kY3fu1KwE7MRLQK4X0bs=
@ -7,6 +27,8 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
github.com/godbus/dbus/v5 v5.2.2 h1:TUR3TgtSVDmjiXOgAAyaZbYmIeP3DPkld3jgKGV8mXQ=
github.com/godbus/dbus/v5 v5.2.2/go.mod h1:3AAv2+hPq5rdnr5txxxRwiGjPXamgoIHgz9FPBfOp3c=
github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 h1:EwtI+Al+DeppwYX2oXJCETMO23COyaKGP6fHVpkpWpg=
@ -25,10 +47,24 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
github.com/lucasb-eyer/go-colorful v1.4.0 h1:UtrWVfLdarDgc44HcS7pYloGHJUjHV/4FwW4TvVgFr4=
github.com/lucasb-eyer/go-colorful v1.4.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
github.com/mattn/go-isatty v0.0.22 h1:j8l17JJ9i6VGPUFUYoTUKPSgKe/83EYU2zBC7YNKMw4=
github.com/mattn/go-isatty v0.0.22/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4=
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
github.com/mattn/go-runewidth v0.0.23 h1:7ykA0T0jkPpzSvMS5i9uoNn2Xy3R383f9HDx3RybWcw=
github.com/mattn/go-runewidth v0.0.23/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
github.com/openclaw/crawlkit v0.5.0 h1:sVqIbQ5v6LiOf+NXcVj93UhfoaJqMbBlrd1lU6uhO9M=
github.com/openclaw/crawlkit v0.5.0/go.mod h1:/AI8o/DeRqXPZJPHq/9mGUjNzLPskm/wTjikRPxEdHY=
github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc=
github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
@ -36,23 +72,30 @@ github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZb
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
github.com/zalando/go-keyring v0.2.8 h1:6sD/Ucpl7jNq10rM2pgqTs0sZ9V3qMrqfIIy5YPccHs=
github.com/zalando/go-keyring v0.2.8/go.mod h1:tsMo+VpRq5NGyKfxoBVjCuMrG47yj8cmakZDO5QGii0=
golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4=
golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI=
golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM=
golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU=
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=

View File

@ -13,12 +13,13 @@ import (
"syscall"
"time"
"github.com/steipete/discrawl/internal/config"
"github.com/steipete/discrawl/internal/discord"
"github.com/steipete/discrawl/internal/discorddesktop"
"github.com/steipete/discrawl/internal/embed"
"github.com/steipete/discrawl/internal/store"
"github.com/steipete/discrawl/internal/syncer"
"github.com/openclaw/crawlkit/embed"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/discord"
"github.com/openclaw/discrawl/internal/discorddesktop"
"github.com/openclaw/discrawl/internal/share"
"github.com/openclaw/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/syncer"
)
type syncSources struct {
@ -113,9 +114,19 @@ func (r *runtime) runSync(args []string) error {
latestOnly := fs.Bool("latest-only", false, "")
guildsFlag := fs.String("guilds", "", "")
guildFlag := fs.String("guild", "", "")
updateMode := fs.String("update", "", "")
noUpdate := fs.Bool("no-update", false, "")
if err := fs.Parse(args); err != nil {
return usageErr(err)
}
if *noUpdate && strings.TrimSpace(*updateMode) != "" && !strings.EqualFold(strings.TrimSpace(*updateMode), string(shareUpdateNever)) {
return usageErr(errors.New("use either --no-update or --update, not both"))
}
if strings.TrimSpace(*updateMode) != "" {
if _, err := parseShareUpdateMode(*updateMode); err != nil {
return usageErr(err)
}
}
sources, err := parseSyncSources(*source)
if err != nil {
return usageErr(err)
@ -151,6 +162,7 @@ func (r *runtime) runSync(args []string) error {
func (r *runtime) runSyncLocked(sources syncSources, opts syncer.SyncOptions) error {
var apiStats *syncer.SyncStats
if sources.discord {
r.setSyncLockPhase("discord sync")
shouldClose := r.client == nil
if err := r.ensureDiscordServices(); err != nil {
return err
@ -166,9 +178,11 @@ func (r *runtime) runSyncLocked(sources syncSources, opts syncer.SyncOptions) er
}
var wiretapStats *discorddesktop.Stats
if sources.wiretap {
r.setSyncLockPhase("wiretap import")
stats, err := discorddesktop.Import(r.ctx, r.store, discorddesktop.Options{
Path: r.cfg.Desktop.Path,
MaxFileBytes: r.cfg.Desktop.MaxFileBytes,
FullCache: r.cfg.Desktop.FullCache,
Now: r.now,
})
if err != nil {
@ -250,6 +264,7 @@ func (r *runtime) runWiretap(args []string) error {
fs.SetOutput(io.Discard)
path := fs.String("path", r.cfg.Desktop.Path, "")
maxFileBytes := fs.Int64("max-file-bytes", r.cfg.Desktop.MaxFileBytes, "")
fullCache := fs.Bool("full-cache", r.cfg.Desktop.FullCache, "")
dryRun := fs.Bool("dry-run", false, "")
watchEvery := fs.Duration("watch-every", 0, "")
if err := fs.Parse(args); err != nil {
@ -265,6 +280,7 @@ func (r *runtime) runWiretap(args []string) error {
stats, err := discorddesktop.Import(ctx, r.store, discorddesktop.Options{
Path: *path,
MaxFileBytes: *maxFileBytes,
FullCache: *fullCache,
DryRun: *dryRun,
Now: r.now,
})
@ -299,16 +315,37 @@ func (r *runtime) runWiretap(args []string) error {
}
func (r *runtime) runStatus(args []string) error {
if len(args) != 0 {
fs := flag.NewFlagSet("status", flag.ContinueOnError)
fs.SetOutput(io.Discard)
jsonOut := fs.Bool("json", false, "")
if err := fs.Parse(args); err != nil {
return usageErr(err)
}
if fs.NArg() != 0 {
return usageErr(errors.New("status takes no arguments"))
}
if *jsonOut {
r.json = true
}
dbPath, err := config.ExpandPath(r.cfg.DBPath)
if err != nil {
return configErr(err)
}
status, err := r.store.Status(r.ctx, dbPath, r.cfg.EffectiveDefaultGuildID())
if err != nil {
return err
status := store.Status{DBPath: dbPath, DefaultGuildID: r.cfg.EffectiveDefaultGuildID()}
if r.store != nil {
status, err = r.store.Status(r.ctx, dbPath, r.cfg.EffectiveDefaultGuildID())
if err != nil {
return err
}
}
if r.json {
needsUpdate := false
if r.store != nil && r.cfg.ShareEnabled() {
if staleAfter, err := time.ParseDuration(r.cfg.Share.StaleAfter); err == nil {
needsUpdate = share.NeedsImport(r.ctx, r.store, staleAfter)
}
}
return r.print(controlStatus(r.configPath, r.cfg, status, needsUpdate))
}
return r.print(status)
}
@ -337,7 +374,7 @@ func (r *runtime) runEmbed(args []string) error {
providerFactory := r.newEmbed
if providerFactory == nil {
providerFactory = func(cfg config.EmbeddingsConfig) (embed.Provider, error) {
return embed.NewProvider(cfg)
return embed.NewProvider(crawlkitEmbeddingConfig(cfg))
}
}
provider, err := providerFactory(r.cfg.Search.Embeddings)
@ -369,9 +406,18 @@ func (r *runtime) runEmbed(args []string) error {
}
func (r *runtime) runDoctor(args []string) error {
if len(args) != 0 {
fs := flag.NewFlagSet("doctor", flag.ContinueOnError)
fs.SetOutput(io.Discard)
jsonOut := fs.Bool("json", false, "")
if err := fs.Parse(args); err != nil {
return usageErr(err)
}
if fs.NArg() != 0 {
return usageErr(errors.New("doctor takes no arguments"))
}
if *jsonOut {
r.json = true
}
report := map[string]any{
"config_path": r.configPath,
}
@ -389,7 +435,7 @@ func (r *runtime) runDoctor(args []string) error {
report["share_stale_after"] = cfg.Share.StaleAfter
}
if cfg.Search.Embeddings.Enabled {
check := embed.CheckProvider(r.ctx, cfg.Search.Embeddings)
check := embed.CheckProvider(r.ctx, crawlkitEmbeddingConfig(cfg.Search.Embeddings))
report["embeddings"] = check.Status
report["embeddings_provider"] = check.Provider
report["embeddings_model"] = check.Model

View File

@ -7,7 +7,7 @@ import (
"io"
"strings"
"github.com/steipete/discrawl/internal/report"
"github.com/openclaw/discrawl/internal/report"
)
func (r *runtime) runAnalytics(args []string) error {
@ -20,11 +20,11 @@ func (r *runtime) runAnalytics(args []string) error {
subArgs := args[1:]
switch subcommand {
case "quiet":
return r.withLocalStoreDefaultLocked(true, true, func() error {
return r.withLocalStoreRead(true, func() error {
return r.runAnalyticsQuiet(subArgs)
})
case "trends":
return r.withLocalStoreDefaultLocked(true, true, func() error {
return r.withLocalStoreRead(true, func() error {
return r.runAnalyticsTrends(subArgs)
})
default:

View File

@ -10,8 +10,8 @@ import (
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/config"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/store"
)
func TestAnalyticsCommand(t *testing.T) {

View File

@ -11,12 +11,12 @@ import (
"time"
"github.com/bwmarrin/discordgo"
"github.com/steipete/discrawl/internal/config"
"github.com/steipete/discrawl/internal/discord"
"github.com/steipete/discrawl/internal/embed"
"github.com/steipete/discrawl/internal/share"
"github.com/steipete/discrawl/internal/store"
"github.com/steipete/discrawl/internal/syncer"
"github.com/openclaw/crawlkit/embed"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/discord"
"github.com/openclaw/discrawl/internal/share"
"github.com/openclaw/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/syncer"
)
type cliError struct {
@ -47,6 +47,10 @@ func ExitCode(err error) int {
}
func Run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
if len(args) == 0 || args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
printUsage(stdout)
return nil
}
global := flag.NewFlagSet("discrawl", flag.ContinueOnError)
global.SetOutput(io.Discard)
configPath := global.String("config", "", "")
@ -66,10 +70,14 @@ func Run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
return nil
}
rest := global.Args()
if len(rest) == 0 || rest[0] == "help" {
if len(rest) == 0 || rest[0] == "help" || rest[0] == "--help" || rest[0] == "-h" {
printUsage(stdout)
return nil
}
if rest[0] == "version" {
_, _ = io.WriteString(stdout, version+"\n")
return nil
}
level := slog.LevelInfo
if *quiet {
level = slog.LevelError
@ -90,23 +98,35 @@ func Run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
}
type runtime struct {
ctx context.Context
configPath string
cfg config.Config
stdout io.Writer
stderr io.Writer
json bool
plain bool
logger *slog.Logger
store *store.Store
client discordClient
syncer syncService
dbLockHeld bool
openStore func(context.Context, string) (*store.Store, error)
newDiscord func(config.Config) (discordClient, error)
newSyncer func(syncer.Client, *store.Store, *slog.Logger) syncService
newEmbed func(config.EmbeddingsConfig) (embed.Provider, error)
now func() time.Time
ctx context.Context
configPath string
cfg config.Config
stdout io.Writer
stderr io.Writer
json bool
plain bool
logger *slog.Logger
store *store.Store
client discordClient
syncer syncService
dbLockHeld bool
lockStarted time.Time
openStore func(context.Context, string) (*store.Store, error)
newDiscord func(config.Config) (discordClient, error)
newSyncer func(syncer.Client, *store.Store, *slog.Logger) syncService
newEmbed func(config.EmbeddingsConfig) (embed.Provider, error)
now func() time.Time
}
// crawlkitEmbeddingConfig translates the application's embeddings settings
// into the embed.Config value, copying each field across one-to-one.
func crawlkitEmbeddingConfig(cfg config.EmbeddingsConfig) embed.Config {
	var out embed.Config
	out.Provider = cfg.Provider
	out.Model = cfg.Model
	out.BaseURL = cfg.BaseURL
	out.APIKeyEnv = cfg.APIKeyEnv
	out.RequestTimeout = cfg.RequestTimeout
	out.MaxInputChars = cfg.MaxInputChars
	return out
}
type discordClient interface {
@ -128,43 +148,59 @@ type attachmentTextConfigurer interface {
func (r *runtime) dispatch(rest []string) error {
switch rest[0] {
case "metadata":
return r.runMetadata(rest[1:])
case "init":
return r.runInit(rest[1:])
case "sync":
return r.withLocalStoreLocked(true, func() error { return r.runSync(rest[1:]) })
updateMode, err := syncShareUpdateMode(rest[1:])
if err != nil {
return usageErr(err)
}
return r.withLocalStoreUpdateLocked(updateMode, true, func() error { return r.runSync(rest[1:]) })
case "tail":
return r.withServicesLocked(true, func() error { return r.runTail(rest[1:]) })
case "wiretap":
return r.withLocalStoreLocked(false, func() error { return r.runWiretap(rest[1:]) })
case "tap", "cache-import":
return r.withLocalStoreLocked(false, func() error { return r.runWiretap(rest[1:]) })
case "search":
autoShareUpdate := !hasBoolFlag(rest[1:], "--dm")
return r.withLocalStoreDefaultLocked(autoShareUpdate, autoShareUpdate, func() error { return r.runSearch(rest[1:]) })
return r.withLocalStoreRead(autoShareUpdate, func() error { return r.runSearch(rest[1:]) })
case "tui":
if hasHelpArg(rest[1:]) {
return r.runTUI(rest[1:])
}
return r.withLocalStoreReadOnly(func() error { return r.runTUI(rest[1:]) })
case "messages":
if hasBoolFlag(rest[1:], "--sync") && !hasBoolFlag(rest[1:], "--dm") {
return r.withServicesAutoLocked(true, true, true, func() error { return r.runMessages(rest[1:]) })
}
autoShareUpdate := !hasBoolFlag(rest[1:], "--dm")
return r.withLocalStoreDefaultLocked(autoShareUpdate, autoShareUpdate, func() error { return r.runMessages(rest[1:]) })
return r.withLocalStoreRead(autoShareUpdate, func() error { return r.runMessages(rest[1:]) })
case "digest":
return r.withLocalStoreDefaultLocked(true, true, func() error { return r.runDigest(rest[1:]) })
return r.withLocalStoreRead(true, func() error { return r.runDigest(rest[1:]) })
case "analytics":
return r.runAnalytics(rest[1:])
case "dms":
return r.withLocalStoreDefault(false, func() error { return r.runDirectMessages(rest[1:]) })
return r.withLocalStoreRead(false, func() error { return r.runDirectMessages(rest[1:]) })
case "mentions":
return r.withLocalStoreLocked(true, func() error { return r.runMentions(rest[1:]) })
return r.withLocalStoreRead(true, func() error { return r.runMentions(rest[1:]) })
case "embed":
return r.withLocalStoreLocked(true, func() error { return r.runEmbed(rest[1:]) })
case "sql":
return r.withLocalStoreLocked(true, func() error { return r.runSQL(rest[1:]) })
if boolFlagEnabled(rest[1:], "--unsafe") {
return r.withLocalStoreLocked(true, func() error { return r.runSQL(rest[1:]) })
}
return r.withLocalStoreRead(true, func() error { return r.runSQL(rest[1:]) })
case "members":
return r.withLocalStoreLocked(true, func() error { return r.runMembers(rest[1:]) })
return r.withLocalStoreRead(true, func() error { return r.runMembers(rest[1:]) })
case "channels":
return r.withLocalStoreLocked(true, func() error { return r.runChannels(rest[1:]) })
return r.withLocalStoreRead(true, func() error { return r.runChannels(rest[1:]) })
case "status":
return r.withLocalStoreLocked(true, func() error { return r.runStatus(rest[1:]) })
return r.withLocalStoreReadOnly(func() error { return r.runStatus(rest[1:]) })
case "report":
return r.withLocalStoreLocked(true, func() error { return r.runReport(rest[1:]) })
return r.withLocalStoreRead(true, func() error { return r.runReport(rest[1:]) })
case "publish":
return r.withServicesAutoLocked(false, false, true, func() error { return r.runPublish(rest[1:]) })
case "subscribe":
@ -187,14 +223,41 @@ func (r *runtime) withServicesLocked(withDiscord bool, fn func() error) error {
}
func (r *runtime) withLocalStoreLocked(autoShareUpdate bool, fn func() error) error {
return r.withLocalStoreDefaultLocked(autoShareUpdate, true, fn)
return r.withLocalStoreUpdateLocked(boolShareUpdateMode(autoShareUpdate), true, fn)
}
func (r *runtime) withLocalStoreDefault(autoShareUpdate bool, fn func() error) error {
return r.withLocalStoreDefaultLocked(autoShareUpdate, false, fn)
// withLocalStoreRead converts the boolean auto-update switch into a
// shareUpdateMode and delegates to withLocalStoreReadUpdate.
func (r *runtime) withLocalStoreRead(autoShareUpdate bool, fn func() error) error {
	mode := boolShareUpdateMode(autoShareUpdate)
	return r.withLocalStoreReadUpdate(mode, fn)
}
func (r *runtime) withLocalStoreDefaultLocked(autoShareUpdate, lockDB bool, fn func() error) error {
// withLocalStoreReadUpdate loads the configuration (falling back to the
// normalized defaults when the config file does not exist), ensures the
// runtime directories, optionally refreshes the share when the sync lock is
// free, and finally runs fn against a read-only store.
//
// Configuration failures are reported through configErr.
func (r *runtime) withLocalStoreReadUpdate(updateMode shareUpdateMode, fn func() error) error {
	cfg, loadErr := config.Load(r.configPath)
	switch {
	case loadErr == nil:
		// Loaded configuration is used as-is.
	case errors.Is(loadErr, os.ErrNotExist):
		cfg = config.Default()
		if normErr := cfg.Normalize(); normErr != nil {
			return configErr(normErr)
		}
	default:
		return configErr(loadErr)
	}

	if dirErr := config.EnsureRuntimeDirs(cfg); dirErr != nil {
		return configErr(dirErr)
	}
	dbPath, pathErr := config.ExpandPath(cfg.DBPath)
	if pathErr != nil {
		return configErr(pathErr)
	}
	r.cfg = cfg

	if r.shouldAutoUpdateShare(updateMode) {
		updateErr := r.autoUpdateShareIfLockAvailable(dbPath, updateMode)
		if updateErr != nil {
			return updateErr
		}
	}
	return r.openLocalStoreReadOnly(dbPath, fn)
}
func (r *runtime) withLocalStoreUpdateLocked(updateMode shareUpdateMode, lockDB bool, fn func() error) error {
cfg, err := config.Load(r.configPath)
if err != nil {
if !errors.Is(err, os.ErrNotExist) {
@ -215,13 +278,45 @@ func (r *runtime) withLocalStoreDefaultLocked(autoShareUpdate, lockDB bool, fn f
r.cfg = cfg
if lockDB {
return r.withSyncLock(func() error {
return r.openLocalStore(dbPath, autoShareUpdate, fn)
return r.openLocalStore(dbPath, updateMode, fn)
})
}
return r.openLocalStore(dbPath, autoShareUpdate, fn)
return r.openLocalStore(dbPath, updateMode, fn)
}
func (r *runtime) openLocalStore(dbPath string, autoShareUpdate bool, fn func() error) error {
// shouldAutoUpdateShare reports whether a share update should be attempted
// for the given mode. It is false when DISCRAWL_NO_AUTO_UPDATE is "1" or the
// share is not enabled; otherwise force and auto modes always qualify, and
// the configured mode qualifies only when share auto-update is switched on.
func (r *runtime) shouldAutoUpdateShare(mode shareUpdateMode) bool {
	if os.Getenv("DISCRAWL_NO_AUTO_UPDATE") == "1" {
		return false
	}
	if !r.cfg.ShareEnabled() {
		return false
	}
	switch mode {
	case shareUpdateForce, shareUpdateAuto:
		return true
	case shareUpdateConfigured:
		return r.cfg.Share.AutoUpdate
	default:
		return false
	}
}
// autoUpdateShareIfLockAvailable runs the share update under the sync lock
// via tryWithSyncLock. Inside the lock it opens the store through r.openStore
// (store.Open when unset), runs autoUpdateShare, then closes the store and
// clears r.store. When tryWithSyncLock reports the lock was not taken, the
// update is skipped with an info log and nil is returned.
func (r *runtime) autoUpdateShareIfLockAvailable(dbPath string, updateMode shareUpdateMode) error {
	update := func() error {
		open := r.openStore
		if open == nil {
			open = store.Open
		}
		var openErr error
		if r.store, openErr = open(r.ctx, dbPath); openErr != nil {
			return dbErr(openErr)
		}
		// The handle is only needed for the update; release it afterwards.
		defer func() {
			_ = r.store.Close()
			r.store = nil
		}()
		return r.autoUpdateShare(updateMode)
	}

	acquired, err := r.tryWithSyncLock(update)
	switch {
	case err != nil:
		return err
	case !acquired:
		r.logger.Info("share update skipped; sync lock is held")
	}
	return nil
}
func (r *runtime) openLocalStore(dbPath string, updateMode shareUpdateMode, fn func() error) error {
storeFactory := r.openStore
if storeFactory == nil {
storeFactory = store.Open
@ -232,19 +327,96 @@ func (r *runtime) openLocalStore(dbPath string, autoShareUpdate bool, fn func()
return dbErr(err)
}
defer func() { _ = r.store.Close() }()
if autoShareUpdate && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
if err := r.autoUpdateShare(); err != nil {
if updateMode != shareUpdateNever && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
if err := r.autoUpdateShare(updateMode); err != nil {
return err
}
}
return fn()
}
// withLocalStoreReadOnly loads the configuration (normalized defaults when
// the config file is missing), opens the database read-only, and runs fn.
// When the database file does not exist, fn still runs with r.store set to
// nil. Other open failures are reported through dbErr.
func (r *runtime) withLocalStoreReadOnly(fn func() error) error {
	cfg, loadErr := config.Load(r.configPath)
	switch {
	case loadErr == nil:
		// Loaded configuration is used as-is.
	case errors.Is(loadErr, os.ErrNotExist):
		cfg = config.Default()
		if normErr := cfg.Normalize(); normErr != nil {
			return configErr(normErr)
		}
	default:
		return configErr(loadErr)
	}

	dbPath, pathErr := config.ExpandPath(cfg.DBPath)
	if pathErr != nil {
		return configErr(pathErr)
	}
	r.cfg = cfg

	var openErr error
	r.store, openErr = store.OpenReadOnly(r.ctx, dbPath)
	switch {
	case openErr == nil:
		defer func() { _ = r.store.Close() }()
	case errors.Is(openErr, os.ErrNotExist):
		// No database yet: let fn run without a store.
		r.store = nil
	default:
		return dbErr(openErr)
	}
	return fn()
}
// openLocalStoreReadOnly opens the database at dbPath read-only, runs fn with
// r.store set, and closes the store when fn returns.
//
// Two open failures are recovered from before giving up:
//   - os.ErrNotExist: the store is opened through r.openStore (store.Open when
//     unset) instead, and fn runs against that handle.
//   - store.ErrSchemaVersionMismatch: while holding the sync lock the store is
//     opened once through the same factory (presumably migrating the schema)
//     and closed again, after which the read-only open is retried.
//
// Every store failure, including a failed Close after the migration open, is
// reported through dbErr so it carries the database exit code. An error
// returned by withSyncLock is passed through unchanged.
func (r *runtime) openLocalStoreReadOnly(dbPath string, fn func() error) error {
	var openErr error
	r.store, openErr = store.OpenReadOnly(r.ctx, dbPath)
	if openErr == nil {
		defer func() { _ = r.store.Close() }()
		return fn()
	}
	if errors.Is(openErr, os.ErrNotExist) {
		storeFactory := r.openStore
		if storeFactory == nil {
			storeFactory = store.Open
		}
		r.store, openErr = storeFactory(r.ctx, dbPath)
		if openErr == nil {
			defer func() { _ = r.store.Close() }()
			return fn()
		}
	}
	if errors.Is(openErr, store.ErrSchemaVersionMismatch) {
		if err := r.withSyncLock(func() error {
			storeFactory := r.openStore
			if storeFactory == nil {
				storeFactory = store.Open
			}
			var migrateErr error
			r.store, migrateErr = storeFactory(r.ctx, dbPath)
			if migrateErr != nil {
				return dbErr(migrateErr)
			}
			closeErr := r.store.Close()
			r.store = nil
			// Wrap only a real failure: passing nil to a wrapper could
			// otherwise surface as a non-nil error value.
			if closeErr != nil {
				return dbErr(closeErr)
			}
			return nil
		}); err != nil {
			return err
		}
		r.store, openErr = store.OpenReadOnly(r.ctx, dbPath)
		if openErr == nil {
			defer func() { _ = r.store.Close() }()
			return fn()
		}
	}
	return dbErr(openErr)
}
// withServicesAuto is withServicesAutoLocked with the database lock disabled.
func (r *runtime) withServicesAuto(withDiscord, autoShareUpdate bool, fn func() error) error {
	const lockDB = false
	return r.withServicesAutoLocked(withDiscord, autoShareUpdate, lockDB, fn)
}
// withServicesAutoLocked converts the boolean auto-update switch into a
// shareUpdateMode and delegates to withServicesUpdateLocked.
func (r *runtime) withServicesAutoLocked(withDiscord, autoShareUpdate, lockDB bool, fn func() error) error {
	mode := boolShareUpdateMode(autoShareUpdate)
	return r.withServicesUpdateLocked(withDiscord, mode, lockDB, fn)
}
func (r *runtime) withServicesUpdateLocked(withDiscord bool, updateMode shareUpdateMode, lockDB bool, fn func() error) error {
cfg, err := config.Load(r.configPath)
if err != nil {
return configErr(err)
@ -259,13 +431,13 @@ func (r *runtime) withServicesAutoLocked(withDiscord, autoShareUpdate, lockDB bo
r.cfg = cfg
if lockDB {
return r.withSyncLock(func() error {
return r.openServices(dbPath, withDiscord, autoShareUpdate, fn)
return r.openServices(dbPath, withDiscord, updateMode, fn)
})
}
return r.openServices(dbPath, withDiscord, autoShareUpdate, fn)
return r.openServices(dbPath, withDiscord, updateMode, fn)
}
func (r *runtime) openServices(dbPath string, withDiscord, autoShareUpdate bool, fn func() error) error {
func (r *runtime) openServices(dbPath string, withDiscord bool, updateMode shareUpdateMode, fn func() error) error {
storeFactory := r.openStore
if storeFactory == nil {
storeFactory = store.Open
@ -276,8 +448,8 @@ func (r *runtime) openServices(dbPath string, withDiscord, autoShareUpdate bool,
return dbErr(err)
}
defer func() { _ = r.store.Close() }()
if autoShareUpdate && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
if err := r.autoUpdateShare(); err != nil {
if updateMode != shareUpdateNever && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
if err := r.autoUpdateShare(updateMode); err != nil {
return err
}
}
@ -321,24 +493,27 @@ func (r *runtime) ensureDiscordServices() error {
return nil
}
func (r *runtime) autoUpdateShare() error {
if !r.cfg.ShareEnabled() || !r.cfg.Share.AutoUpdate {
func (r *runtime) autoUpdateShare(mode shareUpdateMode) error {
if !r.cfg.ShareEnabled() || (mode == shareUpdateConfigured && !r.cfg.Share.AutoUpdate) {
return nil
}
staleAfter, err := time.ParseDuration(r.cfg.Share.StaleAfter)
if err != nil {
return configErr(fmt.Errorf("invalid share.stale_after: %w", err))
}
if !share.NeedsImport(r.ctx, r.store, staleAfter) {
if mode != shareUpdateForce && !share.NeedsImport(r.ctx, r.store, staleAfter) {
return nil
}
opts, err := r.shareOptions()
if err != nil {
return err
}
r.setSyncLockPhase("share pull")
r.logger.Info("share update pulling", "repo_path", opts.RepoPath, "remote", opts.Remote)
if err := share.Pull(r.ctx, opts); err != nil {
return err
}
r.setSyncLockPhase("share import")
_, _, err = share.ImportIfChanged(r.ctx, r.store, opts)
if errors.Is(err, share.ErrNoManifest) {
return nil
@ -355,5 +530,6 @@ func (r *runtime) shareOptions() (share.Options, error) {
RepoPath: repoPath,
Remote: r.cfg.Share.Remote,
Branch: r.cfg.Share.Branch,
Progress: r.shareProgress,
}, nil
}

View File

@ -4,6 +4,8 @@ import (
"bytes"
"context"
"encoding/json"
"errors"
"io"
"log/slog"
"net/http"
"net/http/httptest"
@ -18,11 +20,13 @@ import (
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/config"
discordclient "github.com/steipete/discrawl/internal/discord"
"github.com/steipete/discrawl/internal/share"
"github.com/steipete/discrawl/internal/store"
"github.com/steipete/discrawl/internal/syncer"
"github.com/openclaw/discrawl/internal/config"
discordclient "github.com/openclaw/discrawl/internal/discord"
"github.com/openclaw/discrawl/internal/discorddesktop"
"github.com/openclaw/discrawl/internal/report"
"github.com/openclaw/discrawl/internal/share"
"github.com/openclaw/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/syncer"
)
func TestHelpAndVersion(t *testing.T) {
@ -34,10 +38,196 @@ func TestHelpAndVersion(t *testing.T) {
out.Reset()
require.NoError(t, Run(context.Background(), []string{"--version"}, &out, &bytes.Buffer{}))
require.Contains(t, out.String(), "0.6.4")
require.Contains(t, out.String(), "0.7.0")
err := Run(context.Background(), []string{"bogus"}, &out, &bytes.Buffer{})
require.Equal(t, 2, ExitCode(err))
require.Equal(t, 1, ExitCode(context.Canceled))
require.Equal(t, 7, ExitCode(&cliError{code: 7, err: errors.New("custom")}))
}
// TestCommandValidationEdges feeds a table of invalid or incomplete argument
// lists to Run and requires every one of them to return an error.
func TestCommandValidationEdges(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cfgPath := filepath.Join(dir, "config.toml")
dbPath := filepath.Join(dir, "discrawl.db")
// Write a config that points at a database inside the temp dir and uses
// the "none" token source.
cfg := config.Default()
cfg.DBPath = dbPath
cfg.Discord.TokenSource = "none"
require.NoError(t, config.Write(cfgPath, cfg))
// Open and immediately close the store at dbPath before running commands
// (presumably so the database file exists — confirm against store.Open).
s, err := store.Open(ctx, dbPath)
require.NoError(t, err)
require.NoError(t, s.Close())
// Each entry is a full command line that is expected to be rejected.
cases := [][]string{
{"--config", cfgPath, "--bogus"},
{"--config", cfgPath, "search"},
{"--config", cfgPath, "search", "--mode", "bogus", "term"},
{"--config", cfgPath, "messages"},
{"--config", cfgPath, "messages", "--hours", "-1", "--channel", "general"},
{"--config", cfgPath, "messages", "--hours", "1", "--days", "1", "--channel", "general"},
{"--config", cfgPath, "messages", "--all", "--last", "1", "--channel", "general"},
{"--config", cfgPath, "messages", "--dm", "--sync", "--channel", "alice"},
{"--config", cfgPath, "dms", "--hours", "-1"},
{"--config", cfgPath, "dms", "--limit", "1", "--last", "1", "--with", "alice"},
{"--config", cfgPath, "mentions"},
{"--config", cfgPath, "mentions", "--days", "-1", "--target", "u1"},
{"--config", cfgPath, "mentions", "--type", "channel", "--target", "u1"},
{"--config", cfgPath, "digest", "--since", "-1d"},
{"--config", cfgPath, "analytics", "wat"},
{"--config", cfgPath, "analytics", "quiet", "extra"},
{"--config", cfgPath, "analytics", "trends", "--weeks", "-1"},
{"--config", cfgPath, "channels"},
{"--config", cfgPath, "channels", "wat"},
{"--config", cfgPath, "channels", "show"},
{"--config", cfgPath, "status", "extra"},
{"--config", cfgPath, "report", "extra"},
{"--config", cfgPath, "wiretap", "extra"},
{"--config", cfgPath, "wiretap", "--max-file-bytes", "0"},
{"--config", cfgPath, "sync", "--source", "bogus"},
{"--config", cfgPath, "sync", "--since", "not-time"},
{"--config", cfgPath, "sync", "--no-update", "--update", "force"},
{"--config", cfgPath, "publish", "--remote", ""},
{"--config", cfgPath, "subscribe"},
{"--config", cfgPath, "update", "extra"},
{"--config", cfgPath, "sql", "--confirm", "select 1"},
{"--config", cfgPath, "sql", "--unsafe", "select 1"},
{"--config", cfgPath, "members"},
{"--config", cfgPath, "members", "wat"},
}
for _, args := range cases {
var stdout, stderr bytes.Buffer
err := Run(ctx, args, &stdout, &stderr)
// The argument list is passed as the failure message so a case that
// unexpectedly succeeds is identifiable.
require.Error(t, err, args)
}
}
// TestOutputBranches passes one value of each listed result type to
// printHuman and requires error-free, non-empty output. It then checks that
// printPlain accepts Quiet and Trends reports, that both printers return an
// error for an empty struct, and that trimForTable shortens a long string.
func TestOutputBranches(t *testing.T) {
// Fixed reference time so the fixture timestamps are deterministic.
now := time.Date(2026, 5, 8, 12, 0, 0, 0, time.UTC)
values := []any{
syncRunStats{
Source: "both",
Discord: &syncer.SyncStats{Guilds: 1, Channels: 2, Threads: 3, Members: 4, Messages: 5},
Wiretap: &discorddesktop.Stats{
Path: "/tmp/discord",
FilesVisited: 1,
FilesScanned: 2,
FilesSkipped: 3,
FilesUnchanged: 4,
CacheFilesFastSkipped: 5,
JSONObjects: 6,
Guilds: 7,
Channels: 8,
Messages: 9,
DMMessages: 10,
DMChannels: 11,
GuildMessages: 12,
SkippedMessages: 13,
SkippedChannels: 14,
Checkpoints: 15,
FullCache: true,
DryRun: true,
},
},
syncer.SyncStats{Guilds: 1, Channels: 2, Threads: 3, Members: 4, Messages: 5},
discorddesktop.Stats{Path: "/tmp/discord", FilesVisited: 1, FullCache: true, DryRun: true},
store.EmbeddingDrainStats{
Processed: 3,
Succeeded: 2,
Failed: 1,
Requeued: 4,
RateLimited: true,
RemainingBacklog: 5,
Provider: "openai",
Model: "model",
InputVersion: "v1",
},
[]store.DirectMessageConversationRow{{
ChannelID: "c1",
Name: "Alice",
MessageCount: 2,
AuthorCount: 1,
FirstMessageAt: now.Add(-time.Hour),
LastMessageAt: now,
}},
store.MemberProfile{
Member: store.MemberRow{
GuildID: "g1",
UserID: "u1",
Username: "peter",
DisplayName: "Peter",
JoinedAt: now,
XHandle: "steipete",
GitHubLogin: "steipete",
Website: "https://steipete.me",
Pronouns: "he/him",
Location: "Vienna",
Bio: "Maintainer",
URLs: []string{"https://example.com"},
},
MessageCount: 1,
FirstMessageAt: now.Add(-time.Hour),
LastMessageAt: now,
RecentMessages: []store.MessageRow{{ChannelName: "general", CreatedAt: now, Content: "hello"}},
},
report.Digest{
Since: now.Add(-24 * time.Hour),
Until: now,
WindowLabel: "1d",
Channels: []report.ChannelDigest{{
ChannelID: "c1",
ChannelName: "general",
Kind: "text",
GuildID: "g1",
Messages: 3,
Replies: 1,
ActiveAuthors: 2,
TopPosters: []report.RankedCount{{Name: "Peter", Count: 2}},
TopMentions: []report.RankedCount{{Count: 1}},
}},
Totals: report.DigestTotals{Messages: 3, Replies: 1, Channels: 1, ActiveAuthors: 2},
},
report.Quiet{
Since: now.Add(-24 * time.Hour),
Until: now,
Channels: []report.QuietChannel{{
ChannelID: "c1",
ChannelName: "general",
Kind: "text",
LastMessage: "",
DaysSilent: -1,
}},
Totals: report.QuietTotals{Channels: 1},
},
report.Trends{
Since: now.AddDate(0, 0, -14),
Until: now,
Weeks: 2,
Rows: []report.TrendsRow{{
ChannelID: "c1",
ChannelName: "general",
Kind: "text",
GuildID: "g1",
Weekly: []report.WeeklyCount{
{WeekStart: now.AddDate(0, 0, -14), Messages: 1},
{WeekStart: now.AddDate(0, 0, -7), Messages: 2},
},
}},
},
map[string]any{"b": 2, "a": 1},
}
// Every fixture above must render through printHuman without error and
// produce some output.
for _, value := range values {
var out bytes.Buffer
require.NoError(t, printHuman(&out, value))
require.NotEmpty(t, out.String())
}
var plain bytes.Buffer
require.NoError(t, printPlain(&plain, report.Quiet{Channels: []report.QuietChannel{{ChannelID: "c1", ChannelName: "general", Kind: "text", GuildID: "g1", LastMessage: "now", DaysSilent: 0}}}))
require.NoError(t, printPlain(&plain, report.Trends{Rows: []report.TrendsRow{{GuildID: "g1", ChannelID: "c1", ChannelName: "general", Kind: "text", Weekly: []report.WeeklyCount{{WeekStart: now, Messages: 2}}}}}))
// An anonymous empty struct must be rejected by both printers.
require.Error(t, printPlain(io.Discard, struct{}{}))
require.Error(t, printHuman(io.Discard, struct{}{}))
require.Equal(t, "this is a profile field with a very l...", trimForTable("this is a profile field with a very long text value"))
}
func TestStatusSearchSQLAndListings(t *testing.T) {
@ -76,6 +266,21 @@ func TestStatusSearchSQLAndListings(t *testing.T) {
NormalizedContent: "panic locked database",
RawJSON: `{}`,
}))
require.NoError(t, s.UpsertGuild(ctx, store.GuildRecord{ID: "g2", Name: "Other Guild", RawJSON: `{}`}))
require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "c2", GuildID: "g2", Kind: "text", Name: "random", RawJSON: `{}`}))
require.NoError(t, s.UpsertMessage(ctx, store.MessageRecord{
ID: "m-other",
GuildID: "g2",
ChannelID: "c2",
ChannelName: "random",
AuthorID: "u2",
AuthorName: "Outside",
MessageType: 0,
CreatedAt: time.Now().UTC().Add(-time.Hour).Format(time.RFC3339Nano),
Content: "outside default guild",
NormalizedContent: "outside default guild",
RawJSON: `{}`,
}))
require.NoError(t, s.UpsertMessage(ctx, store.MessageRecord{
ID: "m2",
GuildID: "g1",
@ -120,6 +325,7 @@ func TestStatusSearchSQLAndListings(t *testing.T) {
tests := [][]string{
{"--config", cfgPath, "status"},
{"--config", cfgPath, "search", "panic"},
{"--config", cfgPath, "search", "panic", "--limit", "1"},
{"--config", cfgPath, "search", "stack"},
{"--config", cfgPath, "search", "--include-empty", "Peter"},
{"--config", cfgPath, "messages", "--channel", "general", "--days", "7", "--all"},
@ -137,6 +343,100 @@ func TestStatusSearchSQLAndListings(t *testing.T) {
require.NoError(t, Run(ctx, args, &out, &bytes.Buffer{}))
require.NotEmpty(t, out.String())
}
for _, args := range [][]string{
{"--config", cfgPath, "metadata", "--json"},
{"--config", cfgPath, "status", "--json"},
} {
var out bytes.Buffer
require.NoError(t, Run(ctx, args, &out, &bytes.Buffer{}))
var payload map[string]any
require.NoError(t, json.Unmarshal(out.Bytes(), &payload))
require.NotEmpty(t, payload)
}
before, err := os.ReadFile(dbPath)
require.NoError(t, err)
var out bytes.Buffer
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "--json", "tui", "--limit", "5"}, &out, &bytes.Buffer{}))
var rows []map[string]any
require.NoError(t, json.Unmarshal(out.Bytes(), &rows))
require.NotEmpty(t, rows)
require.Equal(t, "panic locked database", rows[0]["title"])
require.Equal(t, "discord", rows[0]["source"])
require.Equal(t, "message", rows[0]["kind"])
require.Equal(t, "Guild", rows[0]["scope"])
require.Equal(t, "general", rows[0]["container"])
require.Equal(t, "https://discord.com/channels/g1/c1/m1", rows[0]["url"])
after, err := os.ReadFile(dbPath)
require.NoError(t, err)
require.Equal(t, before, after, "tui --json should not mutate the database")
}
func TestTUIHelpReturnsUsage(t *testing.T) {
var stdout bytes.Buffer
var stderr bytes.Buffer
require.NoError(t, Run(context.Background(), []string{"tui", "--help"}, &stdout, &stderr))
require.Contains(t, stdout.String(), "Usage of tui:")
require.Contains(t, stdout.String(), "-limit")
require.Contains(t, stdout.String(), "right-click")
require.Contains(t, stdout.String(), "# jump")
require.Empty(t, stderr.String())
}
// TestControlStatusIncludesShareAndFileSizes checks that controlStatus
// reports the on-disk sizes of the database and its "-wal" companion file,
// includes share information when a share remote is configured, and mentions
// the message count in its summary.
func TestControlStatusIncludesShareAndFileSizes(t *testing.T) {
dir := t.TempDir()
dbPath := filepath.Join(dir, "discrawl.db")
// Placeholder files of 2 and 3 bytes; the sizes are asserted below.
require.NoError(t, os.WriteFile(dbPath, []byte("db"), 0o600))
require.NoError(t, os.WriteFile(dbPath+"-wal", []byte("wal"), 0o600))
cfg := config.Default()
cfg.DBPath = dbPath
cfg.Share.Remote = "https://github.com/openclaw/discrawl-share.git"
cfg.Share.RepoPath = filepath.Join(dir, "share")
status := store.Status{
DBPath: dbPath,
MessageCount: 5,
ChannelCount: 2,
}
out := controlStatus(filepath.Join(dir, "config.toml"), cfg, status, true)
require.Equal(t, int64(2), out.DatabaseBytes)
require.Equal(t, int64(3), out.WALBytes)
// fileSize is expected to yield zero for a path that does not exist.
require.Zero(t, fileSize(filepath.Join(dir, "missing.db")))
require.NotNil(t, out.Share)
require.True(t, out.Share.Enabled)
require.True(t, out.Share.NeedsUpdate)
require.Contains(t, out.Summary, "5 messages")
}
// TestFormattingAndTUISourceBranches pins the output of the small formatting
// helpers and the archive-source / TUI guild resolution of a runtime as its
// configuration is changed step by step.
func TestFormattingAndTUISourceBranches(t *testing.T) {
require.Equal(t, "-", formatDaysSilent(-1))
require.Equal(t, "4", formatDaysSilent(4))
require.Equal(t, "0", formatWindowDuration(0))
require.Equal(t, "2d", formatWindowDuration(48*time.Hour))
require.Equal(t, "3h", formatWindowDuration(3*time.Hour))
require.Equal(t, "1h30m0s", formatWindowDuration(90*time.Minute))
// An unparseable duration is expected to come back as 6h.
require.Equal(t, 6*time.Hour, mustDuration("bogus"))
require.Equal(t, 15*time.Minute, mustDuration("15m"))
cfg := config.Default()
cfg.DBPath = "/tmp/discrawl.db"
r := &runtime{cfg: cfg}
// With the default config plus a DB path the archive source is "local"
// and its location is the DB path.
require.Equal(t, "local", r.archiveSourceKind())
require.Equal(t, cfg.DBPath, r.archiveSourceLocation())
guilds, err := r.resolveTUIGuilds(false, "", "")
require.NoError(t, err)
require.Empty(t, guilds)
// Once a default guild is configured it is the resolved guild list.
r.cfg.DefaultGuildID = "guild-one"
guilds, err = r.resolveTUIGuilds(false, "", "")
require.NoError(t, err)
require.Equal(t, []string{"guild-one"}, guilds)
// Setting a share remote switches the archive source to "remote".
r.cfg.Share.Remote = "https://github.com/openclaw/discrawl-share.git"
require.Equal(t, "remote", r.archiveSourceKind())
require.Equal(t, r.cfg.Share.Remote, r.archiveSourceLocation())
}
func TestWiretapImportsDesktopDirectMessages(t *testing.T) {
@ -183,6 +483,53 @@ func TestWiretapImportsDesktopDirectMessages(t *testing.T) {
require.Contains(t, out.String(), "secret DM launch plan")
}
// TestDiscordTUIRowsIncludePaneMetadata checks the row that discordTUIRows
// builds from a direct-message MessageRow (title, detail, text, scope,
// container, tags and fields), then checks the author label, tags and
// container label produced for a message that has no author or channel name.
func TestDiscordTUIRowsIncludePaneMetadata(t *testing.T) {
rows := discordTUIRows([]store.MessageRow{{
MessageID: "m1",
GuildID: "@me",
GuildName: "Discord Direct Messages",
ChannelID: "c1",
ChannelName: "Vincent K",
AuthorID: "u1",
AuthorName: "Peter",
Content: "hello from desktop",
DisplayContent: "hello from Vincent",
CreatedAt: time.Date(2026, 5, 2, 12, 0, 0, 0, time.UTC),
ReplyToMessage: "m0",
HasAttachments: true,
AttachmentNames: "trace.txt",
AttachmentText: "stack trace line one",
Pinned: true,
}})
require.Len(t, rows, 1)
// Title, detail and text are expected to carry DisplayContent rather than
// Content.
require.Equal(t, "hello from Vincent", rows[0].Title)
require.Contains(t, rows[0].Detail, "hello from Vincent")
require.Contains(t, rows[0].Detail, "Attachments")
require.Contains(t, rows[0].Detail, "stack trace line one")
require.Equal(t, "hello from Vincent", rows[0].Text)
require.Equal(t, "Direct messages", rows[0].Scope)
require.Equal(t, "Vincent K", rows[0].Container)
require.Contains(t, rows[0].Tags, "dm")
require.Equal(t, "true", rows[0].Fields["attachments"])
require.Equal(t, "trace.txt", rows[0].Fields["attachment_names"])
require.Equal(t, "true", rows[0].Fields["pinned"])
require.Equal(t, "m0", rows[0].Fields["reply_to"])
require.Equal(t, "@me", rows[0].Fields["guild_id"])
// Second case: no AuthorName or ChannelName is set, and Source marks the
// row as coming from the desktop importer.
rows = discordTUIRows([]store.MessageRow{{
MessageID: "m2",
GuildID: "g1",
ChannelID: "c2",
AuthorID: "439223656200273932",
Content: "desktop-only author",
CreatedAt: time.Date(2026, 5, 2, 12, 0, 0, 0, time.UTC),
Source: "discord_desktop",
}})
// The author label is expected to be an abbreviated form of the user ID.
require.Equal(t, "user:439223...3932", rows[0].Author)
require.Equal(t, "DM c2", discordContainerLabel(store.MessageRow{GuildID: "@me", ChannelID: "c2"}))
require.Contains(t, rows[0].Tags, "discord_desktop")
}
func TestParseMessageWindow(t *testing.T) {
rt := &runtime{now: func() time.Time {
return time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC)
@ -600,7 +947,7 @@ func TestShareUpdateImportsNewRemoteSnapshot(t *testing.T) {
require.Contains(t, out.String(), "newer git snapshot arrived")
}
func TestSyncImportsGitShareBeforeLiveDiscord(t *testing.T) {
func TestSyncSkipsGitShareByDefaultAndCanImportBeforeLiveDiscord(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
remoteRepo := filepath.Join(dir, "remote.git")
@ -643,17 +990,33 @@ func TestSyncImportsGitShareBeforeLiveDiscord(t *testing.T) {
}
require.NoError(t, rt.dispatch([]string{"sync", "--all"}))
require.True(t, hybrid.sawGitMessage)
require.False(t, hybrid.sawGitMessage)
reader, err := store.Open(ctx, cfg.DBPath)
require.NoError(t, err)
defer func() { _ = reader.Close() }()
rows, err := reader.ListMessages(ctx, store.MessageListOptions{Channel: "general", IncludeEmpty: true})
require.NoError(t, err)
contents := make([]string, 0, len(rows))
for _, row := range rows {
contents = append(contents, row.Content)
}
require.NotContains(t, contents, "automatic updates work")
require.Contains(t, contents, "live discord filled the delta")
require.NoError(t, reader.Close())
hybrid.sawGitMessage = false
require.NoError(t, rt.dispatch([]string{"sync", "--all", "--update=auto"}))
require.True(t, hybrid.sawGitMessage)
reader, err = store.Open(ctx, cfg.DBPath)
require.NoError(t, err)
defer func() { _ = reader.Close() }()
rows, err = reader.ListMessages(ctx, store.MessageListOptions{Channel: "general", IncludeEmpty: true})
require.NoError(t, err)
contents = contents[:0]
for _, row := range rows {
contents = append(contents, row.Content)
}
require.Contains(t, contents, "automatic updates work")
require.Contains(t, contents, "live discord filled the delta")
}
@ -691,6 +1054,63 @@ func TestSyncLockSerializesConcurrentRuns(t *testing.T) {
require.ErrorIs(t, err, context.DeadlineExceeded)
}
// TestReadCommandsDoNotWaitForSyncLock holds the sync lock for the duration
// of the test and requires search, messages and sql to each succeed and
// produce output within a 100ms context deadline.
func TestReadCommandsDoNotWaitForSyncLock(t *testing.T) {
if goruntime.GOOS == "windows" {
t.Skip("sync lock timing is flaky on Windows")
}
ctx := context.Background()
dir := t.TempDir()
cfg := config.Default()
cfg.DBPath = filepath.Join(dir, "discrawl.db")
cfgPath := filepath.Join(dir, "config.toml")
require.NoError(t, config.Write(cfgPath, cfg))
s := seedCLIStore(t, cfg.DBPath)
require.NoError(t, s.Close())
// Take the lock file next to the database and keep it until the test
// returns, so every command below runs while the lock is held.
firstRelease, err := acquireSyncLock(ctx, filepath.Join(dir, ".discrawl-sync.lock"))
require.NoError(t, err)
defer func() { _ = firstRelease() }()
for _, args := range [][]string{
{"--config", cfgPath, "search", "automatic"},
{"--config", cfgPath, "messages", "--channel", "general", "--last", "1"},
{"--config", cfgPath, "sql", "select count(*) as total from messages"},
} {
// A command that blocked on the lock would exceed this deadline and
// fail the NoError check.
runCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
var out bytes.Buffer
err := Run(runCtx, args, &out, &bytes.Buffer{})
cancel()
require.NoError(t, err, args)
require.NotEmpty(t, out.String(), args)
}
}
// TestReadCommandsMigrateOlderLocalStore sets the seeded database's
// user_version to 1, runs a search, and requires both the search result and
// a user_version of 2 afterwards.
func TestReadCommandsMigrateOlderLocalStore(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cfg := config.Default()
cfg.DBPath = filepath.Join(dir, "discrawl.db")
cfgPath := filepath.Join(dir, "config.toml")
require.NoError(t, config.Write(cfgPath, cfg))
s := seedCLIStore(t, cfg.DBPath)
// Rewind the stored schema version so the database looks older than the
// version asserted at the end of the test.
_, err := s.DB().ExecContext(ctx, `pragma user_version = 1`)
require.NoError(t, err)
require.NoError(t, s.Close())
var out bytes.Buffer
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "search", "automatic"}, &out, &bytes.Buffer{}))
require.Contains(t, out.String(), "automatic updates work")
// Re-open read-only and confirm the read command left the database at
// user_version 2.
reader, err := store.OpenReadOnly(ctx, cfg.DBPath)
require.NoError(t, err)
defer func() { _ = reader.Close() }()
var version int
require.NoError(t, reader.DB().QueryRowContext(ctx, `pragma user_version`).Scan(&version))
require.Equal(t, 2, version)
}
func seedCLIStore(t *testing.T, path string) *store.Store {
t.Helper()
ctx := context.Background()
@ -1543,6 +1963,17 @@ func TestHelpers(t *testing.T) {
require.Equal(t, []string{"a", "b"}, csvList("a,b,a"))
require.Equal(t, "x", (&cliError{code: 2, err: assertErr("x")}).Error())
mode, err := syncShareUpdateMode([]string{"--all"})
require.NoError(t, err)
require.Equal(t, shareUpdateNever, mode)
mode, err = syncShareUpdateMode([]string{"--update=auto"})
require.NoError(t, err)
require.Equal(t, shareUpdateAuto, mode)
mode, err = syncShareUpdateMode([]string{"--update", "force"})
require.NoError(t, err)
require.Equal(t, shareUpdateForce, mode)
_, err = syncShareUpdateMode([]string{"--update"})
require.Error(t, err)
require.Equal(t, 2, ExitCode(usageErr(assertErr("x"))))
require.Equal(t, 4, ExitCode(authErr(assertErr("x"))))
require.Equal(t, 5, ExitCode(dbErr(assertErr("x"))))
@ -1584,7 +2015,49 @@ func TestRuntimeHelpersAndSubcommands(t *testing.T) {
s, err := store.Open(ctx, dbPath)
require.NoError(t, err)
require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "c1", GuildID: "g1", Kind: "text", Name: "general", RawJSON: `{}`}))
require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "dm1", GuildID: store.DirectMessageGuildID, Kind: "dm", Name: "Alice", RawJSON: `{}`}))
require.NoError(t, s.UpsertMember(ctx, store.MemberRecord{GuildID: "g1", UserID: "u1", Username: "peter", RoleIDsJSON: `[]`, RawJSON: `{}`}))
base := time.Date(2026, 3, 8, 10, 0, 0, 0, time.UTC)
require.NoError(t, s.UpsertMessages(ctx, []store.MessageMutation{
{
Record: store.MessageRecord{
ID: "m1",
GuildID: "g1",
ChannelID: "c1",
ChannelName: "general",
AuthorID: "u1",
AuthorName: "peter",
CreatedAt: base.Format(time.RFC3339Nano),
Content: "hello <@u1> in <#c1>",
NormalizedContent: "hello <@u1> in <#c1>",
RawJSON: `{"author":{"username":"peter"}}`,
},
Mentions: []store.MentionEventRecord{{
MessageID: "m1",
GuildID: "g1",
ChannelID: "c1",
AuthorID: "u1",
TargetType: "user",
TargetID: "u1",
TargetName: "peter",
EventAt: base.Format(time.RFC3339Nano),
}},
},
{
Record: store.MessageRecord{
ID: "dm-msg",
GuildID: store.DirectMessageGuildID,
ChannelID: "dm1",
ChannelName: "Alice",
AuthorID: "u2",
AuthorName: "Alice",
CreatedAt: base.Add(time.Minute).Format(time.RFC3339Nano),
Content: "private hello",
NormalizedContent: "private hello",
RawJSON: `{"source":"discord_desktop"}`,
},
},
}))
require.NoError(t, s.Close())
rt := &runtime{
@ -1604,11 +2077,23 @@ func TestRuntimeHelpersAndSubcommands(t *testing.T) {
require.NoError(t, rt.runMessages([]string{"--channel", "#general", "--hours", "6", "--last", "1"}))
require.NoError(t, rt.runMessages([]string{"--channel", "#general", "--days", "7", "--all"}))
require.NoError(t, rt.runMessages([]string{"--channel", "#general", "--days", "7", "--all", "--include-empty"}))
require.NoError(t, rt.runMessages([]string{"--channel", "#general", "--since", "2026-03-08T00:00:00Z", "--before", "2026-03-09T00:00:00Z", "--limit", "1"}))
require.NoError(t, rt.runMessages([]string{"--dm", "--channel", "Alice", "--last", "1"}))
require.NoError(t, rt.runDirectMessages([]string{"--list"}))
require.NoError(t, rt.runDirectMessages([]string{"--with", "Alice", "--search", "private", "--limit", "1"}))
require.NoError(t, rt.runDirectMessages([]string{"--with", "Alice", "--since", "2026-03-08T00:00:00Z", "--before", "2026-03-09T00:00:00Z", "--all"}))
require.NoError(t, rt.runMentions([]string{"--channel", "#general", "--target", "u2"}))
require.NoError(t, rt.runMentions([]string{"--channel", "#general", "--days", "7", "--type", "user"}))
require.NoError(t, rt.runDigest([]string{"--since", "12h", "--channel", "general", "--top-n", "2"}))
require.NoError(t, rt.runReport([]string{"--readme", filepath.Join(dir, "README.md")}))
require.NoError(t, rt.runSearch([]string{"--include-empty", "Peter"}))
require.NoError(t, rt.runChannels([]string{"show", "c1"}))
require.NoError(t, rt.runChannels([]string{"list"}))
require.NoError(t, rt.runStatus(nil))
require.NoError(t, rt.runAnalytics([]string{}))
require.NoError(t, rt.runTUI([]string{"--json", "--limit", "1", "--include-empty"}))
require.NoError(t, rt.runAnalytics([]string{"quiet", "--since", "1d"}))
require.NoError(t, rt.runAnalytics([]string{"trends", "--weeks", "1", "--channel", "general"}))
return nil
}))
}
@ -1926,6 +2411,8 @@ func TestCommandUsageErrors(t *testing.T) {
require.Equal(t, 2, ExitCode(rt.runMessages([]string{"--days", "-1"})))
require.Equal(t, 2, ExitCode(rt.runMessages([]string{"--days", "1", "--since", "2026-03-01T00:00:00Z"})))
require.Equal(t, 2, ExitCode(rt.runSync([]string{"--all", "--guild", "g1"})))
require.Equal(t, 2, ExitCode(rt.runSync([]string{"--update", "bogus"})))
require.Equal(t, 2, ExitCode(rt.runSync([]string{"--update=force", "--no-update"})))
require.Equal(t, 2, ExitCode(rt.runChannels(nil)))
require.Equal(t, 2, ExitCode(rt.runStatus([]string{"extra"})))
require.NoError(t, (&runtime{stdout: &bytes.Buffer{}}).runDoctor(nil))

View File

@ -0,0 +1,96 @@
package cli
import (
"errors"
"flag"
"fmt"
"io"
"os"
"time"
"github.com/openclaw/crawlkit/control"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/store"
)
// runMetadata builds the crawlkit control manifest for discrawl (branding,
// default paths, capabilities, privacy flags, and the command table) and hands
// it to r.print. It accepts flags only; passing --json turns on JSON output
// for the runtime. Paths are taken from config.Default(), not from r.cfg.
func (r *runtime) runMetadata(args []string) error {
	flags := flag.NewFlagSet("metadata", flag.ContinueOnError)
	flags.SetOutput(io.Discard)
	wantJSON := flags.Bool("json", false, "")
	if err := flags.Parse(args); err != nil {
		return usageErr(err)
	}
	if flags.NArg() != 0 {
		return usageErr(errors.New("metadata takes flags only"))
	}
	r.json = r.json || *wantJSON

	defaults := config.Default()

	// mutating describes a state-changing subcommand invoked with the global
	// --json flag placed before the subcommand name.
	mutating := func(title, subcommand string) control.Command {
		return control.Command{
			Title:   title,
			Argv:    []string{"discrawl", "--json", subcommand},
			JSON:    true,
			Mutates: true,
		}
	}
	wiretap := mutating("Legacy desktop cache import", "wiretap")
	wiretap.Legacy = true
	wiretap.Deprecated = true

	manifest := control.NewManifest("discrawl", "Discord Crawl", "discrawl")
	manifest.Description = "Local-first Discord archive crawler."
	manifest.Branding = control.Branding{
		SymbolName:       "bubble.left.and.bubble.right.fill",
		AccentColor:      "#5865f2",
		BundleIdentifier: "com.hnc.Discord",
	}
	manifest.Paths = control.Paths{
		DefaultConfig:   config.ResolvePath(""),
		ConfigEnv:       config.DefaultConfigEnv,
		DefaultDatabase: defaults.DBPath,
		DefaultCache:    defaults.CacheDir,
		DefaultLogs:     defaults.LogDir,
		DefaultShare:    defaults.Share.RepoPath,
	}
	manifest.Capabilities = []string{"metadata", "status", "doctor", "sync", "tap", "tui", "git-share", "sql", "embeddings"}
	manifest.Privacy = control.Privacy{
		ContainsPrivateMessages: true,
		ExportsSecrets:          false,
		LocalOnlyScopes:         []string{"discord", "desktop-cache", "sqlite", "git-share"},
	}
	manifest.Commands = map[string]control.Command{
		"status":       {Title: "Status", Argv: []string{"discrawl", "status", "--json"}, JSON: true},
		"doctor":       {Title: "Doctor", Argv: []string{"discrawl", "doctor", "--json"}, JSON: true},
		"sync":         mutating("Sync", "sync"),
		"tap":          mutating("Import desktop cache", "tap"),
		"cache-import": mutating("Import desktop cache", "cache-import"),
		"wiretap":      wiretap,
		"tui":          {Title: "Terminal browser", Argv: []string{"discrawl", "tui"}},
		"tui-json":     {Title: "Terminal browser rows", Argv: []string{"discrawl", "tui", "--json"}, JSON: true},
		"publish":      mutating("Publish share", "publish"),
		"subscribe":    mutating("Subscribe share", "subscribe"),
		"update":       mutating("Update share", "update"),
	}
	return r.print(manifest)
}
// controlStatus converts a store.Status snapshot into the crawlkit
// control.Status shape: a one-line summary, per-entity counts, the primary
// SQLite database entry (plus the size of its "-wal" sidecar), and the share
// configuration together with the caller-supplied shareNeedsUpdate flag.
func controlStatus(configPath string, cfg config.Config, status store.Status, shareNeedsUpdate bool) control.Status {
	counts := []control.Count{
		control.NewCount("guilds", "Guilds", int64(status.GuildCount)),
		control.NewCount("channels", "Channels", int64(status.ChannelCount)),
		control.NewCount("threads", "Threads", int64(status.ThreadCount)),
		control.NewCount("messages", "Messages", int64(status.MessageCount)),
		control.NewCount("members", "Members", int64(status.MemberCount)),
		control.NewCount("embedding_backlog", "Embedding backlog", int64(status.EmbeddingBacklog)),
	}

	summary := fmt.Sprintf("%d messages across %d channels", status.MessageCount, status.ChannelCount)
	result := control.NewStatus("discrawl", summary)
	result.State = "current"
	result.ConfigPath = configPath
	result.DatabasePath = status.DBPath
	result.Counts = counts
	// LastSyncAt stays at its zero value when no sync time has been recorded.
	if last := status.LastSyncAt; !last.IsZero() {
		result.LastSyncAt = last.UTC().Format(time.RFC3339)
	}

	primary := control.SQLiteDatabase("primary", "Discord archive", "archive", status.DBPath, true, counts)
	result.DatabaseBytes = primary.Bytes
	result.WALBytes = fileSize(status.DBPath + "-wal")
	result.Databases = []control.Database{primary}

	share := control.Share{
		Enabled:     cfg.ShareEnabled(),
		RepoPath:    cfg.Share.RepoPath,
		Remote:      cfg.Share.Remote,
		Branch:      cfg.Share.Branch,
		NeedsUpdate: shareNeedsUpdate,
	}
	result.Share = &share
	return result
}
// fileSize returns the size in bytes of the file at path, or 0 when the file
// cannot be stat'ed (for example because it does not exist).
func fileSize(path string) int64 {
	if info, err := os.Stat(path); err == nil {
		return info.Size()
	}
	return 0
}

View File

@ -9,7 +9,7 @@ import (
"strings"
"time"
"github.com/steipete/discrawl/internal/report"
"github.com/openclaw/discrawl/internal/report"
)
func (r *runtime) runDigest(args []string) error {

View File

@ -10,8 +10,8 @@ import (
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/config"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/store"
)
func TestParseLookback(t *testing.T) {

View File

@ -8,7 +8,7 @@ import (
"strings"
"time"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
const defaultDMLast = 50

View File

@ -6,7 +6,7 @@ import (
"strings"
"time"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
func (r *runtime) resolveSyncGuilds(guild, guilds string) []string {

View File

@ -8,7 +8,7 @@ import (
"strings"
"time"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
func (r *runtime) runMentions(args []string) error {

View File

@ -8,7 +8,7 @@ import (
"strings"
"time"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
const defaultMessageLimit = 200

View File

@ -11,10 +11,10 @@ import (
"text/tabwriter"
"time"
"github.com/steipete/discrawl/internal/discorddesktop"
"github.com/steipete/discrawl/internal/report"
"github.com/steipete/discrawl/internal/store"
"github.com/steipete/discrawl/internal/syncer"
"github.com/openclaw/discrawl/internal/discorddesktop"
"github.com/openclaw/discrawl/internal/report"
"github.com/openclaw/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/syncer"
)
func (r *runtime) print(value any) error {
@ -100,11 +100,16 @@ Usage:
discrawl [global flags] <command> [args]
Commands:
metadata
version
init
sync
tail
tap
cache-import
wiretap
search
tui
messages
digest
analytics
@ -142,8 +147,8 @@ func printHuman(w io.Writer, value any) error {
}
}
if v.Wiretap != nil {
if _, err := fmt.Fprintf(w, "wiretap_files=%d\nwiretap_unchanged=%d\nwiretap_messages=%d\nwiretap_dm_messages=%d\nwiretap_dm_channels=%d\nwiretap_guild_messages=%d\nwiretap_skipped_messages=%d\nwiretap_skipped_channels=%d\n",
v.Wiretap.FilesScanned, v.Wiretap.FilesUnchanged, v.Wiretap.Messages, v.Wiretap.DMMessages, v.Wiretap.DMChannels, v.Wiretap.GuildMessages, v.Wiretap.SkippedMessages, v.Wiretap.SkippedChannels); err != nil {
if _, err := fmt.Fprintf(w, "wiretap_visited=%d\nwiretap_files=%d\nwiretap_unchanged=%d\nwiretap_fast_skipped=%d\nwiretap_messages=%d\nwiretap_dm_messages=%d\nwiretap_dm_channels=%d\nwiretap_guild_messages=%d\nwiretap_skipped_messages=%d\nwiretap_skipped_channels=%d\nwiretap_checkpoints=%d\n",
v.Wiretap.FilesVisited, v.Wiretap.FilesScanned, v.Wiretap.FilesUnchanged, v.Wiretap.CacheFilesFastSkipped, v.Wiretap.Messages, v.Wiretap.DMMessages, v.Wiretap.DMChannels, v.Wiretap.GuildMessages, v.Wiretap.SkippedMessages, v.Wiretap.SkippedChannels, v.Wiretap.Checkpoints); err != nil {
return err
}
}
@ -152,8 +157,8 @@ func printHuman(w io.Writer, value any) error {
_, err := fmt.Fprintf(w, "guilds=%d channels=%d threads=%d members=%d messages=%d\n", v.Guilds, v.Channels, v.Threads, v.Members, v.Messages)
return err
case discorddesktop.Stats:
_, err := fmt.Fprintf(w, "path=%s\nfiles=%d\nskipped=%d\nunchanged=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ndry_run=%t\n",
v.Path, v.FilesScanned, v.FilesSkipped, v.FilesUnchanged, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.DryRun)
_, err := fmt.Fprintf(w, "path=%s\nvisited=%d\nfiles=%d\nskipped=%d\nunchanged=%d\nfast_skipped=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ncheckpoints=%d\nfull_cache=%t\ndry_run=%t\n",
v.Path, v.FilesVisited, v.FilesScanned, v.FilesSkipped, v.FilesUnchanged, v.CacheFilesFastSkipped, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.Checkpoints, v.FullCache, v.DryRun)
return err
case store.Status:
_, err := fmt.Fprintf(w, "db=%s\nguilds=%d\nchannels=%d\nthreads=%d\nmessages=%d\nmembers=%d\nembedding_backlog=%d\nlast_sync=%s\nlast_tail_event=%s\n",

View File

@ -7,8 +7,8 @@ import (
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/store"
"github.com/steipete/discrawl/internal/syncer"
"github.com/openclaw/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/syncer"
)
func TestPrintRows(t *testing.T) {

View File

@ -9,9 +9,9 @@ import (
"os"
"strings"
"github.com/steipete/discrawl/internal/config"
"github.com/steipete/discrawl/internal/embed"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/crawlkit/embed"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/store"
)
func (r *runtime) runSearch(args []string) error {
@ -25,7 +25,7 @@ func (r *runtime) runSearch(args []string) error {
dm := fs.Bool("dm", false, "")
guildsFlag := fs.String("guilds", "", "")
guildFlag := fs.String("guild", "", "")
if err := fs.Parse(args); err != nil {
if err := fs.Parse(permuteSearchFlags(args)); err != nil {
return usageErr(err)
}
if fs.NArg() != 1 {
@ -67,6 +67,51 @@ func (r *runtime) runSearch(args []string) error {
}
}
// permuteSearchFlags reorders search arguments so that every recognized flag
// precedes the positional query. The standard flag package stops parsing at
// the first non-flag argument, so this lets `search query --limit 5` work.
//
// Recognized flags may be written with one or two leading dashes, as the flag
// package allows. Value flags (mode, channel, author, limit, guilds, guild)
// consume the next argument unless written as name=value; a value flag that is
// the final argument is left among the positionals. Unrecognized arguments
// keep their relative order as positionals.
//
// When args contains a "--" terminator it is re-emitted in the result so that
// everything after it is still treated as positional by flag.Parse (for
// example a query that begins with '-'). Unrecognized dash-prefixed arguments
// that appeared before the terminator stay in front of it, so the flag package
// still rejects them instead of silently treating a typo as the query.
func permuteSearchFlags(args []string) []string {
	// flagKind reports whether name (the argument text before any '=') is a
	// known search flag and whether that flag takes a value.
	flagKind := func(name string) (known, takesValue bool) {
		bare := strings.TrimPrefix(name, "-")
		if bare == name {
			return false, false
		}
		bare = strings.TrimPrefix(bare, "-")
		switch bare {
		case "mode", "channel", "author", "limit", "guilds", "guild":
			return true, true
		case "include-empty", "dm":
			return true, false
		}
		return false, false
	}

	flags := make([]string, 0, len(args)+1)
	positionals := make([]string, 0, len(args))
	var afterTerminator []string
	terminated := false
	for i := 0; i < len(args); i++ {
		arg := args[i]
		if arg == "--" {
			terminated = true
			afterTerminator = args[i+1:]
			break
		}
		name, _, inline := strings.Cut(arg, "=")
		known, takesValue := flagKind(name)
		switch {
		case !known:
			positionals = append(positionals, arg)
		case inline || !takesValue:
			flags = append(flags, arg)
		case i+1 < len(args):
			flags = append(flags, arg, args[i+1])
			i++
		default:
			// Value flag with nothing after it: leave it in place as before.
			positionals = append(positionals, arg)
		}
	}
	if !terminated {
		return append(flags, positionals...)
	}

	plain := make([]string, 0, len(positionals)+len(afterTerminator))
	for _, arg := range positionals {
		// The flag package treats any argument of length >= 2 starting with
		// '-' as a flag; keep those ahead of "--" so they are still rejected.
		if len(arg) > 1 && arg[0] == '-' {
			flags = append(flags, arg)
			continue
		}
		plain = append(plain, arg)
	}
	flags = append(flags, "--")
	flags = append(flags, plain...)
	return append(flags, afterTerminator...)
}
func (r *runtime) searchMessagesSemantic(opts store.SearchOptions) ([]store.SearchResult, error) {
semanticOpts, err := r.semanticSearchOptions(opts)
if err != nil {
@ -112,7 +157,7 @@ func (r *runtime) semanticSearchOptions(opts store.SearchOptions) (store.Semanti
providerFactory := r.newEmbed
if providerFactory == nil {
providerFactory = func(cfg config.EmbeddingsConfig) (embed.Provider, error) {
return embed.NewProvider(cfg)
return embed.NewProvider(crawlkitEmbeddingConfig(cfg))
}
}
provider, err := providerFactory(r.cfg.Search.Embeddings)

View File

@ -6,7 +6,7 @@ import (
"slices"
"strings"
"github.com/steipete/discrawl/internal/syncer"
"github.com/openclaw/discrawl/internal/syncer"
)
func (r *runtime) syncMessagesQuery(channel, guild, guilds string) error {
@ -96,3 +96,27 @@ func hasBoolFlag(args []string, name string) bool {
}
return false
}
// boolFlagEnabled reports whether args turns on the boolean flag name, either
// as the bare flag (e.g. "--dm") or as name=value where value, after trimming
// and lower-casing, is one of 1, t, true, y, yes, on. Any other value is
// ignored and scanning continues with the remaining arguments.
func boolFlagEnabled(args []string, name string) bool {
	prefix := name + "="
	for _, arg := range args {
		if arg == name {
			return true
		}
		if !strings.HasPrefix(arg, prefix) {
			continue
		}
		value := strings.ToLower(strings.TrimSpace(arg[len(prefix):]))
		if value == "1" || value == "t" || value == "true" ||
			value == "y" || value == "yes" || value == "on" {
			return true
		}
	}
	return false
}
// hasHelpArg reports whether any argument is exactly "help", "--help", or "-h".
func hasHelpArg(args []string) bool {
	for i := range args {
		switch args[i] {
		case "help", "--help", "-h":
			return true
		}
	}
	return false
}

View File

@ -8,8 +8,8 @@ import (
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/config"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/store"
)
func TestMessageSyncOptionsNumericChannelID(t *testing.T) {

View File

@ -5,7 +5,7 @@ import (
"flag"
"io"
"github.com/steipete/discrawl/internal/report"
"github.com/openclaw/discrawl/internal/report"
)
func (r *runtime) runReport(args []string) error {

View File

@ -6,10 +6,10 @@ import (
"io"
"os"
"github.com/steipete/discrawl/internal/config"
"github.com/steipete/discrawl/internal/report"
"github.com/steipete/discrawl/internal/share"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/report"
"github.com/openclaw/discrawl/internal/share"
"github.com/openclaw/discrawl/internal/store"
)
func (r *runtime) runPublish(args []string) error {
@ -136,13 +136,15 @@ func (r *runtime) runSubscribe(args []string) error {
if err != nil {
return configErr(err)
}
opts := share.Options{RepoPath: expandedRepo, Remote: cfg.Share.Remote, Branch: cfg.Share.Branch}
opts := share.Options{RepoPath: expandedRepo, Remote: cfg.Share.Remote, Branch: cfg.Share.Branch, Progress: r.shareProgress}
if *withEmbeddings {
applyEmbeddingShareOptions(&opts, cfg)
}
r.setSyncLockPhase("share pull")
if err := share.Pull(r.ctx, opts); err != nil {
return err
}
r.setSyncLockPhase("share import")
manifest, imported, err := share.ImportIfChanged(r.ctx, s, opts)
if err != nil {
return err
@ -176,12 +178,15 @@ func (r *runtime) runUpdate(args []string) error {
if err != nil {
return err
}
opts.Progress = r.shareProgress
if *withEmbeddings {
applyEmbeddingShareOptions(&opts, r.cfg)
}
r.setSyncLockPhase("share pull")
if err := share.Pull(r.ctx, opts); err != nil {
return err
}
r.setSyncLockPhase("share import")
manifest, imported, err := share.ImportIfChanged(r.ctx, r.store, opts)
if err != nil {
return err

View File

@ -0,0 +1,110 @@
package cli
import (
"errors"
"fmt"
"strings"
"time"
"github.com/openclaw/discrawl/internal/share"
)
// shareUpdateMode names how a command should treat the share snapshot update
// step. The values are the strings accepted by --update plus "configured".
type shareUpdateMode string

const (
	// shareUpdateConfigured is produced by boolShareUpdateMode(true); it is
	// not accepted by parseShareUpdateMode.
	// NOTE(review): presumably defers to the configured auto-update policy; confirm at the call sites.
	shareUpdateConfigured shareUpdateMode = "configured"
	shareUpdateAuto       shareUpdateMode = "auto"
	shareUpdateNever      shareUpdateMode = "never"
	shareUpdateForce      shareUpdateMode = "force"
)

// boolShareUpdateMode maps an on/off switch to a mode: true yields
// shareUpdateConfigured, false yields shareUpdateNever.
func boolShareUpdateMode(enabled bool) shareUpdateMode {
	if enabled {
		return shareUpdateConfigured
	}
	return shareUpdateNever
}

// parseShareUpdateMode parses the value of --update. Matching ignores case and
// surrounding whitespace; the empty string means auto. Any value other than
// auto, never, or force is an error.
func parseShareUpdateMode(raw string) (shareUpdateMode, error) {
	switch shareUpdateMode(strings.ToLower(strings.TrimSpace(raw))) {
	case "", shareUpdateAuto:
		return shareUpdateAuto, nil
	case shareUpdateNever:
		return shareUpdateNever, nil
	case shareUpdateForce:
		return shareUpdateForce, nil
	default:
		return "", fmt.Errorf("invalid --update %q; use auto, never, or force", raw)
	}
}

// syncShareUpdateMode scans raw command arguments for --no-update,
// "--update VALUE", and "--update=VALUE" and returns the resulting mode. With
// none of these present the result is shareUpdateNever. When a flag repeats,
// the last occurrence wins.
//
// It returns an error when --update has no value, when the value is invalid,
// or when --no-update is combined with an --update value other than never.
// The conflict check is independent of the order in which the two flags
// appear.
func syncShareUpdateMode(args []string) (shareUpdateMode, error) {
	mode := shareUpdateNever
	// requested tracks the last --update value separately from mode, because
	// --no-update also writes mode and would otherwise hide a conflicting
	// --update that came earlier on the command line.
	requested := shareUpdateNever
	sawNoUpdate := false
	sawUpdate := false
	for i := 0; i < len(args); i++ {
		arg := args[i]
		var raw string
		switch {
		case arg == "--no-update":
			sawNoUpdate = true
			mode = shareUpdateNever
			continue
		case arg == "--update":
			if i+1 >= len(args) {
				return "", errors.New("--update requires auto, never, or force")
			}
			i++
			raw = args[i]
		case strings.HasPrefix(arg, "--update="):
			raw = strings.TrimPrefix(arg, "--update=")
		default:
			continue
		}
		parsed, err := parseShareUpdateMode(raw)
		if err != nil {
			return "", err
		}
		sawUpdate = true
		requested = parsed
		mode = parsed
	}
	if sawNoUpdate && sawUpdate && requested != shareUpdateNever {
		return "", errors.New("use either --no-update or --update, not both")
	}
	return mode, nil
}
// shareProgress is the share import progress callback. Events with an empty
// Phase are dropped. Otherwise it records "share <phase> [table] [file]" as
// the current sync lock phase and then logs the event with the non-empty /
// non-zero fields attached as attributes.
func (r *runtime) shareProgress(progress share.ImportProgress) {
	if progress.Phase == "" {
		return
	}
	label := []string{"share", progress.Phase}
	attrs := []any{"phase", progress.Phase}
	if table := progress.Table; table != "" {
		label = append(label, table)
		attrs = append(attrs, "table", table)
	}
	if progress.Rows != 0 {
		attrs = append(attrs, "rows", progress.Rows)
	}
	if progress.TotalRows != 0 {
		attrs = append(attrs, "total_rows", progress.TotalRows)
	}
	if file := progress.File; file != "" {
		label = append(label, file)
		attrs = append(attrs, "file", file, "file_index", progress.FileIndex, "file_count", progress.FileCount)
	}
	r.setSyncLockPhase(strings.Join(label, " "))
	r.logger.Info("share import progress", attrs...)
}
// nowUTC returns the current time in UTC, taken from the runtime's injected
// clock (r.now) when one is set and from time.Now otherwise.
func (r *runtime) nowUTC() time.Time {
	if r.now == nil {
		return time.Now().UTC()
	}
	return r.now().UTC()
}

View File

@ -3,9 +3,12 @@ package cli
import (
"context"
"fmt"
"os"
"path/filepath"
"strings"
"time"
"github.com/steipete/discrawl/internal/config"
"github.com/openclaw/discrawl/internal/config"
)
func (r *runtime) withSyncLock(fn func() error) error {
@ -21,13 +24,60 @@ func (r *runtime) withSyncLock(fn func() error) error {
return err
}
r.dbLockHeld = true
r.lockStarted = r.nowUTC()
r.setSyncLockPhase("locked")
defer func() {
r.dbLockHeld = false
r.lockStarted = time.Time{}
_ = release()
}()
return fn()
}
// tryWithSyncLock runs fn under the sync lock if the lock can be taken without
// waiting. The boolean result reports whether fn was run: it is false when the
// lock path could not be resolved, when acquiring failed, or when the lock is
// currently unavailable (in which case the error is nil). If this runtime
// already holds the lock, fn runs directly without re-acquiring.
func (r *runtime) tryWithSyncLock(fn func() error) (bool, error) {
	if r.dbLockHeld {
		return true, fn()
	}
	path, err := r.syncLockPath()
	if err != nil {
		return false, err
	}
	release, acquired, err := tryAcquireSyncLock(path)
	switch {
	case err != nil:
		return acquired, err
	case !acquired:
		return false, nil
	}

	r.dbLockHeld = true
	r.lockStarted = r.nowUTC()
	r.setSyncLockPhase("locked")
	unlock := func() {
		r.dbLockHeld = false
		r.lockStarted = time.Time{}
		// The release error is discarded, as in withSyncLock.
		_ = release()
	}
	defer unlock()
	return true, fn()
}
// setSyncLockPhase rewrites the sync lock file with the current pid, the lock
// start time, the time of this update, and the given phase label. It does
// nothing unless this runtime holds the lock, and it is best-effort: a failure
// to resolve the lock path or to write the file is silently ignored.
func (r *runtime) setSyncLockPhase(phase string) {
	if !r.dbLockHeld {
		return
	}
	lockPath, err := r.syncLockPath()
	if err != nil {
		return
	}
	startedAt := r.lockStarted
	if startedAt.IsZero() {
		// No recorded start time; fall back to the current time.
		startedAt = r.nowUTC()
	}
	contents := fmt.Sprintf("pid=%d\nstarted_at=%s\nupdated_at=%s\nphase=%s\n",
		os.Getpid(),
		startedAt.Format(time.RFC3339Nano),
		r.nowUTC().Format(time.RFC3339Nano),
		phase,
	)
	_ = os.WriteFile(lockPath, []byte(contents), 0o600)
}
func (r *runtime) syncLockPath() (string, error) {
dbPath, err := config.ExpandPath(r.cfg.DBPath)
if err != nil {
@ -38,6 +88,12 @@ func (r *runtime) syncLockPath() (string, error) {
func syncLockErr(ctx context.Context, path string) error {
if ctx.Err() != nil {
if body, err := os.ReadFile(path); err == nil {
details := strings.TrimSpace(string(body))
if details != "" {
return fmt.Errorf("wait for sync lock %s (%s): %w", path, strings.ReplaceAll(details, "\n", ", "), ctx.Err())
}
}
return fmt.Errorf("wait for sync lock %s: %w", path, ctx.Err())
}
return nil

View File

@ -7,3 +7,7 @@ import "context"
// acquireSyncLock is the no-op implementation: it ignores both arguments and
// always succeeds, returning a release function that does nothing.
func acquireSyncLock(context.Context, string) (func() error, error) {
	release := func() error { return nil }
	return release, nil
}
// tryAcquireSyncLock is the no-op implementation: it ignores the path and
// always reports the lock as acquired, with a release function that does
// nothing.
func tryAcquireSyncLock(string) (func() error, bool, error) {
	release := func() error { return nil }
	return release, true, nil
}

View File

@ -51,3 +51,29 @@ func acquireSyncLock(ctx context.Context, path string) (func() error, error) {
}
}
}
// tryAcquireSyncLock makes one non-blocking attempt to take an exclusive
// flock on the file at path, creating the file (mode 0600) if needed.
//
// It returns (release, true, nil) when the lock was taken, (nil, false, nil)
// when the lock is currently unavailable (EWOULDBLOCK/EAGAIN), and
// (nil, false, err) when the file cannot be opened or flock fails for any
// other reason. The release function unlocks and closes the file, returning
// the unlock error in preference to the close error.
func tryAcquireSyncLock(path string) (func() error, bool, error) {
file, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0o600)
if err != nil {
return nil, false, fmt.Errorf("open sync lock: %w", err)
}
// LOCK_NB makes flock fail immediately instead of waiting for the lock.
err = unix.Flock(int(file.Fd()), unix.LOCK_EX|unix.LOCK_NB)
if err != nil {
_ = file.Close()
if errors.Is(err, unix.EWOULDBLOCK) || errors.Is(err, unix.EAGAIN) {
return nil, false, nil
}
return nil, false, fmt.Errorf("acquire sync lock: %w", err)
}
// Replace the file contents with this process's pid. Errors from the
// seek, truncate, and write are discarded.
_, _ = file.Seek(0, 0)
_ = file.Truncate(0)
_, _ = fmt.Fprintf(file, "pid=%d\n", os.Getpid())
return func() error {
unlockErr := unix.Flock(int(file.Fd()), unix.LOCK_UN)
closeErr := file.Close()
if unlockErr != nil {
return unlockErr
}
return closeErr
}, true, nil
}

View File

@ -49,3 +49,28 @@ func acquireSyncLock(ctx context.Context, path string) (func() error, error) {
}
}
}
// tryAcquireSyncLock makes one attempt to take an exclusive LockFileEx lock
// on the file at path, creating the file (mode 0600) if needed.
// LOCKFILE_FAIL_IMMEDIATELY is passed so the call does not wait.
//
// It returns (release, true, nil) when the lock was taken and
// (nil, false, err) when the file cannot be opened. The release function
// unlocks and closes the file, returning the unlock error in preference to
// the close error.
//
// NOTE(review): every LockFileEx failure is reported as (nil, false, nil),
// i.e. "lock unavailable", so unexpected errors are indistinguishable from
// contention here. Confirm this is intended.
func tryAcquireSyncLock(path string) (func() error, bool, error) {
file, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0o600)
if err != nil {
return nil, false, fmt.Errorf("open sync lock: %w", err)
}
handle := windows.Handle(file.Fd())
overlapped := &windows.Overlapped{}
err = windows.LockFileEx(handle, windows.LOCKFILE_EXCLUSIVE_LOCK|windows.LOCKFILE_FAIL_IMMEDIATELY, 0, 1, 0, overlapped)
if err != nil {
_ = file.Close()
return nil, false, nil
}
// Replace the file contents with this process's pid. Errors from the
// seek, truncate, and write are discarded.
_, _ = file.Seek(0, 0)
_ = file.Truncate(0)
_, _ = fmt.Fprintf(file, "pid=%d\n", os.Getpid())
return func() error {
// Unlock using the same overlapped structure that was passed to LockFileEx.
unlockErr := windows.UnlockFileEx(handle, 0, 1, 0, overlapped)
closeErr := file.Close()
if unlockErr != nil {
return unlockErr
}
return closeErr
}, true, nil
}

View File

@ -0,0 +1,239 @@
package cli
import (
"context"
"errors"
"flag"
"fmt"
"strings"
"github.com/openclaw/crawlkit/tui"
"github.com/openclaw/discrawl/internal/store"
)
// runTUI implements the "tui" command: it parses the browse filters, loads the
// matching messages from the local store, and hands them to the crawlkit
// browser. Help requests print usage (with the browser's control help) to
// stdout and succeed. With --json the runtime is switched to JSON output
// before the browser is invoked. When the runtime has no store, the browser is
// started without rows and without a refresh callback.
func (r *runtime) runTUI(args []string) error {
	flags := flag.NewFlagSet("tui", flag.ContinueOnError)
	flags.SetOutput(r.stderr)
	if hasHelpArg(args) {
		// Requested help is regular output, not an error message.
		flags.SetOutput(r.stdout)
	}
	flags.Usage = func() {
		_, _ = fmt.Fprintln(flags.Output(), "Usage of tui:")
		flags.PrintDefaults()
		_, _ = fmt.Fprintln(flags.Output())
		_, _ = fmt.Fprintln(flags.Output(), tui.ControlsHelp())
	}
	channel := flags.String("channel", "", "channel id")
	author := flags.String("author", "", "author/user id")
	limit := flags.Int("limit", 200, "row limit")
	includeEmpty := flags.Bool("include-empty", false, "include empty messages")
	dm := flags.Bool("dm", false, "browse direct messages")
	guildsFlag := flags.String("guilds", "", "comma-separated guild ids")
	guildFlag := flags.String("guild", "", "guild id")
	jsonOut := flags.Bool("json", false, "write browser rows as JSON")

	if len(args) == 1 && args[0] == "help" {
		flags.Usage()
		return nil
	}
	switch err := flags.Parse(args); {
	case err == nil:
	case errors.Is(err, flag.ErrHelp):
		return nil
	default:
		return usageErr(err)
	}
	r.json = r.json || *jsonOut
	if flags.NArg() != 0 {
		return usageErr(errors.New("tui takes flags only"))
	}
	if *limit <= 0 {
		return usageErr(errors.New("tui --limit must be positive"))
	}
	guildIDs, err := r.resolveTUIGuilds(*dm, *guildFlag, *guildsFlag)
	if err != nil {
		return usageErr(err)
	}

	browse := tui.BrowseOptions{
		AppName:        "discrawl",
		Title:          "discrawl archive",
		EmptyMessage:   "discrawl has no local messages yet",
		JSON:           r.json,
		Layout:         tui.LayoutChat,
		SourceKind:     r.archiveSourceKind(),
		SourceLocation: r.archiveSourceLocation(),
		Stdout:         r.stdout,
	}
	if r.store == nil {
		return tui.Browse(r.ctx, browse)
	}

	// reload queries the store with the parsed filters; it always uses the
	// runtime context, both for the initial load and for browser refreshes.
	reload := func(context.Context) ([]tui.Row, error) {
		messages, err := r.store.ListMessagesWithThreadContext(r.ctx, store.MessageListOptions{
			GuildIDs:     guildIDs,
			Channel:      *channel,
			Author:       *author,
			Last:         *limit,
			IncludeEmpty: *includeEmpty,
		})
		if err != nil {
			return nil, err
		}
		return discordTUIRows(messages), nil
	}
	initial, err := reload(r.ctx)
	if err != nil {
		return err
	}
	browse.Rows = initial
	browse.Refresh = reload
	return tui.Browse(r.ctx, browse)
}
// resolveTUIGuilds picks the guild scope for the browser. The result of
// directMessageGuildScope is returned as-is when it fails, when dm is set, or
// when it yields at least one guild; otherwise the configured effective
// default guild is used, and nil means no guild restriction.
func (r *runtime) resolveTUIGuilds(dm bool, guild, guilds string) ([]string, error) {
	scoped, err := directMessageGuildScope(dm, guild, guilds)
	switch {
	case err != nil, dm, len(scoped) > 0:
		return scoped, err
	}
	fallback := r.cfg.EffectiveDefaultGuildID()
	if fallback == "" {
		return nil, nil
	}
	return []string{fallback}, nil
}
// archiveSourceKind reports tui.SourceRemote when a non-blank share remote is
// configured and tui.SourceLocal otherwise.
func (r *runtime) archiveSourceKind() string {
	if remote := strings.TrimSpace(r.cfg.Share.Remote); remote == "" {
		return tui.SourceLocal
	}
	return tui.SourceRemote
}
// archiveSourceLocation returns the configured share remote (untrimmed) when
// it is non-blank, and the configured database path otherwise.
func (r *runtime) archiveSourceLocation() string {
	remote := r.cfg.Share.Remote
	if strings.TrimSpace(remote) == "" {
		return r.cfg.DBPath
	}
	return remote
}
// discordTUIRows converts stored message rows into browser rows. The title is
// the trimmed display content, falling back to the trimmed attachment text and
// then the message id. Tags carry the guild and channel ids, plus "dm" for
// rows in the "@me" guild and the row's source when it has one.
func discordTUIRows(rows []store.MessageRow) []tui.Row {
	out := make([]tui.Row, 0, len(rows))
	for i := range rows {
		msg := rows[i]
		text := discordDisplayContent(msg)
		detail := discordDetailContent(msg, text)

		title := strings.TrimSpace(text)
		if title == "" {
			title = firstNonEmpty(strings.TrimSpace(msg.AttachmentText), msg.MessageID)
		}

		tags := []string{msg.GuildID, msg.ChannelID}
		if msg.GuildID == "@me" {
			tags = append(tags, "dm")
		}
		if msg.Source != "" {
			tags = append(tags, msg.Source)
		}

		fields := map[string]string{
			"attachment_names": msg.AttachmentNames,
			"attachments":      boolString(msg.HasAttachments),
			"author_id":        msg.AuthorID,
			"channel_id":       msg.ChannelID,
			"guild_id":         msg.GuildID,
			"pinned":           boolString(msg.Pinned),
			"reply_to":         msg.ReplyToMessage,
			"source":           msg.Source,
		}

		out = append(out, tui.Row{
			Source:    "discord",
			Kind:      "message",
			ID:        msg.MessageID,
			ParentID:  msg.ReplyToMessage,
			Scope:     discordScopeLabel(msg),
			Container: discordContainerLabel(msg),
			Author:    discordAuthorLabel(msg),
			Title:     title,
			Text:      text,
			Detail:    detail,
			URL:       discordMessageURL(msg),
			CreatedAt: formatTime(msg.CreatedAt),
			Tags:      tags,
			Fields:    fields,
		})
	}
	return out
}
// discordDetailContent builds the detail text for a message: the trimmed
// content, followed by an "Attachments" section holding the trimmed attachment
// text, separated by a blank line. Sections that are blank are omitted, and
// the result is empty when both are.
func discordDetailContent(row store.MessageRow, content string) string {
	body := strings.TrimSpace(content)
	attachments := strings.TrimSpace(row.AttachmentText)
	switch {
	case body != "" && attachments != "":
		return body + "\n\n" + "Attachments\n" + attachments
	case body != "":
		return body
	case attachments != "":
		return "Attachments\n" + attachments
	default:
		return ""
	}
}
// discordDisplayContent returns the row's trimmed DisplayContent when it is
// non-blank, and the raw (untrimmed) Content otherwise.
func discordDisplayContent(row store.MessageRow) string {
	display := strings.TrimSpace(row.DisplayContent)
	if display == "" {
		return row.Content
	}
	return display
}
// discordMessageURL builds the discord.com link for a message from its
// trimmed guild, channel, and message ids. It returns "" if any of the three
// is blank.
func discordMessageURL(row store.MessageRow) string {
	segments := []string{
		strings.TrimSpace(row.GuildID),
		strings.TrimSpace(row.ChannelID),
		strings.TrimSpace(row.MessageID),
	}
	for _, segment := range segments {
		if segment == "" {
			return ""
		}
	}
	return "https://discord.com/channels/" + strings.Join(segments, "/")
}
// discordScopeLabel names the scope a message belongs to: "Direct messages"
// for rows in the "@me" guild, otherwise the guild name with the guild id as
// fallback.
func discordScopeLabel(row store.MessageRow) string {
	if row.GuildID != "@me" {
		return firstNonEmpty(row.GuildName, row.GuildID)
	}
	return "Direct messages"
}
// discordContainerLabel names the channel a message belongs to. It prefers the
// channel name; the fallback is "DM <compact id>" for rows in the "@me" guild
// and the plain channel id for everything else.
func discordContainerLabel(row store.MessageRow) string {
	fallback := row.ChannelID
	if row.GuildID == "@me" {
		fallback = "DM " + compactDiscordID(row.ChannelID)
	}
	return firstNonEmpty(row.ChannelName, fallback)
}
// discordAuthorLabel returns the trimmed author name when present, otherwise
// "user:" followed by the compacted author id, otherwise "".
func discordAuthorLabel(row store.MessageRow) string {
	name := strings.TrimSpace(row.AuthorName)
	if name != "" {
		return name
	}
	authorID := strings.TrimSpace(row.AuthorID)
	if authorID == "" {
		return ""
	}
	return "user:" + compactDiscordID(authorID)
}
// compactDiscordID trims id and, when it is longer than 10 bytes, shortens it
// to its first 6 and last 4 bytes joined by "...". Shorter ids are returned
// trimmed but otherwise unchanged.
func compactDiscordID(id string) string {
	const (
		maxLen = 10
		head   = 6
		tail   = 4
	)
	trimmed := strings.TrimSpace(id)
	if len(trimmed) > maxLen {
		return trimmed[:head] + "..." + trimmed[len(trimmed)-tail:]
	}
	return trimmed
}
// boolString renders a flag for row fields: "true" when set, and the empty
// string (not "false") when unset.
func boolString(value bool) string {
	if !value {
		return ""
	}
	return "true"
}

View File

@ -1,3 +1,3 @@
package cli
var version = "0.6.4"
var version = "0.7.0"

View File

@ -9,7 +9,7 @@ import (
"strings"
"time"
"github.com/pelletier/go-toml/v2"
crawlconfig "github.com/openclaw/crawlkit/config"
)
const (
@ -44,6 +44,7 @@ type DiscordConfig struct {
type DesktopConfig struct {
Path string `toml:"path"`
MaxFileBytes int64 `toml:"max_file_bytes"`
FullCache bool `toml:"full_cache"`
}
type SyncConfig struct {
@ -84,14 +85,25 @@ type TokenResolution struct {
Path string
}
var appConfig = crawlconfig.App{Name: "discrawl", ConfigEnv: DefaultConfigEnv, BaseDir: "~/.discrawl", LegacyBaseDir: "~/.discrawl"}
func Default() Config {
home, _ := os.UserHomeDir()
base := filepath.Join(home, ".discrawl")
paths, err := appConfig.DefaultPaths()
if err != nil {
base := filepath.Join(home, ".discrawl")
paths = crawlconfig.Paths{
DBPath: filepath.Join(base, "discrawl.db"),
CacheDir: filepath.Join(base, "cache"),
LogDir: filepath.Join(base, "logs"),
ShareDir: filepath.Join(base, "share"),
}
}
return Config{
Version: 1,
DBPath: filepath.Join(base, "discrawl.db"),
CacheDir: filepath.Join(base, "cache"),
LogDir: filepath.Join(base, "logs"),
DBPath: paths.DBPath,
CacheDir: paths.CacheDir,
LogDir: paths.LogDir,
DefaultGuildID: "",
Discord: DiscordConfig{
TokenSource: "env",
@ -123,7 +135,7 @@ func Default() Config {
},
},
Share: ShareConfig{
RepoPath: filepath.Join(base, "share"),
RepoPath: paths.ShareDir,
Branch: "main",
AutoUpdate: true,
StaleAfter: "15m",
@ -144,14 +156,12 @@ func defaultSyncConcurrency() int {
}
func ResolvePath(flagPath string) string {
if strings.TrimSpace(flagPath) != "" {
return flagPath
path, err := appConfig.ResolveConfigPath(flagPath)
if err != nil {
home, _ := os.UserHomeDir()
return filepath.Join(home, ".discrawl", "config.toml")
}
if envPath := strings.TrimSpace(os.Getenv(DefaultConfigEnv)); envPath != "" {
return envPath
}
home, _ := os.UserHomeDir()
return filepath.Join(home, ".discrawl", "config.toml")
return path
}
func Load(path string) (Config, error) {
@ -160,13 +170,9 @@ func Load(path string) (Config, error) {
if err != nil {
return Config{}, err
}
data, err := os.ReadFile(expanded)
if err != nil {
if err := crawlconfig.LoadTOML(expanded, &cfg); err != nil {
return Config{}, err
}
if err := toml.Unmarshal(data, &cfg); err != nil {
return Config{}, fmt.Errorf("parse config: %w", err)
}
if err := cfg.Normalize(); err != nil {
return Config{}, err
}
@ -181,14 +187,7 @@ func Write(path string, cfg Config) error {
if err != nil {
return err
}
if err := os.MkdirAll(filepath.Dir(expanded), 0o755); err != nil {
return fmt.Errorf("mkdir config dir: %w", err)
}
data, err := toml.Marshal(cfg)
if err != nil {
return fmt.Errorf("marshal config: %w", err)
}
return os.WriteFile(expanded, data, 0o600)
return crawlconfig.WriteTOML(expanded, cfg, 0o600)
}
func (c *Config) Normalize() error {
@ -342,35 +341,18 @@ func (c Config) ShareEnabled() bool {
}
func EnsureRuntimeDirs(cfg Config) error {
paths := []string{cfg.CacheDir, cfg.LogDir, filepath.Dir(cfg.DBPath)}
for _, path := range paths {
expanded, err := ExpandPath(path)
if err != nil {
return err
}
if err := os.MkdirAll(expanded, 0o755); err != nil {
return fmt.Errorf("mkdir %s: %w", expanded, err)
}
}
return nil
return crawlconfig.EnsureRuntimeDirs(crawlconfig.RuntimeConfig{
DBPath: cfg.DBPath,
CacheDir: cfg.CacheDir,
LogDir: cfg.LogDir,
})
}
func ExpandPath(path string) (string, error) {
if strings.TrimSpace(path) == "" {
return "", errors.New("empty path")
}
if strings.HasPrefix(path, "~/") || path == "~" {
home, err := os.UserHomeDir()
if err != nil {
return "", fmt.Errorf("home dir: %w", err)
}
if path == "~" {
path = home
} else {
path = filepath.Join(home, strings.TrimPrefix(path, "~/"))
}
}
return filepath.Clean(os.ExpandEnv(path)), nil
return filepath.Clean(os.ExpandEnv(crawlconfig.ExpandHome(path))), nil
}
func uniqueStrings(in []string) []string {

View File

@ -9,6 +9,7 @@ import (
"fmt"
"io"
"io/fs"
"maps"
"os"
"path/filepath"
"regexp"
@ -19,7 +20,7 @@ import (
"time"
"unicode"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
const (
@ -27,35 +28,45 @@ const (
DirectMessageGuildName = "Discord Direct Messages"
defaultMaxFileBytes = 64 << 20
maxObjectBytes = 4 << 20
cacheSniffBytes = 1 << 20
checkpointEveryFiles = 256
)
var channelRouteRE = regexp.MustCompile(`/channels/(@me|[0-9]{12,24})/([0-9]{12,24})`)
var (
channelRouteRE = regexp.MustCompile(`/channels/(@me|[0-9]{12,24})/([0-9]{12,24})`)
apiMessagesRouteRE = regexp.MustCompile(`/api/v[0-9]+/channels/[0-9]{12,24}/messages`)
)
// Options configures one import run over a Discord desktop data directory.
type Options struct {
// Path is the directory to scan; a blank value makes the scanners fall back to DefaultPath().
Path string
// MaxFileBytes is the largest file size considered; values <= 0 select defaultMaxFileBytes.
MaxFileBytes int64
// DryRun scans and reports statistics without writing to the store.
DryRun bool
// FullCache makes Import take the scanFullCache path instead of the incremental scanAndImport path.
FullCache bool
// Now supplies the clock used for StartedAt/FinishedAt; nil means time.Now.
Now func() time.Time
}
type Stats struct {
Path string `json:"path"`
FilesScanned int `json:"files_scanned"`
FilesSkipped int `json:"files_skipped"`
FilesUnchanged int `json:"files_unchanged"`
BytesScanned int64 `json:"bytes_scanned"`
JSONObjects int `json:"json_objects"`
Guilds int `json:"guilds"`
Channels int `json:"channels"`
Messages int `json:"messages"`
DMMessages int `json:"dm_messages"`
DMChannels int `json:"dm_channels"`
GuildMessages int `json:"guild_messages"`
SkippedMessages int `json:"skipped_messages"`
SkippedChannels int `json:"skipped_channels"`
DryRun bool `json:"dry_run,omitempty"`
StartedAt time.Time `json:"started_at"`
FinishedAt time.Time `json:"finished_at"`
Path string `json:"path"`
FilesVisited int `json:"files_visited"`
FilesScanned int `json:"files_scanned"`
FilesSkipped int `json:"files_skipped"`
FilesUnchanged int `json:"files_unchanged"`
CacheFilesFastSkipped int `json:"cache_files_fast_skipped"`
BytesScanned int64 `json:"bytes_scanned"`
JSONObjects int `json:"json_objects"`
Guilds int `json:"guilds"`
Channels int `json:"channels"`
Messages int `json:"messages"`
DMMessages int `json:"dm_messages"`
DMChannels int `json:"dm_channels"`
GuildMessages int `json:"guild_messages"`
SkippedMessages int `json:"skipped_messages"`
SkippedChannels int `json:"skipped_channels"`
Checkpoints int `json:"checkpoints"`
DryRun bool `json:"dry_run,omitempty"`
FullCache bool `json:"full_cache,omitempty"`
StartedAt time.Time `json:"started_at"`
FinishedAt time.Time `json:"finished_at"`
}
type snapshot struct {
@ -67,8 +78,9 @@ type snapshot struct {
}
type fileFingerprint struct {
Size int64 `json:"size"`
ModUnixNS int64 `json:"mod_unix_ns"`
Size int64 `json:"size"`
ModUnixNS int64 `json:"mod_unix_ns"`
Status string `json:"status,omitempty"`
}
type scanState struct {
@ -77,8 +89,42 @@ type scanState struct {
channels map[string]store.ChannelRecord
}
// fileSource classifies a discovered file by the kind of directory it was found in.
type fileSource int
const (
// fileSourceContext is the classification for every file that sourceForPath
// does not recognize as living in a route-filtered cache directory.
fileSourceContext fileSource = iota
// fileSourceCacheData marks files that isRouteFilteredCachePath accepts
// (Cache_Data and Service Worker/CacheStorage trees).
fileSourceCacheData
)
// fileCandidate describes one file selected by discoverCandidates for scanning.
type fileCandidate struct {
// absPath is the path exactly as reported by the directory walk.
absPath string
// relPath is absPath relative to the scan root; it is the name used to open the file through os.Root.
relPath string
// relKey is relPath with forward slashes; it is the key used in the file index.
relKey string
// source records whether the file is cache data or context (see sourceForPath).
source fileSource
// info is the file metadata captured during discovery.
info fs.FileInfo
// fingerprint is the size/mtime pair captured during discovery (Status is not set yet).
fingerprint fileFingerprint
}
// scanTotals holds ID sets accumulated across snapshots; the Stats counters are
// derived from the sizes of these sets, so an ID seen more than once is counted once.
type scanTotals struct {
guilds map[string]struct{}
channels map[string]struct{}
messages map[string]struct{}
dmMessages map[string]struct{}
guildMessages map[string]struct{}
dmChannels map[string]struct{}
// skippedMessages and skippedChannels track messages without a resolved guild and their channels.
skippedMessages map[string]struct{}
skippedChannels map[string]struct{}
}
// unresolvedMessages maps a message ID to the channel ID of a message whose guild could not be determined.
type unresolvedMessages map[string]string
// wiretapFileIndexScope is the sync-state key under which the JSON-encoded file index is stored.
const wiretapFileIndexScope = "wiretap:file_index:v1"
// Values stored in fileFingerprint.Status.
const (
// fileStatusImported marks a file whose contents were scanned and checkpointed.
fileStatusImported = "imported"
// fileStatusSkipped marks a cache file that was fast-skipped because it carried no route hint.
fileStatusSkipped = "skipped"
)
func DefaultPath() string {
home, _ := os.UserHomeDir()
switch runtime.GOOS {
@ -105,25 +151,29 @@ func Import(ctx context.Context, st *store.Store, opts Options) (Stats, error) {
if err != nil {
return Stats{}, err
}
stats, snap, err := scan(ctx, opts, state)
if opts.FullCache {
stats, snap, err := scanFullCache(ctx, opts, state)
if err != nil {
return stats, err
}
stats.DryRun = opts.DryRun
if opts.DryRun {
return stats, nil
}
if err := writeSnapshot(ctx, st, snap, len(state.previous) == 0); err != nil {
return stats, err
}
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
return stats, err
}
stats.Checkpoints = 1
return stats, nil
}
stats, err := scanAndImport(ctx, st, opts, state)
if err != nil {
return stats, err
}
stats.DryRun = opts.DryRun
if opts.DryRun {
return stats, nil
}
fullScan := len(state.previous) == 0
if snapshotHasChanges(snap) || fullScan {
if err := writeSnapshot(ctx, st, snap, fullScan); err != nil {
return stats, err
}
} else if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
return stats, err
}
if err := saveFileIndex(ctx, st, state.current); err != nil {
return stats, err
}
return stats, nil
}
@ -136,7 +186,7 @@ func loadScanState(ctx context.Context, st *store.Store, opts Options) (scanStat
if st == nil || opts.DryRun {
return state, nil
}
raw, err := st.GetSyncState(ctx, wiretapFileIndexScope)
raw, err := st.GetSyncState(ctx, fileIndexScope(opts))
if err != nil {
return state, err
}
@ -160,19 +210,107 @@ func loadScanState(ctx context.Context, st *store.Store, opts Options) (scanStat
return state, nil
}
func saveFileIndex(ctx context.Context, st *store.Store, index map[string]fileFingerprint) error {
// fileIndexScope returns the sync-state key that holds the file index.
// The options are accepted but currently have no influence on the key.
func fileIndexScope(_ Options) string {
	const scope = wiretapFileIndexScope
	return scope
}
func saveFileIndex(ctx context.Context, st *store.Store, opts Options, index map[string]fileFingerprint) error {
body, err := json.Marshal(index)
if err != nil {
return err
}
return st.SetSyncState(ctx, wiretapFileIndexScope, string(body))
return st.SetSyncState(ctx, fileIndexScope(opts), string(body))
}
// sameFileFingerprint reports whether a and b have the same size and
// modification time. The Status field is deliberately not compared.
func sameFileFingerprint(a, b fileFingerprint) bool {
	if a.Size != b.Size {
		return false
	}
	return a.ModUnixNS == b.ModUnixNS
}
// isImportedFingerprint reports whether the fingerprint counts as imported.
// An empty Status is treated the same as fileStatusImported
// (presumably so index entries written without a status still count — confirm).
func isImportedFingerprint(fingerprint fileFingerprint) bool {
	switch fingerprint.Status {
	case "", fileStatusImported:
		return true
	default:
		return false
	}
}
// importedFingerprint returns a copy of fingerprint with Status set to
// fileStatusImported; the argument itself is not modified.
func importedFingerprint(fingerprint fileFingerprint) fileFingerprint {
	stamped := fingerprint
	stamped.Status = fileStatusImported
	return stamped
}
// skippedFingerprint returns a copy of fingerprint with Status set to
// fileStatusSkipped; the argument itself is not modified.
func skippedFingerprint(fingerprint fileFingerprint) fileFingerprint {
	stamped := fingerprint
	stamped.Status = fileStatusSkipped
	return stamped
}
// snapshotHasChanges reports whether the snapshot carries at least one guild,
// channel, or message.
func snapshotHasChanges(snap snapshot) bool {
	if len(snap.guilds) > 0 {
		return true
	}
	if len(snap.channels) > 0 {
		return true
	}
	return len(snap.messages) > 0
}
func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot, error) {
// scanAndImport performs the incremental import: it discovers candidate files
// under the scan root, scans context files, gathers route hints from cache
// files, scans cache files in batches, and retries pending work.
//
// The scan root is opts.Path, or DefaultPath() when that is blank. When the
// previous file index is empty and this is not a dry run, guild data for
// "@unknown" is deleted before scanning. When not a dry run, orphan channels
// under DirectMessageGuildID are deleted at the end; if no candidate files
// were discovered, the last-import marker and file index are still saved and
// counted as one checkpoint.
//
// The returned Stats always has FinishedAt set, including on error paths.
func scanAndImport(ctx context.Context, st *store.Store, opts Options, state scanState) (Stats, error) {
	clock := opts.Now
	if clock == nil {
		clock = time.Now
	}
	root := strings.TrimSpace(opts.Path)
	if root == "" {
		root = DefaultPath()
	}
	stats := Stats{Path: root, FullCache: opts.FullCache, StartedAt: clock().UTC()}

	// finish stamps the completion time and returns the stats with err.
	finish := func(err error) (Stats, error) {
		stats.FinishedAt = clock().UTC()
		return stats, err
	}

	rootFS, err := os.OpenRoot(root)
	if err != nil {
		return finish(ignoreCacheFileError(err))
	}
	defer func() { _ = rootFS.Close() }()

	contextFiles, cacheFiles, err := discoverCandidates(ctx, root, rootFS, opts, state, &stats)
	if err != nil {
		return finish(err)
	}

	// An empty previous index means this is a first, full scan.
	if firstScan := len(state.previous) == 0; firstScan && !opts.DryRun {
		if err := st.DeleteGuildData(ctx, "@unknown"); err != nil {
			return finish(err)
		}
	}

	run := newImportRun(ctx, st, opts, state, rootFS, &stats)
	if err := run.scanContext(contextFiles); err != nil {
		return finish(err)
	}
	if err := collectCacheRouteHints(ctx, rootFS, cacheFiles, run.base); err != nil {
		return finish(err)
	}
	if err := run.scanCacheBatches(cacheFiles); err != nil {
		return finish(err)
	}
	if err := run.retryPending(); err != nil {
		return finish(err)
	}
	if opts.DryRun {
		return finish(nil)
	}

	if len(contextFiles) == 0 && len(cacheFiles) == 0 {
		// Nothing was scanned, so no batch checkpoint ran; persist state here.
		// NOTE(review): this timestamp comes from time.Now, not opts.Now — confirm that is intended.
		if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
			return finish(err)
		}
		if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
			return finish(err)
		}
		stats.Checkpoints++
	}
	if err := st.DeleteOrphanChannels(ctx, DirectMessageGuildID); err != nil {
		return finish(err)
	}
	return finish(nil)
}
func scanFullCache(ctx context.Context, opts Options, state scanState) (Stats, snapshot, error) {
now := opts.Now
if now == nil {
now = time.Now
@ -185,14 +323,8 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
if maxBytes <= 0 {
maxBytes = defaultMaxFileBytes
}
stats := Stats{Path: root, StartedAt: now().UTC()}
snap := snapshot{
guilds: map[string]store.GuildRecord{},
channels: map[string]store.ChannelRecord{},
messages: map[string]store.MessageMutation{},
routes: map[string]string{},
userLabels: map[string]userLabel{},
}
stats := Stats{Path: root, FullCache: true, StartedAt: now().UTC()}
snap := newSnapshot()
rootFS, err := os.OpenRoot(root)
if err != nil {
stats.FinishedAt = now().UTC()
@ -212,6 +344,7 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
}
return nil
}
stats.FilesVisited++
info, err := entry.Info()
if err != nil {
stats.FilesSkipped++
@ -231,8 +364,8 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
Size: info.Size(),
ModUnixNS: info.ModTime().UnixNano(),
}
state.current[relKey] = fingerprint
if previous, ok := state.previous[relKey]; ok && previous == fingerprint {
state.current[relKey] = importedFingerprint(fingerprint)
if previous, ok := state.previous[relKey]; ok && sameFileFingerprint(previous, fingerprint) && isImportedFingerprint(previous) {
stats.FilesUnchanged++
return nil
}
@ -267,15 +400,181 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
}); err != nil {
return stats, snap, err
}
reconcileMessages(snap, state.channels)
totals := newScanTotals()
finalizeSnapshot(snap, state.channels, totals, &stats, true)
stats.FinishedAt = now().UTC()
return stats, snap, nil
}
// discoverCandidates walks root and splits the files that need scanning into
// context files and cache-data files (first and second result respectively).
//
// Files that fail isCandidateFile, are empty, or exceed the size limit
// (opts.MaxFileBytes, or defaultMaxFileBytes when <= 0) are counted as skipped.
// A file whose size and mtime match its entry in state.previous is carried
// over into state.current and counted as unchanged; for cache files under
// opts.FullCache this additionally requires that the previous entry counts as
// imported. Without opts.FullCache, a cache file whose sniffed prefix carries
// no route hint is recorded in state.current as skipped and not returned.
//
// Walk and file errors are passed through ignoreCacheFileError; cancellation
// of ctx stops the walk with ctx.Err().
func discoverCandidates(ctx context.Context, root string, rootFS *os.Root, opts Options, state scanState, stats *Stats) ([]fileCandidate, []fileCandidate, error) {
	var (
		contextFiles []fileCandidate
		cacheFiles   []fileCandidate
	)
	sizeLimit := opts.MaxFileBytes
	if sizeLimit <= 0 {
		sizeLimit = defaultMaxFileBytes
	}

	visit := func(path string, entry fs.DirEntry, walkErr error) error {
		if walkErr != nil {
			return ignoreCacheFileError(walkErr)
		}
		if ctx.Err() != nil {
			return ctx.Err()
		}
		if entry.IsDir() {
			if shouldSkipDir(entry.Name()) && path != root {
				return filepath.SkipDir
			}
			return nil
		}

		stats.FilesVisited++
		info, err := entry.Info()
		if err != nil {
			stats.FilesSkipped++
			return ignoreCacheFileError(err)
		}
		if !isCandidateFile(path) || info.Size() <= 0 || info.Size() > sizeLimit {
			stats.FilesSkipped++
			return nil
		}
		relPath, err := filepath.Rel(root, path)
		if err != nil {
			stats.FilesSkipped++
			return ignoreCacheFileError(err)
		}

		relKey := filepath.ToSlash(relPath)
		fingerprint := fileFingerprint{
			Size:      info.Size(),
			ModUnixNS: info.ModTime().UnixNano(),
		}
		candidate := fileCandidate{
			absPath:     path,
			relPath:     relPath,
			relKey:      relKey,
			source:      sourceForPath(root, path, relPath),
			info:        info,
			fingerprint: fingerprint,
		}
		previous, known := state.previous[relKey]
		unchanged := known && sameFileFingerprint(previous, fingerprint)

		// Context files: same size and mtime as last time is enough to carry over.
		if candidate.source != fileSourceCacheData {
			if unchanged {
				state.current[relKey] = previous
				stats.FilesUnchanged++
				return nil
			}
			contextFiles = append(contextFiles, candidate)
			return nil
		}

		// Cache files: under FullCache an entry previously marked skipped is
		// scanned again even when its size and mtime are unchanged.
		if unchanged && (!opts.FullCache || isImportedFingerprint(previous)) {
			state.current[relKey] = previous
			stats.FilesUnchanged++
			return nil
		}
		if !opts.FullCache {
			hinted, err := cacheFileHasRouteHint(rootFS, relPath)
			if err != nil {
				stats.FilesSkipped++
				return ignoreCacheFileError(err)
			}
			if !hinted {
				state.current[relKey] = skippedFingerprint(fingerprint)
				stats.FilesSkipped++
				stats.CacheFilesFastSkipped++
				return nil
			}
		}
		cacheFiles = append(cacheFiles, candidate)
		return nil
	}

	err := filepath.WalkDir(root, visit)
	return contextFiles, cacheFiles, err
}
// scanCandidates reads every candidate file in full, harvests channel routes
// and JSON values from the raw bytes and from any gzip payloads embedded in
// them, and feeds each decoded value into snap via collectValue.
//
// A file that cannot be read is counted in stats.FilesSkipped; the scan only
// aborts if ignoreCacheFileError returns a non-nil error for it. Extracted
// values that fail json.Unmarshal are dropped without error. The candidate's
// modification time (UTC) is passed to collectValue for every value from that
// file. Gzip payload extraction is bounded by opts.MaxFileBytes, or
// defaultMaxFileBytes when that is <= 0.
//
// Returns ctx.Err() as soon as cancellation is observed between files,
// payloads, or values.
func scanCandidates(ctx context.Context, rootFS *os.Root, opts Options, candidates []fileCandidate, snap snapshot, channelLookup map[string]store.ChannelRecord, stats *Stats) error {
	maxBytes := opts.MaxFileBytes
	if maxBytes <= 0 {
		maxBytes = defaultMaxFileBytes
	}
	for _, candidate := range candidates {
		if err := ctx.Err(); err != nil {
			return err
		}
		data, err := rootFS.ReadFile(candidate.relPath)
		if err != nil {
			stats.FilesSkipped++
			if err := ignoreCacheFileError(err); err != nil {
				return err
			}
			continue
		}
		stats.FilesScanned++
		stats.BytesScanned += int64(len(data))

		// Sanitize once and share the result: ToValidUTF8 copies the whole
		// buffer, so calling it separately for route and JSON extraction
		// doubled the allocation and scan cost for every file.
		text := bytes.ToValidUTF8(data, nil)
		collectChannelRoutes(snap, text)
		objects := extractJSONValues(text)

		for _, payload := range extractGzipPayloads(data, maxBytes) {
			if err := ctx.Err(); err != nil {
				return err
			}
			payloadText := bytes.ToValidUTF8(payload, nil)
			collectChannelRoutes(snap, payloadText)
			objects = append(objects, extractJSONValues(payloadText)...)
		}

		stats.JSONObjects += len(objects)
		for _, raw := range objects {
			if err := ctx.Err(); err != nil {
				return err
			}
			var value any
			if err := json.Unmarshal(raw, &value); err != nil {
				// Not valid JSON after all; skip this fragment.
				continue
			}
			collectValue(snap, channelLookup, value, candidate.info.ModTime().UTC())
		}
	}
	return nil
}
// collectCacheRouteHints reads the leading bytes of every candidate (see
// readFilePrefix) and records any channel routes found there into snap.
// Read failures are passed through ignoreCacheFileError and only abort the
// loop when that returns a non-nil error. Returns ctx.Err() once the context
// is cancelled.
func collectCacheRouteHints(ctx context.Context, rootFS *os.Root, candidates []fileCandidate, snap snapshot) error {
	for i := range candidates {
		if cancelErr := ctx.Err(); cancelErr != nil {
			return cancelErr
		}
		prefix, readErr := readFilePrefix(rootFS, candidates[i].relPath)
		if readErr == nil {
			collectChannelRoutes(snap, bytes.ToValidUTF8(prefix, nil))
			continue
		}
		if fatal := ignoreCacheFileError(readErr); fatal != nil {
			return fatal
		}
	}
	return nil
}
// newScanTotals returns a scanTotals whose ID sets are all allocated and empty,
// so callers can insert into any of them without a nil-map check.
func newScanTotals() scanTotals {
	var totals scanTotals
	totals.guilds = make(map[string]struct{})
	totals.channels = make(map[string]struct{})
	totals.messages = make(map[string]struct{})
	totals.dmMessages = make(map[string]struct{})
	totals.guildMessages = make(map[string]struct{})
	totals.dmChannels = make(map[string]struct{})
	totals.skippedMessages = make(map[string]struct{})
	totals.skippedChannels = make(map[string]struct{})
	return totals
}
func finalizeSnapshot(snap snapshot, channelLookup map[string]store.ChannelRecord, totals scanTotals, stats *Stats, recordSkipped bool) unresolvedMessages {
reconcileMessages(snap, channelLookup)
inferDirectMessageNames(snap)
reconcileMessages(snap, state.channels)
skippedChannels := map[string]struct{}{}
reconcileMessages(snap, channelLookup)
unresolved := unresolvedMessages{}
for id, msg := range snap.messages {
guildID := msg.Record.GuildID
if guildID == "" {
stats.SkippedMessages++
skippedChannels[msg.Record.ChannelID] = struct{}{}
unresolved[id] = msg.Record.ChannelID
if recordSkipped {
totals.skippedMessages[id] = struct{}{}
totals.skippedChannels[msg.Record.ChannelID] = struct{}{}
}
delete(snap.messages, id)
continue
}
@ -283,34 +582,183 @@ func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot,
snap.guilds[guildID] = syntheticGuild(guildID, guildName(guildID))
}
if _, ok := snap.channels[msg.Record.ChannelID]; !ok {
snap.channels[msg.Record.ChannelID] = syntheticChannel(msg.Record.ChannelID, guildID, msg.Record.ChannelName)
if channel, ok := channelLookup[msg.Record.ChannelID]; ok && channel.GuildID != "" {
snap.channels[msg.Record.ChannelID] = channel
} else {
snap.channels[msg.Record.ChannelID] = syntheticChannel(msg.Record.ChannelID, guildID, msg.Record.ChannelName)
}
}
snap.messages[id] = msg
}
messageChannels := map[string]struct{}{}
dmChannels := map[string]struct{}{}
for _, msg := range snap.messages {
messageChannels[msg.Record.ChannelID] = struct{}{}
totals.messages[msg.Record.ID] = struct{}{}
switch msg.Record.GuildID {
case DirectMessageGuildID:
stats.DMMessages++
dmChannels[msg.Record.ChannelID] = struct{}{}
totals.dmMessages[msg.Record.ID] = struct{}{}
totals.dmChannels[msg.Record.ChannelID] = struct{}{}
default:
stats.GuildMessages++
totals.guildMessages[msg.Record.ID] = struct{}{}
}
}
for id := range snap.channels {
if _, ok := messageChannels[id]; !ok {
delete(snap.channels, id)
}
for id, channel := range snap.channels {
channelLookup[id] = channel
totals.channels[id] = struct{}{}
}
stats.DMChannels = len(dmChannels)
stats.SkippedChannels = len(skippedChannels)
stats.Guilds = len(snap.guilds)
stats.Channels = len(snap.channels)
stats.Messages = len(snap.messages)
stats.FinishedAt = now().UTC()
return stats, snap, nil
for id := range snap.guilds {
totals.guilds[id] = struct{}{}
}
stats.DMChannels = len(totals.dmChannels)
stats.SkippedChannels = len(totals.skippedChannels)
stats.Guilds = len(totals.guilds)
stats.Channels = len(totals.channels)
stats.Messages = len(totals.messages)
stats.DMMessages = len(totals.dmMessages)
stats.GuildMessages = len(totals.guildMessages)
stats.SkippedMessages = len(totals.skippedMessages)
return unresolved
}
// mergeUnresolved copies every entry of src into dst; for a message ID present
// in both, the channel ID from src wins.
func mergeUnresolved(dst, src unresolvedMessages) {
	if len(src) == 0 {
		return
	}
	maps.Copy(dst, src)
}
// recordUnresolved adds every unresolved message ID and its channel ID to the
// skipped sets in totals, then refreshes stats.SkippedChannels and
// stats.SkippedMessages from the sizes of those sets.
func recordUnresolved(unresolved unresolvedMessages, totals scanTotals, stats *Stats) {
	skippedMsgs, skippedChans := totals.skippedMessages, totals.skippedChannels
	for msgID := range unresolved {
		skippedMsgs[msgID] = struct{}{}
		skippedChans[unresolved[msgID]] = struct{}{}
	}
	stats.SkippedChannels = len(skippedChans)
	stats.SkippedMessages = len(skippedMsgs)
}
// commitSnapshot persists snap and, when checkpoint is set, also records the
// given candidates as imported in the file index.
//
//   - Dry run: nothing is written.
//   - checkpoint == false: a snapshot with changes is written with message
//     event appending turned off; the file index is left untouched.
//   - checkpoint == true: a snapshot with changes is written as-is, otherwise
//     only the "wiretap:last_import" marker is updated; then every candidate
//     is marked imported in state.current, the index is saved, and
//     stats.Checkpoints is incremented.
func commitSnapshot(ctx context.Context, st *store.Store, opts Options, state scanState, candidates []fileCandidate, snap snapshot, checkpoint bool, stats *Stats) error {
	if opts.DryRun {
		return nil
	}
	if !checkpoint {
		if !snapshotHasChanges(snap) {
			return nil
		}
		return writeSnapshot(ctx, st, snapshotWithoutMessageEvents(snap), false)
	}

	if snapshotHasChanges(snap) {
		if err := writeSnapshot(ctx, st, snap, false); err != nil {
			return err
		}
	} else {
		stamp := time.Now().UTC().Format(time.RFC3339Nano)
		if err := st.SetSyncState(ctx, "wiretap:last_import", stamp); err != nil {
			return err
		}
	}

	for i := range candidates {
		state.current[candidates[i].relKey] = importedFingerprint(candidates[i].fingerprint)
	}
	if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
		return err
	}
	stats.Checkpoints++
	return nil
}
// checkpointScannedCandidates records a checkpoint without writing a snapshot:
// it updates the "wiretap:last_import" marker, marks every candidate as
// imported in state.current, saves the file index, and increments
// stats.Checkpoints. A dry run writes nothing.
func checkpointScannedCandidates(ctx context.Context, st *store.Store, opts Options, state scanState, candidates []fileCandidate, stats *Stats) error {
	if opts.DryRun {
		return nil
	}
	stamp := time.Now().UTC().Format(time.RFC3339Nano)
	if err := st.SetSyncState(ctx, "wiretap:last_import", stamp); err != nil {
		return err
	}
	for i := range candidates {
		state.current[candidates[i].relKey] = importedFingerprint(candidates[i].fingerprint)
	}
	if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
		return err
	}
	stats.Checkpoints++
	return nil
}
// snapshotWithoutMessageEvents returns a snapshot that shares snap's guilds,
// channels, routes, and user labels but holds a fresh messages map in which
// every mutation has Options.AppendEvent cleared. The messages in snap itself
// are left unchanged.
func snapshotWithoutMessageEvents(snap snapshot) snapshot {
	silenced := make(map[string]store.MessageMutation, len(snap.messages))
	for id := range snap.messages {
		mutation := snap.messages[id]
		mutation.Options.AppendEvent = false
		silenced[id] = mutation
	}
	return snapshot{
		guilds:     snap.guilds,
		channels:   snap.channels,
		messages:   silenced,
		routes:     snap.routes,
		userLabels: snap.userLabels,
	}
}
// newSnapshot returns a snapshot with its guild, channel, message, route, and
// user-label maps allocated and empty.
func newSnapshot() snapshot {
	var snap snapshot
	snap.guilds = make(map[string]store.GuildRecord)
	snap.channels = make(map[string]store.ChannelRecord)
	snap.messages = make(map[string]store.MessageMutation)
	snap.routes = make(map[string]string)
	snap.userLabels = make(map[string]userLabel)
	return snap
}
// newSnapshotWithContext returns a fresh snapshot pre-seeded with copies of
// base's routes and user labels. Guilds, channels, and messages start empty.
func newSnapshotWithContext(base snapshot) snapshot {
	seeded := newSnapshot()
	maps.Copy(seeded.userLabels, base.userLabels)
	maps.Copy(seeded.routes, base.routes)
	return seeded
}
// mergeSnapshotContext folds the context gathered in next back into base:
// each route is applied through collectChannelRoute, then user labels and
// channels are copied over, overwriting entries with the same key.
func mergeSnapshotContext(base snapshot, next snapshot) {
	for channelID := range next.routes {
		collectChannelRoute(base, channelID, next.routes[channelID])
	}
	maps.Copy(base.userLabels, next.userLabels)
	maps.Copy(base.channels, next.channels)
}
// copyChannelLookup returns a new map with the same entries as in, so the
// caller can modify the result without affecting the original.
func copyChannelLookup(in map[string]store.ChannelRecord) map[string]store.ChannelRecord {
	clone := make(map[string]store.ChannelRecord, len(in))
	maps.Copy(clone, in)
	return clone
}
// sourceForPath classifies a file as fileSourceCacheData when
// isRouteFilteredCachePath accepts it, and as fileSourceContext otherwise.
func sourceForPath(root, path, relPath string) fileSource {
	source := fileSourceContext
	if isRouteFilteredCachePath(root, path, relPath) {
		source = fileSourceCacheData
	}
	return source
}
// isRouteFilteredCachePath reports whether a file belongs to one of the cache
// trees that are subject to route-hint filtering: a "Cache/Cache_Data"
// directory or a "Service Worker/CacheStorage" directory.
//
// root is the scan root, path is the file path as produced by the directory
// walk, and relPath is path relative to root. Separators are normalized to
// forward slashes before matching. A file matches when any of these hold:
//   - the root directory itself is named "Cache_Data" or "CacheStorage";
//   - path contains the segment pair "Cache/Cache_Data" or
//     "Service Worker/CacheStorage";
//   - relPath starts with "Cache_Data/" or "Service Worker/CacheStorage/".
func isRouteFilteredCachePath(root, path, relPath string) bool {
	cleanRoot := filepath.ToSlash(root)
	cleanRel := filepath.ToSlash(relPath)
	// Anchor the walked path with a leading slash. With a relative root the
	// walk can yield paths that begin directly with the cache directory
	// (root "." gives "Cache/Cache_Data/entry"); without the anchor the
	// "/Cache/Cache_Data/" check below would miss them and the file would be
	// treated as a context file.
	anchoredPath := "/" + filepath.ToSlash(path)

	switch filepath.Base(cleanRoot) {
	case "Cache_Data", "CacheStorage":
		return true
	}
	return strings.Contains(anchoredPath, "/Cache/Cache_Data/") ||
		strings.Contains(anchoredPath, "/Service Worker/CacheStorage/") ||
		strings.HasPrefix(cleanRel, "Cache_Data/") ||
		strings.HasPrefix(cleanRel, "Service Worker/CacheStorage/")
}
// cacheFileHasRouteHint reports whether the leading bytes of the file (as read
// by readFilePrefix) contain a channel route or an API messages route.
// A read failure is returned as (false, err).
func cacheFileHasRouteHint(rootFS *os.Root, relPath string) (bool, error) {
	prefix, err := readFilePrefix(rootFS, relPath)
	if err != nil {
		return false, err
	}
	if channelRouteRE.Match(prefix) {
		return true, nil
	}
	return apiMessagesRouteRE.Match(prefix), nil
}
// readFilePrefix opens relPath inside rootFS and returns at most
// cacheSniffBytes bytes from the start of the file. On any open or read error
// it returns nil data together with the error.
func readFilePrefix(rootFS *os.Root, relPath string) ([]byte, error) {
	file, err := rootFS.Open(relPath)
	if err != nil {
		return nil, err
	}
	defer func() { _ = file.Close() }()

	limited := &io.LimitedReader{R: file, N: cacheSniffBytes}
	prefix, err := io.ReadAll(limited)
	if err != nil {
		return nil, err
	}
	return prefix, nil
}
func ignoreCacheFileError(error) error {

View File

@ -0,0 +1,198 @@
package discorddesktop
import (
"context"
"os"
"path/filepath"
"runtime"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/openclaw/discrawl/internal/store"
)
// TestFileFingerprintStatusHelpers covers the fingerprint comparison and
// status helpers, and checks that fileIndexScope returns the same key with
// and without FullCache.
func TestFileFingerprintStatusHelpers(t *testing.T) {
base := fileFingerprint{Size: 123, ModUnixNS: 456}
// Comparison looks at size and mtime only; a differing Status still matches.
require.True(t, sameFileFingerprint(base, fileFingerprint{Size: 123, ModUnixNS: 456, Status: fileStatusSkipped}))
require.False(t, sameFileFingerprint(base, fileFingerprint{Size: 124, ModUnixNS: 456}))
require.False(t, sameFileFingerprint(base, fileFingerprint{Size: 123, ModUnixNS: 457}))
// An empty Status counts as imported.
require.True(t, isImportedFingerprint(base))
require.True(t, isImportedFingerprint(importedFingerprint(base)))
require.False(t, isImportedFingerprint(skippedFingerprint(base)))
require.Equal(t, fileStatusImported, importedFingerprint(base).Status)
require.Equal(t, fileStatusSkipped, skippedFingerprint(base).Status)
require.Equal(t, wiretapFileIndexScope, fileIndexScope(Options{}))
require.Equal(t, wiretapFileIndexScope, fileIndexScope(Options{FullCache: true}))
}
// TestSnapshotCopyHelpers checks which parts of a snapshot are carried into a
// derived snapshot, what mergeSnapshotContext folds back, and that
// copyChannelLookup returns an independent map.
func TestSnapshotCopyHelpers(t *testing.T) {
base := newSnapshot()
base.routes["111111111111111121"] = "999999999999999996"
base.userLabels["222222222222222232"] = userLabel{Name: "Alice"}
base.channels["111111111111111121"] = store.ChannelRecord{ID: "111111111111111121", GuildID: "999999999999999996", Name: "general"}
// Routes and user labels are seeded; channels are not.
snap := newSnapshotWithContext(base)
require.Equal(t, base.routes, snap.routes)
require.Equal(t, base.userLabels, snap.userLabels)
require.Empty(t, snap.channels)
next := newSnapshot()
next.routes["111111111111111122"] = "999999999999999996"
next.userLabels["222222222222222233"] = userLabel{Name: "Bob"}
next.channels["111111111111111122"] = store.ChannelRecord{ID: "111111111111111122", GuildID: "999999999999999996", Name: "random"}
mergeSnapshotContext(base, next)
require.Equal(t, "999999999999999996", base.routes["111111111111111122"])
require.Equal(t, "Bob", base.userLabels["222222222222222233"].Name)
require.Equal(t, "random", base.channels["111111111111111122"].Name)
// Mutating the copy must leave the source map untouched.
lookup := copyChannelLookup(base.channels)
lookup["111111111111111122"] = store.ChannelRecord{ID: "changed"}
require.Equal(t, "random", base.channels["111111111111111122"].Name)
}
// TestSnapshotWithoutMessageEvents verifies that only AppendEvent is cleared
// in the derived snapshot, and that the source snapshot keeps its original options.
func TestSnapshotWithoutMessageEvents(t *testing.T) {
snap := newSnapshot()
snap.messages["333333333333333346"] = store.MessageMutation{
Record: store.MessageRecord{ID: "333333333333333346"},
Options: store.WriteOptions{
AppendEvent: true,
EnqueueEmbedding: true,
},
}
stripped := snapshotWithoutMessageEvents(snap)
require.False(t, stripped.messages["333333333333333346"].Options.AppendEvent)
require.True(t, stripped.messages["333333333333333346"].Options.EnqueueEmbedding)
// The original message mutation is not modified.
require.True(t, snap.messages["333333333333333346"].Options.AppendEvent)
}
// TestRouteFilteredCacheHelpers checks that Cache_Data and Service Worker
// CacheStorage files are classified as cache data while a leveldb log is
// classified as context.
func TestRouteFilteredCacheHelpers(t *testing.T) {
require.Equal(t, fileSourceCacheData, sourceForPath("/tmp/discord", "/tmp/discord/Cache/Cache_Data/entry", "Cache/Cache_Data/entry"))
require.Equal(t, fileSourceCacheData, sourceForPath("/tmp/discord", "/tmp/discord/Service Worker/CacheStorage/cache/entry", "Service Worker/CacheStorage/cache/entry"))
require.Equal(t, fileSourceContext, sourceForPath("/tmp/discord", "/tmp/discord/Local Storage/leveldb/000001.log", "Local Storage/leveldb/000001.log"))
}
// TestCacheFileHasRouteHint exercises the route sniffing helper against a file
// containing an API messages URL, a file without any route, and a missing file.
func TestCacheFileHasRouteHint(t *testing.T) {
dir := t.TempDir()
require.NoError(t, os.WriteFile(filepath.Join(dir, "route"), []byte("https://discord.com/api/v9/channels/111111111111111121/messages?limit=50"), 0o600))
require.NoError(t, os.WriteFile(filepath.Join(dir, "plain"), []byte("no discord route here"), 0o600))
root, err := os.OpenRoot(dir)
require.NoError(t, err)
defer func() { _ = root.Close() }()
ok, err := cacheFileHasRouteHint(root, "route")
require.NoError(t, err)
require.True(t, ok)
ok, err = cacheFileHasRouteHint(root, "plain")
require.NoError(t, err)
require.False(t, ok)
// A file that cannot be opened surfaces as an error rather than "no hint".
_, err = cacheFileHasRouteHint(root, "missing")
require.Error(t, err)
}
// TestImportAndStateEdgeBranches walks through edge cases of Import and
// loadScanState: a nil store, the default path on Linux, a directory with no
// scannable candidates, dry runs without a store, and a corrupt stored file index.
func TestImportAndStateEdgeBranches(t *testing.T) {
ctx := context.Background()
// A real (non-dry-run) import needs a store.
_, err := Import(ctx, nil, Options{})
require.ErrorContains(t, err, "store is required")
configHome := t.TempDir()
t.Setenv("XDG_CONFIG_HOME", configHome)
// The XDG-based default is only asserted on Linux.
if runtime.GOOS == "linux" {
require.Equal(t, filepath.Join(configHome, "discord"), DefaultPath())
}
dir := t.TempDir()
s, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
require.NoError(t, err)
defer func() { _ = s.Close() }()
// Importing this directory still produces exactly one checkpoint.
stats, err := Import(ctx, s, Options{
Path: dir,
Now: func() time.Time { return time.Date(2026, 5, 8, 12, 0, 0, 0, time.UTC) },
})
require.NoError(t, err)
require.Equal(t, dir, stats.Path)
require.Equal(t, 1, stats.Checkpoints)
// Dry runs accept a nil store, even when the path does not exist.
stats, err = Import(ctx, nil, Options{Path: filepath.Join(dir, "missing"), DryRun: true})
require.NoError(t, err)
require.True(t, stats.DryRun)
stats, err = Import(ctx, nil, Options{Path: dir, DryRun: true, FullCache: true})
require.NoError(t, err)
require.True(t, stats.FullCache)
// A stored index that is not valid JSON yields an empty previous index, not an error.
require.NoError(t, s.SetSyncState(ctx, fileIndexScope(Options{}), "{not-json"))
require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "c1", GuildID: "g1", Kind: "text", Name: "general", RawJSON: `{}`}))
state, err := loadScanState(ctx, s, Options{})
require.NoError(t, err)
require.Empty(t, state.previous)
require.Equal(t, "general", state.channels["c1"].Name)
}
// TestSnapshotFinalizeAndCommitBranches covers finalizeSnapshot's handling of
// resolved and unresolved messages, the unresolved bookkeeping helpers, and
// the dry-run / non-checkpoint / checkpoint branches of commitSnapshot and
// checkpointScannedCandidates.
func TestSnapshotFinalizeAndCommitBranches(t *testing.T) {
ctx := context.Background()
s, err := store.Open(ctx, filepath.Join(t.TempDir(), "discrawl.db"))
require.NoError(t, err)
defer func() { _ = s.Close() }()
snap := newSnapshot()
// m-missing has no guild and its channel is unknown, so it must end up unresolved.
snap.messages["m-missing"] = store.MessageMutation{
Record: store.MessageRecord{ID: "m-missing", ChannelID: "c-missing", RawJSON: `{}`},
}
snap.messages["m-known"] = store.MessageMutation{
Record: store.MessageRecord{ID: "m-known", GuildID: "g1", ChannelID: "c1", ChannelName: "general", RawJSON: `{}`},
}
stats := &Stats{}
totals := newScanTotals()
unresolved := finalizeSnapshot(snap, map[string]store.ChannelRecord{
"c1": {ID: "c1", GuildID: "g1", Kind: "text", Name: "general", RawJSON: `{}`},
}, totals, stats, true)
require.Equal(t, unresolvedMessages{"m-missing": "c-missing"}, unresolved)
require.Equal(t, 1, stats.Messages)
require.Equal(t, 1, stats.SkippedMessages)
require.Equal(t, "general", snap.channels["c1"].Name)
require.Equal(t, "g1", snap.guilds["g1"].ID)
// Merging one more unresolved message raises the skipped count to two.
more := unresolvedMessages{"m2": "c2"}
mergeUnresolved(unresolved, more)
recordUnresolved(unresolved, totals, stats)
require.Equal(t, 2, stats.SkippedMessages)
state := scanState{current: map[string]fileFingerprint{}}
candidates := []fileCandidate{{relKey: "Cache_Data/entry", fingerprint: fileFingerprint{Size: 10, ModUnixNS: 20}}}
// Dry run, non-checkpoint, then checkpoint commit of an empty snapshot.
require.NoError(t, commitSnapshot(ctx, s, Options{DryRun: true}, state, candidates, newSnapshot(), true, stats))
require.NoError(t, commitSnapshot(ctx, s, Options{}, state, candidates, newSnapshot(), false, stats))
require.NoError(t, commitSnapshot(ctx, s, Options{}, state, candidates, newSnapshot(), true, stats))
require.True(t, isImportedFingerprint(state.current["Cache_Data/entry"]))
require.NoError(t, checkpointScannedCandidates(ctx, s, Options{DryRun: true}, state, candidates, stats))
require.NoError(t, checkpointScannedCandidates(ctx, s, Options{}, state, candidates, stats))
}
// TestRouteHintCollectionBranches verifies that collectCacheRouteHints
// tolerates missing and route-less files, records the guild for a channel
// route it finds, and stops with context.Canceled on a cancelled context.
func TestRouteHintCollectionBranches(t *testing.T) {
dir := t.TempDir()
require.NoError(t, os.WriteFile(filepath.Join(dir, "route"), []byte("https://discord.com/channels/123456789012/111111111111111121"), 0o600))
require.NoError(t, os.WriteFile(filepath.Join(dir, "plain"), []byte("plain"), 0o600))
root, err := os.OpenRoot(dir)
require.NoError(t, err)
defer func() { _ = root.Close() }()
snap := newSnapshot()
// The missing file must not abort the collection.
err = collectCacheRouteHints(context.Background(), root, []fileCandidate{
{relPath: "missing"},
{relPath: "plain"},
{relPath: "route"},
}, snap)
require.NoError(t, err)
// Routes are keyed by channel ID and hold the guild ID.
require.Equal(t, "123456789012", snap.routes["111111111111111121"])
canceled, cancel := context.WithCancel(context.Background())
cancel()
require.ErrorIs(t, collectCacheRouteHints(canceled, root, []fileCandidate{{relPath: "route"}}, newSnapshot()), context.Canceled)
}

View File

@ -0,0 +1,387 @@
package discorddesktop
import (
"context"
"fmt"
"os"
"path/filepath"
"strconv"
"testing"
"github.com/stretchr/testify/require"
"github.com/openclaw/discrawl/internal/store"
)
// TestImportFastCacheSkipsUnroutedCacheDataUnlessFullCache checks that a
// Cache_Data entry without any route hint is fast-skipped by the default
// import, stays skipped (reported as unchanged) on a repeat run, and is
// scanned and imported once FullCache is requested — both on the same store
// and on a fresh one.
func TestImportFastCacheSkipsUnroutedCacheDataUnlessFullCache(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
require.NoError(t, os.MkdirAll(cachePath, 0o755))
// The entry holds a channel and a message as JSON but no channel/API route URL.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_0"), []byte(`
{"id":"111111111111111121","guild_id":"999999999999999996","type":0,"name":"slow-cache"}
{"id":"333333333333333346","channel_id":"111111111111111121","content":"unrouted historical cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`), 0o600))
fastStore, err := store.Open(ctx, filepath.Join(dir, "fast.db"))
require.NoError(t, err)
defer func() { _ = fastStore.Close() }()
// First default import: file is fast-skipped, nothing is imported.
stats, err := Import(ctx, fastStore, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, 1, stats.CacheFilesFastSkipped)
require.Equal(t, 0, stats.Messages)
results, err := fastStore.SearchMessages(ctx, store.SearchOptions{Query: "unrouted historical", Limit: 10})
require.NoError(t, err)
require.Empty(t, results)
// Second default import: the skipped entry is remembered and counted as unchanged.
stats, err = Import(ctx, fastStore, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, 0, stats.CacheFilesFastSkipped)
require.Equal(t, 1, stats.FilesUnchanged)
// FullCache on the same store: the previously skipped file is scanned.
stats, err = Import(ctx, fastStore, Options{Path: dir, FullCache: true})
require.NoError(t, err)
require.Equal(t, 1, stats.FilesScanned)
require.Equal(t, 1, stats.Messages)
fullStore, err := store.Open(ctx, filepath.Join(dir, "full.db"))
require.NoError(t, err)
defer func() { _ = fullStore.Close() }()
// FullCache on a fresh store: the message becomes searchable with its channel name.
stats, err = Import(ctx, fullStore, Options{Path: dir, FullCache: true})
require.NoError(t, err)
require.Equal(t, 1, stats.FilesScanned)
require.Equal(t, 0, stats.CacheFilesFastSkipped)
require.Equal(t, 1, stats.Messages)
results, err = fullStore.SearchMessages(ctx, store.SearchOptions{Query: "unrouted historical", Limit: 10})
require.NoError(t, err)
require.Len(t, results, 1)
require.Equal(t, "slow-cache", results[0].ChannelName)
}
// TestImportCheckpointsCacheBatches writes checkpointEveryFiles+1 routed cache
// entries, one message each, and verifies that the import scans all of them,
// records at least two checkpoints, and reports every file as unchanged on a
// second run.
func TestImportCheckpointsCacheBatches(t *testing.T) {
	ctx := context.Background()
	dir := t.TempDir()
	cachePath := filepath.Join(dir, "Cache", "Cache_Data")
	require.NoError(t, os.MkdirAll(cachePath, 0o755))
	for i := range checkpointEveryFiles + 1 {
		channelID := "111111111111111121"
		// Build the ID as int64: the base constant does not fit in a 32-bit
		// int, so adding it to the int loop variable directly would fail to
		// compile on 32-bit targets.
		messageID := int64(333333333333333346) + int64(i)
		body := bytesf(`https://discord.com/channels/999999999999999996/%s
{"id":"%d","channel_id":"%s","content":"checkpoint cache %d","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, channelID, messageID, channelID, i)
		require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), body, 0o600))
	}
	st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
	require.NoError(t, err)
	defer func() { _ = st.Close() }()

	// First import: every file is scanned and each contributes one message.
	stats, err := Import(ctx, st, Options{Path: dir})
	require.NoError(t, err)
	require.Equal(t, checkpointEveryFiles+1, stats.FilesScanned)
	require.Equal(t, checkpointEveryFiles+1, stats.Messages)
	require.GreaterOrEqual(t, stats.Checkpoints, 2)

	// Second import: nothing changed on disk, so nothing is rescanned.
	stats, err = Import(ctx, st, Options{Path: dir})
	require.NoError(t, err)
	require.Equal(t, 0, stats.FilesScanned)
	require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
}
// TestImportUsesLaterCacheMetadataBeforeCheckpointingEarlierBatch places a
// message in the first cache file and the channel record describing its
// channel in the file at index checkpointEveryFiles, then verifies the message
// is stored with the guild and channel name from that later file.
func TestImportUsesLaterCacheMetadataBeforeCheckpointingEarlierBatch(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
require.NoError(t, os.MkdirAll(cachePath, 0o755))
channelID := "111111111111111121"
guildID := "999999999999999996"
// entry_000: API route plus a message, but no channel or guild metadata.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"333333333333333346","channel_id":"%s","content":"needs later channel metadata","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, channelID, channelID), 0o600))
// Filler files containing only the route line pad the numbering up to
// index checkpointEveryFiles-1.
for i := 1; i < checkpointEveryFiles; i++ {
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), bytesf(
"https://discord.com/api/v9/channels/%s/messages?limit=50\n",
channelID,
), 0o600))
}
// The file at index checkpointEveryFiles carries the channel record.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", checkpointEveryFiles)), bytesf(`https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"%s","guild_id":"%s","type":0,"name":"later-metadata"}
`, channelID, channelID, guildID), 0o600))
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
// The expected scan count is the checkpointEveryFiles+1 files written plus
// checkpointEveryFiles more; presumably the first batch is scanned again by
// the retry pass once the channel metadata is known.
require.Equal(t, checkpointEveryFiles+1+checkpointEveryFiles, stats.FilesScanned)
require.Equal(t, 1, stats.Messages)
require.GreaterOrEqual(t, stats.Checkpoints, 2)
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "needs later channel metadata", Limit: 10})
require.NoError(t, err)
require.Len(t, results, 1)
require.Equal(t, guildID, results[0].GuildID)
require.Equal(t, "later-metadata", results[0].ChannelName)
// Exactly one row in message_events despite the extra scans.
requireMessageCount(t, ctx, st, "message_events", 1)
// A second import finds every file unchanged and adds no further event row.
stats, err = Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
requireMessageCount(t, ctx, st, "message_events", 1)
}
// TestImportCheckpointsPartiallyResolvedRetryBatch covers a first cache file
// with two messages in different channels where only one channel gets a
// channel record from a later file: the described message must be imported,
// the other counted as skipped, and all files reported unchanged afterwards.
func TestImportCheckpointsPartiallyResolvedRetryBatch(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
require.NoError(t, os.MkdirAll(cachePath, 0o755))
resolvedChannelID := "111111111111111121"
unresolvedChannelID := "111111111111111122"
guildID := "999999999999999996"
// entry_000: API routes for both channels and one message in each.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/api/v10/channels/%s/messages?limit=50
https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"333333333333333346","channel_id":"%s","content":"partially resolved retry message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
{"id":"333333333333333347","channel_id":"%s","content":"still unresolved retry message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222232","username":"alice"}}
`, resolvedChannelID, unresolvedChannelID, resolvedChannelID, unresolvedChannelID), 0o600))
// Filler files containing only a route line for the resolved channel.
for i := 1; i < checkpointEveryFiles; i++ {
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), bytesf(
"https://discord.com/api/v9/channels/%s/messages?limit=50\n",
resolvedChannelID,
), 0o600))
}
// The file at index checkpointEveryFiles describes only the resolved channel.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", checkpointEveryFiles)), bytesf(`https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"%s","guild_id":"%s","type":0,"name":"partially-resolved"}
`, resolvedChannelID, resolvedChannelID, guildID), 0o600))
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
// Expected scans: the checkpointEveryFiles+1 files written plus
// checkpointEveryFiles more (presumably the retried first batch).
require.Equal(t, checkpointEveryFiles+1+checkpointEveryFiles, stats.FilesScanned)
require.Equal(t, 1, stats.Messages)
require.Equal(t, 1, stats.SkippedMessages)
require.GreaterOrEqual(t, stats.Checkpoints, 2)
// The message in the described channel is searchable under its channel name.
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "partially resolved retry", Limit: 10})
require.NoError(t, err)
require.Len(t, results, 1)
require.Equal(t, "partially-resolved", results[0].ChannelName)
// The message in the undescribed channel must not have been stored.
results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "still unresolved retry", Limit: 10})
require.NoError(t, err)
require.Empty(t, results)
requireMessageCount(t, ctx, st, "message_events", 1)
// Re-import: every file is unchanged and no further event row is added.
stats, err = Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
requireMessageCount(t, ctx, st, "message_events", 1)
}
// TestImportCheckpointsUnresolvableRouteBearingCacheMisses verifies that a
// routed cache file whose message never gets channel metadata is still
// checkpointed: the message is counted as skipped and a second import reports
// the file as unchanged rather than scanning it again.
func TestImportCheckpointsUnresolvableRouteBearingCacheMisses(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
require.NoError(t, os.MkdirAll(cachePath, 0o755))
channelID := "111111111111111121"
// Only file: an API route and a message, with no channel record anywhere.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"333333333333333346","channel_id":"%s","content":"permanent unresolved cache miss","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, channelID, channelID), 0o600))
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 1, stats.FilesScanned)
require.Equal(t, 1, stats.SkippedMessages)
require.Equal(t, 1, stats.Checkpoints)
// The skipped message must not be searchable.
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "permanent unresolved", Limit: 10})
require.NoError(t, err)
require.Empty(t, results)
// Re-import: the file is recognised as already processed.
stats, err = Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, 1, stats.FilesUnchanged)
}
// TestImportDoesNotAppendEventsForSkippedMixedBatch imports one cache file
// containing a message routable through a guild/channel URL and a message in a
// channel known only from an API route. It verifies the first message is
// stored, the second is not, and message_events stays empty on both the first
// and the repeated import.
func TestImportDoesNotAppendEventsForSkippedMixedBatch(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
require.NoError(t, os.MkdirAll(cachePath, 0o755))
guildID := "999999999999999996"
resolvedChannelID := "111111111111111121"
unresolvedChannelID := "111111111111111122"
// Line 1 ties resolvedChannelID to guildID; line 2 is an API route that
// names unresolvedChannelID without any guild.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/channels/%s/%s
https://discord.com/api/v9/channels/%s/messages?limit=50
{"id":"333333333333333346","channel_id":"%s","content":"mixed resolved message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
{"id":"333333333333333347","channel_id":"%s","content":"mixed unresolved message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222232","username":"alice"}}
`, guildID, resolvedChannelID, unresolvedChannelID, resolvedChannelID, unresolvedChannelID), 0o600))
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 1, stats.FilesScanned)
require.Equal(t, 1, stats.Checkpoints)
// No event rows are expected for this mixed batch, even though the
// resolved message itself is searchable below.
requireMessageCount(t, ctx, st, "message_events", 0)
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "mixed resolved", Limit: 10})
require.NoError(t, err)
require.Len(t, results, 1)
results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "mixed unresolved", Limit: 10})
require.NoError(t, err)
require.Empty(t, results)
// Re-import: file unchanged, still no event rows.
stats, err = Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, 1, stats.FilesUnchanged)
requireMessageCount(t, ctx, st, "message_events", 0)
}
// TestImportDoesNotDuplicateEventsWhenSwitchingFullCacheModes imports the same
// routed cache file twice into one store, toggling Options.FullCache between
// the runs, and verifies the second run treats the file as unchanged and
// message_events keeps exactly one row. Both orderings run as subtests.
func TestImportDoesNotDuplicateEventsWhenSwitchingFullCacheModes(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
require.NoError(t, os.MkdirAll(cachePath, 0o755))
channelID := "111111111111111121"
guildID := "999999999999999996"
// Shared fixture: guild/channel URL, channel record, and one message.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/channels/%s/%s
{"id":"%s","guild_id":"%s","type":0,"name":"mode-switch"}
{"id":"333333333333333346","channel_id":"%s","content":"mode switch event once","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, guildID, channelID, channelID, guildID, channelID), 0o600))
// Each subtest opens its own database file so the orderings do not interact.
t.Run("full then default", func(t *testing.T) {
st, err := store.Open(ctx, filepath.Join(dir, "full-first.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{Path: dir, FullCache: true})
require.NoError(t, err)
require.Equal(t, 1, stats.FilesScanned)
require.Equal(t, 1, stats.Messages)
requireMessageCount(t, ctx, st, "message_events", 1)
// Switching to the default mode must not rescan the file.
stats, err = Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, 1, stats.FilesUnchanged)
requireMessageCount(t, ctx, st, "message_events", 1)
})
t.Run("default then full", func(t *testing.T) {
st, err := store.Open(ctx, filepath.Join(dir, "default-first.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 1, stats.FilesScanned)
require.Equal(t, 1, stats.Messages)
requireMessageCount(t, ctx, st, "message_events", 1)
// Switching to FullCache must not rescan the already imported file.
stats, err = Import(ctx, st, Options{Path: dir, FullCache: true})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, 1, stats.FilesUnchanged)
requireMessageCount(t, ctx, st, "message_events", 1)
})
}
// TestImportFastCachePreservesKnownChannelMetadataAcrossBatches seeds a thread
// channel record (type 11) in Local Storage/leveldb and a message for that
// channel in Cache_Data, then verifies the stored channel keeps the name, kind
// and raw type from the leveldb record after the import.
func TestImportFastCachePreservesKnownChannelMetadataAcrossBatches(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
leveldbPath := filepath.Join(dir, "Local Storage", "leveldb")
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
require.NoError(t, os.MkdirAll(leveldbPath, 0o755))
require.NoError(t, os.MkdirAll(cachePath, 0o755))
channelID := "111111111111111121"
guildID := "999999999999999996"
// leveldb fixture: the only source of the channel's name and type.
require.NoError(t, os.WriteFile(filepath.Join(leveldbPath, "000001.log"), bytesf(
`{"id":"%s","guild_id":"%s","type":11,"name":"known-thread","thread_metadata":{"archived":false}}`,
channelID,
guildID,
), 0o600))
// Cache fixture: a guild/channel URL and a message, no channel record.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_0"), bytesf(`https://discord.com/channels/%s/%s
{"id":"333333333333333346","channel_id":"%s","content":"thread metadata cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`, guildID, channelID, channelID), 0o600))
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 1, stats.Messages)
channels, err := st.Channels(ctx, guildID)
require.NoError(t, err)
require.Len(t, channels, 1)
require.Equal(t, "known-thread", channels[0].Name)
require.Equal(t, "thread_public", channels[0].Kind)
// The stored raw JSON must still carry the original numeric channel type.
_, rows, err := st.ReadOnlyQuery(ctx, "select raw_json from channels where id = '111111111111111121'")
require.NoError(t, err)
require.Len(t, rows, 1)
require.Contains(t, rows[0][0], `"type":11`)
}
// TestImportFastCacheRouteFiltersServiceWorkerCacheStorage verifies that a
// file under Service Worker/CacheStorage containing no Discord URL line is
// fast-skipped by a default import and contributes no searchable message.
func TestImportFastCacheRouteFiltersServiceWorkerCacheStorage(t *testing.T) {
ctx := context.Background()
dir := t.TempDir()
cachePath := filepath.Join(dir, "Service Worker", "CacheStorage", "cache-id")
require.NoError(t, os.MkdirAll(cachePath, 0o755))
// Fixture: channel and message payloads only, no URL line.
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "unrouted"), []byte(`
{"id":"111111111111111121","guild_id":"999999999999999996","type":0,"name":"service-worker-cache"}
{"id":"333333333333333346","channel_id":"111111111111111121","content":"service worker historical cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
`), 0o600))
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
require.NoError(t, err)
defer func() { _ = st.Close() }()
stats, err := Import(ctx, st, Options{Path: dir})
require.NoError(t, err)
require.Equal(t, 0, stats.FilesScanned)
require.Equal(t, 1, stats.CacheFilesFastSkipped)
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "service worker historical", Limit: 10})
require.NoError(t, err)
require.Empty(t, results)
}
// requireMessageCount fails the test unless `select count(*)` over table
// yields a single one-column row whose value is the decimal form of expected.
// table is concatenated into the query verbatim.
func requireMessageCount(t *testing.T, ctx context.Context, st *store.Store, table string, expected int) {
	t.Helper()

	query := "select count(*) from " + table
	_, rows, err := st.ReadOnlyQuery(ctx, query)
	require.NoError(t, err)

	// Exactly one row holding exactly one column is expected.
	require.Len(t, rows, 1)
	row := rows[0]
	require.Len(t, row, 1)

	want := strconv.Itoa(expected)
	require.Equal(t, want, row[0])
}
// bytesf formats args according to format, like fmt.Sprintf, and returns the
// result as a byte slice.
func bytesf(format string, args ...any) []byte {
	var out []byte
	out = fmt.Appendf(out, format, args...)
	return out
}

View File

@ -0,0 +1,110 @@
package discorddesktop
import (
"context"
"os"
"github.com/openclaw/discrawl/internal/store"
)
// importRun carries the state shared by the steps of one import: the inputs
// handed to scanCandidates and commitSnapshot, the snapshot accumulated across
// batches, and the candidates deferred for retryPending.
type importRun struct {
// Inputs forwarded unchanged to the scan and commit helpers.
ctx context.Context
st *store.Store
opts Options
state scanState
rootFS *os.Root
// channelLookup starts as a copy of state.channels (see newImportRun).
channelLookup map[string]store.ChannelRecord
totals scanTotals
stats *Stats
// base is the snapshot that batch and retry snapshots are derived from and
// merged back into.
base snapshot
// pending holds candidates whose snapshot still had unresolved messages.
pending []fileCandidate
// pendingUnresolved accumulates the unresolved messages of those candidates.
pendingUnresolved unresolvedMessages
// pendingLookupSize and pendingRouteSize record len(channelLookup) and
// len(base.routes) at the first deferral; both are -1 until then.
pendingLookupSize int
pendingRouteSize int
}
// newImportRun builds the run state for one import. The channel lookup is a
// copy of state.channels, the totals and base snapshot start fresh, and the
// pending size markers are set to -1 to mean that nothing has been deferred.
func newImportRun(ctx context.Context, st *store.Store, opts Options, state scanState, rootFS *os.Root, stats *Stats) *importRun {
	run := new(importRun)

	// Caller-supplied inputs.
	run.ctx = ctx
	run.st = st
	run.opts = opts
	run.state = state
	run.rootFS = rootFS

	// Working state owned by the run.
	run.channelLookup = copyChannelLookup(state.channels)
	run.totals = newScanTotals()
	run.stats = stats
	run.base = newSnapshot()

	// Deferred-work bookkeeping.
	run.pendingUnresolved = unresolvedMessages{}
	run.pendingLookupSize = -1
	run.pendingRouteSize = -1
	return run
}
// scanContext scans candidates directly into the base snapshot and then
// finalizes and commits that snapshot, without recording skipped messages.
func (r *importRun) scanContext(candidates []fileCandidate) error {
	err := scanCandidates(r.ctx, r.rootFS, r.opts, candidates, r.base, r.channelLookup, r.stats)
	if err == nil {
		err = r.finalizeAndCommit(candidates, r.base, false)
	}
	return err
}
// scanCacheBatches processes candidates in consecutive groups of at most
// checkpointEveryFiles. Each group is scanned into its own snapshot derived
// from the base snapshot, finalized and committed, and then merged back into
// the base. The first error aborts the loop.
func (r *importRun) scanCacheBatches(candidates []fileCandidate) error {
	for remaining := candidates; len(remaining) > 0; {
		size := min(checkpointEveryFiles, len(remaining))
		chunk := remaining[:size]
		remaining = remaining[size:]

		snap := newSnapshotWithContext(r.base)
		err := scanCandidates(r.ctx, r.rootFS, r.opts, chunk, snap, r.channelLookup, r.stats)
		if err != nil {
			return err
		}
		err = r.finalizeAndCommit(chunk, snap, false)
		if err != nil {
			return err
		}
		mergeSnapshotContext(r.base, snap)
	}
	return nil
}
// finalizeAndCommit finalizes snap and commits it together with candidates.
// If finalization leaves unresolved messages, the candidates are deferred for
// a later retry and the commit is issued with its checkpoint flag set to
// false. Nothing is committed when there are no candidates and the snapshot
// has no changes.
func (r *importRun) finalizeAndCommit(candidates []fileCandidate, snap snapshot, recordSkipped bool) error {
	unresolved := finalizeSnapshot(snap, r.channelLookup, r.totals, r.stats, recordSkipped)
	fullyResolved := len(unresolved) == 0
	if !fullyResolved {
		r.deferCandidates(candidates, unresolved)
	}

	nothingToCommit := len(candidates) == 0 && !snapshotHasChanges(snap)
	if nothingToCommit {
		return nil
	}
	return commitSnapshot(r.ctx, r.st, r.opts, r.state, candidates, snap, fullyResolved, r.stats)
}
// deferCandidates queues candidates for retryPending and merges their
// unresolved messages into the pending set. On the first deferral only, it
// also records the current sizes of the channel lookup and the base routes.
func (r *importRun) deferCandidates(candidates []fileCandidate, unresolved unresolvedMessages) {
	r.pending = append(r.pending, candidates...)
	mergeUnresolved(r.pendingUnresolved, unresolved)

	// A negative size marks that no baseline has been captured yet.
	if r.pendingLookupSize < 0 {
		r.pendingLookupSize = len(r.channelLookup)
		r.pendingRouteSize = len(r.base.routes)
	}
}
// retryPending handles the deferred candidates. With nothing pending it is a
// no-op. If pendingCanResolve reports false, the pending unresolved messages
// are recorded and the candidates are checkpointed as scanned. Otherwise the
// candidates are scanned again into a snapshot derived from the base,
// finalized with skipped-message recording enabled, committed with the
// checkpoint flag set, and merged back into the base.
func (r *importRun) retryPending() error {
	switch {
	case len(r.pending) == 0:
		return nil
	case !r.pendingCanResolve():
		recordUnresolved(r.pendingUnresolved, r.totals, r.stats)
		return checkpointScannedCandidates(r.ctx, r.st, r.opts, r.state, r.pending, r.stats)
	}

	snap := newSnapshotWithContext(r.base)
	err := scanCandidates(r.ctx, r.rootFS, r.opts, r.pending, snap, r.channelLookup, r.stats)
	if err != nil {
		return err
	}
	finalizeSnapshot(snap, r.channelLookup, r.totals, r.stats, true)
	err = commitSnapshot(r.ctx, r.st, r.opts, r.state, r.pending, snap, true, r.stats)
	if err != nil {
		return err
	}
	mergeSnapshotContext(r.base, snap)
	return nil
}
// pendingCanResolve reports whether the channel lookup or the base snapshot's
// routes now hold more entries than the sizes recorded at the first deferral.
func (r *importRun) pendingCanResolve() bool {
	if len(r.channelLookup) > r.pendingLookupSize {
		return true
	}
	return len(r.base.routes) > r.pendingRouteSize
}

View File

@ -13,7 +13,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
func TestDesktopPathAndImportHelpers(t *testing.T) {

View File

@ -0,0 +1,165 @@
package discorddesktop
import (
"encoding/json"
"testing"
"time"
"github.com/stretchr/testify/require"
"github.com/openclaw/discrawl/internal/store"
)
// TestPrimitiveValueHelpers pins the behaviour of the map-field accessors
// (stringField, intField, int64Field, boolField), lenArray and firstNonEmpty
// against one fixture map holding a value of each primitive shape.
func TestPrimitiveValueHelpers(t *testing.T) {
	fields := map[string]any{
		"string":      "value",
		"blank":       " ",
		"int":         3,
		"int64":       int64(4),
		"float":       float64(5),
		"json_number": json.Number("6"),
		"numeric":     "7",
		"bad_numeric": "nope",
		"truthy":      true,
		"array":       []any{"one", "two"},
	}

	// stringField: strings and json.Number yield text; blank, int and missing
	// keys yield the empty string.
	require.Equal(t, "value", stringField(fields, "string"))
	require.Equal(t, "6", stringField(fields, "json_number"))
	for _, key := range []string{"blank", "int", "missing"} {
		require.Empty(t, stringField(fields, key))
	}

	// intField: accepted representations.
	accepted := map[string]int{
		"int":         3,
		"float":       5,
		"json_number": 6,
	}
	for name, expected := range accepted {
		value, found := intField(fields, name)
		require.True(t, found, name)
		require.Equal(t, expected, value, name)
	}
	// intField: rejected representations and missing keys.
	for _, name := range []string{"bad_numeric", "int64", "numeric", "missing"} {
		_, found := intField(fields, name)
		require.False(t, found)
	}

	// int64Field: numeric kinds convert; string values yield zero.
	int64Cases := []struct {
		key  string
		want int64
	}{
		{"int", 3},
		{"int64", 4},
		{"float", 5},
		{"json_number", 6},
	}
	for _, tc := range int64Cases {
		require.Equal(t, tc.want, int64Field(fields, tc.key))
	}
	for _, key := range []string{"numeric", "bad_numeric"} {
		require.Zero(t, int64Field(fields, key))
	}

	require.True(t, boolField(fields, "truthy"))
	require.False(t, boolField(fields, "missing"))

	require.Equal(t, 2, lenArray(fields["array"]))
	require.Zero(t, lenArray(fields["string"]))

	require.Equal(t, "fallback", firstNonEmpty("", " ", "fallback", "later"))
	require.Empty(t, firstNonEmpty("", " "))
}
// TestDiscordValueFormatHelpers pins the output of shortID, guildName and
// kindForChannelType for a fixed set of inputs.
func TestDiscordValueFormatHelpers(t *testing.T) {
	expect := func(want, got string) {
		t.Helper()
		require.Equal(t, want, got)
	}

	// shortID keeps the trailing six characters of the longer ID and returns
	// the five-character input unchanged.
	expect("456789", shortID("123456789"))
	expect("short", shortID("short"))

	// guildName has a fixed label for the direct-message guild.
	expect("Discord Direct Messages", guildName(DirectMessageGuildID))
	expect("Discord Desktop Guild 123456", guildName("123456"))

	// kindForChannelType: direct-message kinds.
	expect("dm", kindForChannelType(1, true))
	expect("group_dm", kindForChannelType(3, true))

	// kindForChannelType: thread kinds.
	expect("thread_public", kindForChannelType(11, false))
	expect("thread_private", kindForChannelType(12, false))
	expect("thread_announcement", kindForChannelType(10, false))

	// kindForChannelType: remaining channel types.
	expect("desktop", kindForChannelType(2, false))
	expect("desktop", kindForChannelType(4, false))
	expect("announcement", kindForChannelType(5, false))
	expect("forum", kindForChannelType(15, false))
	expect("desktop", kindForChannelType(16, false))
	expect("text", kindForChannelType(0, false))
}
// TestDiscordMessagePayloadHelpers runs the message-payload helpers
// (attachments, mentions, embeds, text normalisation, synthetic records, raw
// JSON builders, time and snowflake parsing) against one hand-built message
// map and pins their exact outputs.
func TestDiscordMessagePayloadHelpers(t *testing.T) {
// Fixture message. Several lists deliberately contain entries the
// assertions below expect to be dropped: a non-map attachment, a mention
// without an id, and an embed whose title is only whitespace.
raw := map[string]any{
"id": "333333333333333333",
"channel_id": "111111111111111111",
"guild_id": "999999999999999999",
"type": float64(0),
"timestamp": "2026-05-08T12:00:00Z",
"edited_timestamp": "2026-05-08T12:05:00Z",
"content": "hello\u200b\nworld",
"message_reference": map[string]any{"message_id": "222222222222222222"},
"author": map[string]any{
"id": "444444444444444444",
"username": "peter",
"global_name": "Peter",
"display_name": "Peter S",
"discriminator": "0",
"bot": true,
},
"attachments": []any{
map[string]any{"filename": "trace.txt", "content_type": "text/plain", "size": float64(12), "url": "https://cdn.example/trace.txt"},
map[string]any{"id": "att2"},
"ignored",
},
"mentions": []any{
map[string]any{"id": "555555555555555555", "username": "alice", "global_name": "Alice"},
map[string]any{"username": "missing"},
},
"embeds": []any{
map[string]any{"title": "Deploy", "description": "Ready"},
map[string]any{"title": " "},
},
}
at := parseDiscordTime("2026-05-08T12:00:00Z")
// Attachments: two records are expected from the three list entries. The
// first gets the ID "<message id>:0"; the second reports its id as filename.
attachments := parseAttachments(raw, "333333333333333333", "999999999999999999", "111111111111111111", "444444444444444444")
require.Len(t, attachments, 2)
require.Equal(t, "333333333333333333:0", attachments[0].AttachmentID)
require.Equal(t, "trace.txt", attachments[0].Filename)
require.Equal(t, "att2", attachments[1].Filename)
require.Equal(t, []string{"trace.txt", "att2"}, attachmentText(attachments))
// Mentions: only the entry with an id is expected, named by its global_name.
mentions := parseMentions(raw, "333333333333333333", "999999999999999999", "111111111111111111", "444444444444444444", at)
require.Equal(t, []store.MentionEventRecord{{
MessageID: "333333333333333333",
GuildID: "999999999999999999",
ChannelID: "111111111111111111",
AuthorID: "444444444444444444",
TargetType: "user",
TargetID: "555555555555555555",
TargetName: "Alice",
EventAt: at.Format(time.RFC3339Nano),
}}, mentions)
// Embed and normalised text: content, attachment names and embed text are
// expected joined by newlines in that order.
require.Equal(t, []string{"Deploy", "Ready"}, embedText(raw))
require.Equal(t, "helloworld\ntrace.txt\natt2\nDeploy\nReady", normalizeText(raw["content"], attachmentText(attachments), embedText(raw)))
require.Equal(t, "hidden text", cleanText("\u200bhidden\x00 text\n"))
require.Equal(t, "222222222222222222", messageReferenceID(raw))
require.Empty(t, messageReferenceID(map[string]any{}))
// Synthetic guild/channel records and raw JSON builders.
require.Contains(t, syntheticGuild("g1", "Guild").RawJSON, "discord_desktop")
require.Equal(t, "dm", syntheticChannel("c1", DirectMessageGuildID, "Alice").Kind)
require.Equal(t, "group_dm", syntheticChannel("c2", DirectMessageGuildID, "Alice, Bob").Kind)
require.Equal(t, "channel-123456", syntheticChannel("123456123456", "g1", "").Name)
require.Contains(t, channelRawJSON(raw, "c1", "g1", "general", "text"), `"kind":"text"`)
require.Contains(t, messageRawJSON(raw, "333333333333333333", "999999999999999999", "111111111111111111", "444444444444444444"), "desktop_cache_note")
// recipientLabel: the expected label lists names alphabetically and omits
// the recipient that has no name fields.
require.Equal(t, "Alice, Bob", recipientLabel([]any{
map[string]any{"username": "Bob"},
map[string]any{"global_name": "Alice"},
map[string]any{},
}))
// Time and snowflake helpers: invalid input yields the zero time.
require.True(t, parseDiscordTime("2026-05-08T12:00:00.123Z").Equal(time.Date(2026, 5, 8, 12, 0, 0, 123000000, time.UTC)))
require.True(t, parseDiscordTime("bad").IsZero())
require.True(t, parseDiscordTime("").IsZero())
require.False(t, snowflakeTime("175928847299117063").IsZero())
require.True(t, snowflakeTime("bad").IsZero())
require.Empty(t, formatOptionalTime(time.Time{}))
require.Equal(t, "2026-05-08T12:00:00Z", formatOptionalTime(at))
// looksSnowflake: a 12-digit string passes; a 3-digit string and a string
// with a non-digit fail.
require.True(t, looksSnowflake("123456789012"))
require.False(t, looksSnowflake("123"))
require.False(t, looksSnowflake("12345678901x"))
}

View File

@ -1,91 +0,0 @@
package embed
import (
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"net/http"
)
// ollamaProvider is the Provider implementation that posts embedding requests
// to baseURL + "/api/embed".
type ollamaProvider struct {
// client performs the HTTP requests.
client *http.Client
// baseURL is prefixed to the endpoint path.
baseURL string
// model is sent with every request and is the fallback model name when the
// response does not report one.
model string
// maxInputChars is passed to trimInputs before inputs are sent.
maxInputChars int
}
// ollamaEmbedRequest is the JSON body sent to the /api/embed endpoint.
type ollamaEmbedRequest struct {
Model string `json:"model"`
Input []string `json:"input"`
}
// ollamaEmbedResponse is the JSON body decoded from the /api/embed endpoint.
// Embed requires Embeddings to hold one vector per input.
type ollamaEmbedResponse struct {
Model string `json:"model"`
Embeddings [][]float32 `json:"embeddings"`
}
// newOllamaProvider returns an Ollama-backed Provider that uses the HTTP
// client, base URL, model and input-length limit from settings.
func newOllamaProvider(settings providerSettings) Provider {
	provider := new(ollamaProvider)
	provider.client = settings.HTTPClient
	provider.baseURL = settings.BaseURL
	provider.model = settings.Model
	provider.maxInputChars = settings.MaxInputChars
	return provider
}
// Embed requests one embedding per input from the /api/embed endpoint.
//
// An empty inputs slice returns a batch carrying only the configured model,
// without any request. Inputs go through trimInputs before being sent. An
// error is returned when the request fails, when the number of returned
// vectors differs from the number of inputs, or when inferDimensions rejects
// the vectors. The batch reports the model named in the response, falling back
// to the configured model when the response leaves it empty.
func (p *ollamaProvider) Embed(ctx context.Context, inputs []string) (EmbeddingBatch, error) {
	want := len(inputs)
	if want == 0 {
		return EmbeddingBatch{Model: p.model}, nil
	}

	request := ollamaEmbedRequest{Model: p.model, Input: trimInputs(inputs, p.maxInputChars)}
	var reply ollamaEmbedResponse
	if err := postJSON(ctx, p.client, p.baseURL+"/api/embed", "", request, &reply); err != nil {
		return EmbeddingBatch{}, err
	}
	if got := len(reply.Embeddings); got != want {
		return EmbeddingBatch{}, fmt.Errorf("ollama embedding response returned %d vectors for %d inputs", got, want)
	}

	dims, err := inferDimensions(reply.Embeddings)
	if err != nil {
		return EmbeddingBatch{}, err
	}

	batch := EmbeddingBatch{Model: p.model, Dimensions: dims, Vectors: reply.Embeddings}
	if reply.Model != "" {
		batch.Model = reply.Model
	}
	return batch, nil
}
// postJSON marshals payload to JSON, POSTs it to endpoint with client, and
// decodes a 2xx JSON response into target.
//
// A non-empty apiKey is sent as an "Authorization: Bearer" header. A non-2xx
// status yields a *HTTPError whose Body holds at most the first 4096 bytes of
// the response. Marshal, request-construction, transport and decode failures
// are returned wrapped with a short description of the failing step.
//
// Before the response body is closed, any unread remainder is discarded up to
// a fixed cap, because net/http can only reuse a keep-alive connection when
// the body has been read to EOF.
func postJSON(ctx context.Context, client *http.Client, endpoint, apiKey string, payload any, target any) error {
	// Upper bound on the leftover response bytes discarded before Close, so an
	// oversized body cannot stall the caller.
	const maxDrainBytes = 64 << 10

	body, err := json.Marshal(payload)
	if err != nil {
		return fmt.Errorf("marshal embedding request: %w", err)
	}
	req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
	if err != nil {
		return fmt.Errorf("build embedding request: %w", err)
	}
	req.Header.Set("Content-Type", "application/json")
	req.Header.Set("Accept", "application/json")
	if apiKey != "" {
		req.Header.Set("Authorization", "Bearer "+apiKey)
	}
	resp, err := client.Do(req)
	if err != nil {
		return fmt.Errorf("embedding request failed: %w", err)
	}
	defer func() {
		// Best effort: drain and close errors cannot change the outcome here.
		_, _ = io.Copy(io.Discard, io.LimitReader(resp.Body, maxDrainBytes))
		_ = resp.Body.Close()
	}()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// Keep only a bounded prefix of the error body for the message.
		msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
		return &HTTPError{StatusCode: resp.StatusCode, Body: string(msg)}
	}
	if err := json.NewDecoder(resp.Body).Decode(target); err != nil {
		return fmt.Errorf("decode embedding response: %w", err)
	}
	return nil
}

View File

@ -1,82 +0,0 @@
package embed
import (
"context"
"fmt"
"net/http"
)
// openAICompatibleProvider is the Provider implementation that posts embedding
// requests to baseURL + "/embeddings".
type openAICompatibleProvider struct {
// client performs the HTTP requests.
client *http.Client
// baseURL is prefixed to the endpoint path.
baseURL string
// apiKey is handed to postJSON, which sends it as a bearer token when it is
// non-empty.
apiKey string
// model is sent with every request and is the fallback model name when the
// response does not report one.
model string
// maxInputChars is passed to trimInputs before inputs are sent.
maxInputChars int
}
// openAIEmbeddingRequest is the JSON body sent to the /embeddings endpoint.
type openAIEmbeddingRequest struct {
Model string `json:"model"`
Input []string `json:"input"`
}
// openAIEmbeddingResponse is the JSON body decoded from the /embeddings
// endpoint. Embed requires Data to hold one item per input.
type openAIEmbeddingResponse struct {
Model string `json:"model"`
Data []openAIEmbeddingItem `json:"data"`
}
// openAIEmbeddingItem is one entry of the response's data array.
type openAIEmbeddingItem struct {
// Index is a pointer so that an absent index stays nil; Embed then uses the
// item's position in the array instead.
Index *int `json:"index"`
Embedding []float32 `json:"embedding"`
}
// newOpenAICompatibleProvider returns a Provider for an OpenAI-style
// /embeddings endpoint that uses the HTTP client, base URL, API key, model and
// input-length limit from settings.
func newOpenAICompatibleProvider(settings providerSettings) Provider {
	provider := new(openAICompatibleProvider)
	provider.client = settings.HTTPClient
	provider.baseURL = settings.BaseURL
	provider.apiKey = settings.APIKey
	provider.model = settings.Model
	provider.maxInputChars = settings.MaxInputChars
	return provider
}
// Embed requests one embedding per input from the /embeddings endpoint and
// returns the vectors in input order.
//
// An empty inputs slice returns a batch carrying only the configured model,
// without any request. Inputs go through trimInputs before being sent. Each
// response item is stored at its reported index, or at its position in the
// data array when it reports none. An error is returned when the request
// fails, when the item count differs from the input count, when an index is
// out of range or repeated, or when inferDimensions rejects the vectors. The
// batch reports the model named in the response, falling back to the
// configured model when the response leaves it empty.
func (p *openAICompatibleProvider) Embed(ctx context.Context, inputs []string) (EmbeddingBatch, error) {
	total := len(inputs)
	if total == 0 {
		return EmbeddingBatch{Model: p.model}, nil
	}

	request := openAIEmbeddingRequest{Model: p.model, Input: trimInputs(inputs, p.maxInputChars)}
	var reply openAIEmbeddingResponse
	if err := postJSON(ctx, p.client, p.baseURL+"/embeddings", p.apiKey, request, &reply); err != nil {
		return EmbeddingBatch{}, err
	}
	if got := len(reply.Data); got != total {
		return EmbeddingBatch{}, fmt.Errorf("openai-compatible embedding response returned %d vectors for %d inputs", got, total)
	}

	ordered := make([][]float32, total)
	filled := make([]bool, total)
	for i := range reply.Data {
		item := &reply.Data[i]
		slot := i
		if item.Index != nil {
			slot = *item.Index
		}
		switch {
		case slot < 0 || slot >= total:
			return EmbeddingBatch{}, fmt.Errorf("openai-compatible embedding response index %d out of range", slot)
		case filled[slot]:
			return EmbeddingBatch{}, fmt.Errorf("openai-compatible embedding response duplicated index %d", slot)
		}
		filled[slot] = true
		ordered[slot] = item.Embedding
	}

	dims, err := inferDimensions(ordered)
	if err != nil {
		return EmbeddingBatch{}, err
	}

	model := p.model
	if reply.Model != "" {
		model = reply.Model
	}
	return EmbeddingBatch{Model: model, Dimensions: dims, Vectors: ordered}, nil
}

View File

@ -1,310 +0,0 @@
package embed
import (
"context"
"errors"
"fmt"
"net"
"net/http"
"net/url"
"os"
"strings"
"time"
"github.com/steipete/discrawl/internal/config"
)
// Provider names, default endpoints, default models and default limits for
// embedding providers.
const (
// Provider names accepted by resolveProviderConfig.
ProviderOpenAI = "openai"
ProviderOllama = "ollama"
ProviderLlamaCpp = "llamacpp"
ProviderOpenAICompatible = "openai_compatible"
// Base URLs applied when the configured base_url is empty. The
// openai_compatible provider has no default and must set base_url.
DefaultOpenAIBaseURL = "https://api.openai.com/v1"
DefaultOllamaBaseURL = "http://127.0.0.1:11434"
DefaultLlamaCppBaseURL = "http://127.0.0.1:8080/v1"
// Default model names; presumably chosen per provider by defaultModel
// (not visible here, confirm).
DefaultOpenAIModel = "text-embedding-3-small"
DefaultLocalEmbeddingModel = "nomic-embed-text"
// NOTE(review): DefaultBatchSize is not referenced in the visible code;
// presumably consumed by callers that batch inputs.
DefaultBatchSize = 64
// DefaultMaxInputChars applies when the configured MaxInputChars is <= 0.
DefaultMaxInputChars = 12000
// DefaultRequestTimeout applies when request_timeout is not configured.
DefaultRequestTimeout = 2 * time.Minute
// DefaultProbeTimeout bounds the probe performed by CheckProvider.
DefaultProbeTimeout = 2 * time.Second
)
// Provider is the narrow embedding surface used by later queue/search work.
type Provider interface {
// Embed returns an EmbeddingBatch for inputs. The implementations in this
// package return one vector per input, in input order.
Embed(ctx context.Context, inputs []string) (EmbeddingBatch, error)
}
// EmbeddingBatch is the result of one Provider.Embed call.
type EmbeddingBatch struct {
// Model is the model name reported for the batch.
Model string
// Dimensions is the vector size as computed by inferDimensions.
Dimensions int
// Vectors holds the embedding vectors.
Vectors [][]float32
}
// HTTPError describes an embedding request that was answered with a non-2xx
// HTTP status.
type HTTPError struct {
// StatusCode is the HTTP status of the response.
StatusCode int
// Body is response body text included in the error message.
Body string
}
// Error implements the error interface, reporting the HTTP status code
// followed by the captured response body.
func (e *HTTPError) Error() string {
	const layout = "embedding request failed with HTTP %d: %s"
	return fmt.Sprintf(layout, e.StatusCode, e.Body)
}
// IsRateLimitError reports whether err is, or wraps, an *HTTPError with
// status 429 (Too Many Requests).
func IsRateLimitError(err error) bool {
	var target *HTTPError
	if !errors.As(err, &target) {
		return false
	}
	return target.StatusCode == http.StatusTooManyRequests
}
// CheckResult is the outcome of CheckProvider.
type CheckResult struct {
// Provider, Model and BaseURL identify the configuration that was checked.
Provider string
Model string
BaseURL string
// Status is "ok" or "warning".
Status string
// Warning carries the error text when Status is "warning".
Warning string
// Probed is true only when a probe embedding request succeeded.
Probed bool
}
// Option adjusts the optional settings applied while resolving a provider
// configuration; see WithHTTPClient and WithRequestTimeout.
type Option func(*providerOptions)
// providerOptions collects the values set through Option functions.
type providerOptions struct {
// httpClient, when non-nil, is used instead of a newly created client.
httpClient *http.Client
// timeoutOverride, when positive and shorter than the resolved request
// timeout, replaces it.
timeoutOverride time.Duration
}
// providerSettings is the resolved configuration produced by
// resolveProviderConfig and consumed by the provider constructors.
type providerSettings struct {
// Name is the normalized provider name.
Name string
Model string
// BaseURL is stored with surrounding whitespace and trailing slashes removed.
BaseURL string
APIKey string
MaxInputChars int
Timeout time.Duration
HTTPClient *http.Client
}
// WithHTTPClient returns an Option that stores client as the HTTP client
// override in the provider options.
func WithHTTPClient(client *http.Client) Option {
	apply := func(target *providerOptions) {
		target.httpClient = client
	}
	return apply
}
// WithRequestTimeout returns an Option that stores timeout as the request
// timeout override in the provider options.
func WithRequestTimeout(timeout time.Duration) Option {
	apply := func(target *providerOptions) {
		target.timeoutOverride = timeout
	}
	return apply
}
// NewProvider resolves cfg, with API-key validation enabled and opts applied,
// and builds the matching Provider. A configuration that fails to resolve
// yields a nil Provider and the resolution error.
func NewProvider(cfg config.EmbeddingsConfig, opts ...Option) (Provider, error) {
	settings, err := resolveProviderConfig(cfg, true, opts...)
	if err == nil {
		return newProvider(settings)
	}
	return nil, err
}
// CheckProvider validates cfg and, when shouldProbe allows it, sends a single
// probe embedding request bounded by DefaultProbeTimeout.
//
// It never returns an error: every failure is reported as Status "warning"
// with the error text in Warning. If the configuration cannot be resolved, the
// result echoes the trimmed values from cfg; otherwise it carries the resolved
// provider name, model and base URL. Probed is set only after a successful
// probe request.
func CheckProvider(ctx context.Context, cfg config.EmbeddingsConfig) CheckResult {
	settings, err := resolveProviderConfig(cfg, true, WithRequestTimeout(DefaultProbeTimeout))
	if err != nil {
		var failed CheckResult
		failed.Provider = normalizedProviderName(cfg.Provider)
		failed.Model = strings.TrimSpace(cfg.Model)
		failed.BaseURL = strings.TrimSpace(cfg.BaseURL)
		failed.Status = "warning"
		failed.Warning = err.Error()
		return failed
	}

	result := CheckResult{
		Provider: settings.Name,
		Model:    settings.Model,
		BaseURL:  settings.BaseURL,
		Status:   "ok",
	}
	// warn downgrades the resolved result to a warning carrying cause's text.
	warn := func(cause error) CheckResult {
		result.Status = "warning"
		result.Warning = cause.Error()
		return result
	}

	if !shouldProbe(settings) {
		return result
	}
	provider, err := newProvider(settings)
	if err != nil {
		return warn(err)
	}

	probeCtx, cancel := context.WithTimeout(ctx, DefaultProbeTimeout)
	defer cancel()
	_, err = provider.Embed(probeCtx, []string{"discrawl probe"})
	if err != nil {
		return warn(err)
	}
	result.Probed = true
	return result
}
// resolveProviderConfig normalizes cfg and opts into validated providerSettings.
//
// Resolution rules:
//   - an empty provider name falls back to ProviderOpenAI;
//   - an empty model falls back to defaultModel for the provider;
//   - an empty base_url falls back to the provider default, except for
//     ProviderOpenAICompatible, which has none and is rejected;
//   - request_timeout, when set, must parse to a positive duration
//     (surrounding whitespace is ignored); a positive WithRequestTimeout
//     override can only shorten it;
//   - a non-positive MaxInputChars falls back to DefaultMaxInputChars.
//
// validateAPIKey is forwarded to resolveAPIKey. An error is returned for a
// missing base_url, an invalid timeout, an unsupported provider, a missing
// API key, or a base_url that url.ParseRequestURI rejects. On error the
// returned settings are the zero value.
func resolveProviderConfig(cfg config.EmbeddingsConfig, validateAPIKey bool, opts ...Option) (providerSettings, error) {
	var options providerOptions
	for _, opt := range opts {
		// Tolerate nil options rather than panicking on a nil func call.
		if opt != nil {
			opt(&options)
		}
	}

	name := normalizedProviderName(cfg.Provider)
	if name == "" {
		name = ProviderOpenAI
	}
	model := strings.TrimSpace(cfg.Model)
	if model == "" {
		model = defaultModel(name)
	}

	baseURL := strings.TrimRight(strings.TrimSpace(cfg.BaseURL), "/")
	if baseURL == "" {
		switch name {
		case ProviderOpenAI:
			baseURL = DefaultOpenAIBaseURL
		case ProviderOllama:
			baseURL = DefaultOllamaBaseURL
		case ProviderLlamaCpp:
			baseURL = DefaultLlamaCppBaseURL
		case ProviderOpenAICompatible:
			return providerSettings{}, fmt.Errorf("embedding provider %q requires base_url", name)
		}
	}

	timeout := DefaultRequestTimeout
	// Parse the trimmed value: the emptiness check already ignores whitespace,
	// so a padded value such as " 5s " must not fail in ParseDuration.
	if rawTimeout := strings.TrimSpace(cfg.RequestTimeout); rawTimeout != "" {
		parsed, err := time.ParseDuration(rawTimeout)
		if err != nil {
			return providerSettings{}, fmt.Errorf("parse embeddings request_timeout: %w", err)
		}
		if parsed <= 0 {
			return providerSettings{}, errors.New("embeddings request_timeout must be positive")
		}
		timeout = parsed
	}
	// An override may only tighten the timeout, never extend it.
	if options.timeoutOverride > 0 && options.timeoutOverride < timeout {
		timeout = options.timeoutOverride
	}

	maxInputChars := cfg.MaxInputChars
	if maxInputChars <= 0 {
		maxInputChars = DefaultMaxInputChars
	}

	// Reject unknown providers before looking at the API key so the more
	// fundamental problem is the one reported.
	switch name {
	case ProviderOpenAI, ProviderOllama, ProviderLlamaCpp, ProviderOpenAICompatible:
	default:
		return providerSettings{}, fmt.Errorf("unsupported embedding provider %q", name)
	}

	apiKey, err := resolveAPIKey(name, cfg.APIKeyEnv, validateAPIKey)
	if err != nil {
		return providerSettings{}, err
	}

	// A caller-supplied client is used as-is; the resolved timeout is only
	// applied to the client constructed here.
	client := options.httpClient
	if client == nil {
		client = &http.Client{Timeout: timeout}
	}

	if _, err := url.ParseRequestURI(baseURL); err != nil {
		return providerSettings{}, fmt.Errorf("invalid embeddings base_url %q: %w", baseURL, err)
	}

	return providerSettings{
		Name:          name,
		Model:         model,
		BaseURL:       baseURL,
		APIKey:        apiKey,
		MaxInputChars: maxInputChars,
		Timeout:       timeout,
		HTTPClient:    client,
	}, nil
}
// newProvider constructs the Provider implementation for settings.Name.
// Ollama has its own implementation; OpenAI, llama.cpp and generic
// OpenAI-compatible endpoints share one. Any other name is an error.
func newProvider(settings providerSettings) (Provider, error) {
	if settings.Name == ProviderOllama {
		return newOllamaProvider(settings), nil
	}
	switch settings.Name {
	case ProviderOpenAI, ProviderLlamaCpp, ProviderOpenAICompatible:
		return newOpenAICompatibleProvider(settings), nil
	}
	return nil, fmt.Errorf("unsupported embedding provider %q", settings.Name)
}
// resolveAPIKey reads the API key for provider from the environment.
//
// apiKeyEnv names the environment variable to read. When it is blank,
// ProviderOpenAI falls back to OPENAI_API_KEY and every other provider gets
// an empty key with no error. A blank variable value is an error when the
// provider is ProviderOpenAI or when validate is true; otherwise the key is
// simply empty. The returned key has surrounding whitespace removed.
func resolveAPIKey(provider, apiKeyEnv string, validate bool) (string, error) {
	mandatory := provider == ProviderOpenAI
	variable := strings.TrimSpace(apiKeyEnv)
	if variable == "" {
		if !mandatory {
			return "", nil
		}
		variable = "OPENAI_API_KEY"
	}
	if key := strings.TrimSpace(os.Getenv(variable)); key != "" {
		return key, nil
	}
	if !mandatory && !validate {
		return "", nil
	}
	return "", fmt.Errorf("embedding provider %q requires API key env %s", provider, variable)
}
// normalizedProviderName returns provider with surrounding whitespace removed
// and all letters lower-cased.
func normalizedProviderName(provider string) string {
	trimmed := strings.TrimSpace(provider)
	return strings.ToLower(trimmed)
}
// defaultModel returns the model used when the configuration names none:
// DefaultLocalEmbeddingModel for Ollama and llama.cpp, DefaultOpenAIModel for
// every other provider.
func defaultModel(provider string) string {
	if provider == ProviderOllama || provider == ProviderLlamaCpp {
		return DefaultLocalEmbeddingModel
	}
	return DefaultOpenAIModel
}
// shouldProbe reports whether CheckProvider may send a probe request for
// settings: always for Ollama and llama.cpp, for OpenAI-compatible endpoints
// only when the base URL is a loopback address, and never otherwise.
func shouldProbe(settings providerSettings) bool {
	name := settings.Name
	if name == ProviderOllama || name == ProviderLlamaCpp {
		return true
	}
	return name == ProviderOpenAICompatible && isLoopbackBaseURL(settings.BaseURL)
}
// isLoopbackBaseURL reports whether rawURL points at the local machine: its
// host is "localhost" (compared case-insensitively, since host names are not
// case-sensitive and url.Parse does not lower-case them) or a loopback IP
// literal such as 127.0.0.1 or ::1. URLs that fail to parse, or that have no
// host, are not loopback.
func isLoopbackBaseURL(rawURL string) bool {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	host := parsed.Hostname()
	if strings.EqualFold(host, "localhost") {
		return true
	}
	// net.ParseIP returns nil for anything that is not an IP literal.
	ip := net.ParseIP(host)
	return ip != nil && ip.IsLoopback()
}
// trimInputs returns a new slice in which every input is cut to at most
// maxChars runes (not bytes). A non-positive maxChars falls back to
// DefaultMaxInputChars. The input slice itself is not modified.
func trimInputs(inputs []string, maxChars int) []string {
	limit := maxChars
	if limit <= 0 {
		limit = DefaultMaxInputChars
	}
	trimmed := make([]string, 0, len(inputs))
	for _, text := range inputs {
		chars := []rune(text)
		if len(chars) > limit {
			chars = chars[:limit]
		}
		trimmed = append(trimmed, string(chars))
	}
	return trimmed
}
// inferDimensions returns the common length of all vectors. It returns 0 with
// no error when vectors is empty, and an error when any vector is empty or
// when a vector's length differs from the first one.
func inferDimensions(vectors [][]float32) (int, error) {
	width := 0
	for i := range vectors {
		size := len(vectors[i])
		switch {
		case size == 0:
			return 0, errors.New("embedding response contained an empty vector")
		case width == 0:
			// The first vector fixes the expected width.
			width = size
		case size != width:
			return 0, fmt.Errorf("embedding response dimensions mismatch: got %d want %d", size, width)
		}
	}
	return width, nil
}

View File

@ -1,319 +0,0 @@
package embed
import (
"context"
"encoding/json"
"net/http"
"net/http/httptest"
"testing"
"time"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/config"
)
// TestOllamaProviderEmbeds checks that the Ollama provider POSTs the model and
// the length-limited inputs to /api/embed and returns the server's vectors.
func TestOllamaProviderEmbeds(t *testing.T) {
	t.Parallel()
	handler := func(w http.ResponseWriter, r *http.Request) {
		assert.Equal(t, "/api/embed", r.URL.Path)
		assert.Equal(t, http.MethodPost, r.Method)
		var payload ollamaEmbedRequest
		assert.NoError(t, json.NewDecoder(r.Body).Decode(&payload))
		assert.Equal(t, "nomic-embed-text", payload.Model)
		// MaxInputChars is 4, so "abcdef" must arrive as "abcd".
		assert.Equal(t, []string{"abcd", "xy"}, payload.Input)
		_, _ = w.Write([]byte(`{"model":"nomic-embed-text","embeddings":[[1,2,3],[4,5,6]]}`))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	cfg := config.EmbeddingsConfig{
		Provider:       ProviderOllama,
		Model:          "nomic-embed-text",
		BaseURL:        srv.URL,
		MaxInputChars:  4,
		RequestTimeout: "5s",
	}
	provider, err := NewProvider(cfg)
	require.NoError(t, err)

	got, err := provider.Embed(context.Background(), []string{"abcdef", "xy"})
	require.NoError(t, err)
	require.Equal(t, "nomic-embed-text", got.Model)
	require.Equal(t, 3, got.Dimensions)
	require.Equal(t, [][]float32{{1, 2, 3}, {4, 5, 6}}, got.Vectors)
}
// TestOpenAICompatibleProviderEmbedsAndUsesAuth checks that the
// OpenAI-compatible provider calls /embeddings with a bearer token taken from
// the configured environment variable, and that vectors returned out of index
// order come back ordered by index.
func TestOpenAICompatibleProviderEmbedsAndUsesAuth(t *testing.T) {
	handler := func(w http.ResponseWriter, r *http.Request) {
		assert.Equal(t, "/embeddings", r.URL.Path)
		assert.Equal(t, "Bearer secret", r.Header.Get("Authorization"))
		var payload openAIEmbeddingRequest
		assert.NoError(t, json.NewDecoder(r.Body).Decode(&payload))
		assert.Equal(t, "local-model", payload.Model)
		assert.Equal(t, []string{"one", "two"}, payload.Input)
		// Index 1 is listed before index 0 on purpose.
		_, _ = w.Write([]byte(`{
"model":"local-model",
"data":[
{"index":1,"embedding":[3,4]},
{"index":0,"embedding":[1,2]}
]
}`))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	t.Setenv("DISCRAWL_EMBED_KEY", "secret")
	cfg := config.EmbeddingsConfig{
		Provider:       ProviderOpenAICompatible,
		Model:          "local-model",
		BaseURL:        srv.URL,
		APIKeyEnv:      "DISCRAWL_EMBED_KEY",
		RequestTimeout: "5s",
	}
	provider, err := NewProvider(cfg)
	require.NoError(t, err)

	got, err := provider.Embed(context.Background(), []string{"one", "two"})
	require.NoError(t, err)
	require.Equal(t, "local-model", got.Model)
	require.Equal(t, 2, got.Dimensions)
	require.Equal(t, [][]float32{{1, 2}, {3, 4}}, got.Vectors)
}
func TestProviderFactoryDefaultsAndValidation(t *testing.T) {
t.Setenv("OPENAI_API_KEY", "openai-secret")
openAI, err := resolveProviderConfig(config.EmbeddingsConfig{
Provider: ProviderOpenAI,
RequestTimeout: "5s",
}, true)
require.NoError(t, err)
require.Equal(t, DefaultOpenAIBaseURL, openAI.BaseURL)
require.Equal(t, DefaultOpenAIModel, openAI.Model)
require.Equal(t, "openai-secret", openAI.APIKey)
ollama, err := resolveProviderConfig(config.EmbeddingsConfig{
Provider: ProviderOllama,
RequestTimeout: "5s",
}, true)
require.NoError(t, err)
require.Equal(t, DefaultOllamaBaseURL, ollama.BaseURL)
require.Equal(t, DefaultLocalEmbeddingModel, ollama.Model)
llamaCpp, err := resolveProviderConfig(config.EmbeddingsConfig{
Provider: ProviderLlamaCpp,
RequestTimeout: "5s",
}, true)
require.NoError(t, err)
require.Equal(t, DefaultLlamaCppBaseURL, llamaCpp.BaseURL)
_, err = resolveProviderConfig(config.EmbeddingsConfig{
Provider: ProviderOpenAICompatible,
RequestTimeout: "5s",
}, true)
require.ErrorContains(t, err, "requires base_url")
}
func TestProviderFactoryRequiresOpenAIAPIKey(t *testing.T) {
t.Setenv("OPENAI_API_KEY", "")
_, err := NewProvider(config.EmbeddingsConfig{
Provider: ProviderOpenAI,
RequestTimeout: "5s",
})
require.ErrorContains(t, err, "requires API key env OPENAI_API_KEY")
}
func TestProviderFactoryReportsUnsupportedProviderBeforeAPIKey(t *testing.T) {
t.Setenv("MISSING_EMBED_KEY", "")
_, err := NewProvider(config.EmbeddingsConfig{
Provider: "bogus",
APIKeyEnv: "MISSING_EMBED_KEY",
RequestTimeout: "5s",
})
require.ErrorContains(t, err, "unsupported embedding provider \"bogus\"")
}
// TestCheckProviderProbesLocalProvider checks that CheckProvider sends a probe
// to an Ollama endpoint and reports "ok" with Probed set when it succeeds.
func TestCheckProviderProbesLocalProvider(t *testing.T) {
	t.Parallel()
	handler := func(w http.ResponseWriter, r *http.Request) {
		assert.Equal(t, "/api/embed", r.URL.Path)
		_, _ = w.Write([]byte(`{"model":"nomic-embed-text","embeddings":[[1,2]]}`))
	}
	srv := httptest.NewServer(http.HandlerFunc(handler))
	defer srv.Close()

	cfg := config.EmbeddingsConfig{
		Provider:       ProviderOllama,
		Model:          "nomic-embed-text",
		BaseURL:        srv.URL,
		RequestTimeout: "5s",
	}
	report := CheckProvider(context.Background(), cfg)
	require.Equal(t, "ok", report.Status)
	require.True(t, report.Probed)
	require.Empty(t, report.Warning)
	require.Equal(t, srv.URL, report.BaseURL)
}
func TestCheckProviderWarnsOnLocalProbeFailure(t *testing.T) {
t.Parallel()
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "not ready", http.StatusServiceUnavailable)
}))
defer server.Close()
result := CheckProvider(context.Background(), config.EmbeddingsConfig{
Provider: ProviderOllama,
Model: "nomic-embed-text",
BaseURL: server.URL,
RequestTimeout: "5s",
})
require.Equal(t, "warning", result.Status)
require.Contains(t, result.Warning, "HTTP 503")
require.False(t, result.Probed)
}
func TestProviderExposesRateLimitErrors(t *testing.T) {
t.Parallel()
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
http.Error(w, "rate limited", http.StatusTooManyRequests)
}))
defer server.Close()
provider, err := NewProvider(config.EmbeddingsConfig{
Provider: ProviderOpenAICompatible,
Model: "local-model",
BaseURL: server.URL,
RequestTimeout: "5s",
})
require.NoError(t, err)
_, err = provider.Embed(context.Background(), []string{"one"})
require.ErrorContains(t, err, "HTTP 429")
require.True(t, IsRateLimitError(err))
}
func TestProviderRejectsInvalidResponses(t *testing.T) {
t.Parallel()
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
_, _ = w.Write([]byte(`{"data":[{"index":0,"embedding":[1]},{"index":1,"embedding":[2,3]}]}`))
}))
defer server.Close()
provider, err := NewProvider(config.EmbeddingsConfig{
Provider: ProviderOpenAICompatible,
Model: "local-model",
BaseURL: server.URL,
RequestTimeout: "5s",
})
require.NoError(t, err)
_, err = provider.Embed(context.Background(), []string{"one", "two"})
require.ErrorContains(t, err, "dimensions mismatch")
}
// TestEmbeddingProvidersHandleEmptyInputsAndIndexErrors covers two things:
// both provider implementations accept a nil input list and still report the
// configured model, and the OpenAI-compatible provider rejects responses with
// a wrong vector count, an out-of-range index, or a duplicated index.
func TestEmbeddingProvidersHandleEmptyInputsAndIndexErrors(t *testing.T) {
	t.Parallel()
	base := providerSettings{
		Name:          ProviderOllama,
		Model:         "model",
		BaseURL:       "http://127.0.0.1:1",
		MaxInputChars: 10,
		HTTPClient:    http.DefaultClient,
	}
	// NOTE(review): presumably no request is sent for empty input, since the
	// base URL targets port 1 — confirm against the provider implementations.
	emptyBatch, err := newOllamaProvider(base).Embed(context.Background(), nil)
	require.NoError(t, err)
	require.Equal(t, "model", emptyBatch.Model)

	base.Name = ProviderOpenAICompatible
	emptyBatch, err = newOpenAICompatibleProvider(base).Embed(context.Background(), nil)
	require.NoError(t, err)
	require.Equal(t, "model", emptyBatch.Model)

	cases := []struct {
		name   string
		body   string
		inputs []string
		want   string
	}{
		{name: "count", body: `{"data":[]}`, inputs: []string{"one"}, want: "returned 0 vectors for 1 inputs"},
		{name: "range", body: `{"data":[{"index":2,"embedding":[1]}]}`, inputs: []string{"one"}, want: "index 2 out of range"},
		{name: "duplicate", body: `{"data":[{"index":0,"embedding":[1]},{"index":0,"embedding":[2]}]}`, inputs: []string{"one", "two"}, want: "duplicated index 0"},
	}
	for i := range cases {
		tc := cases[i]
		t.Run(tc.name, func(t *testing.T) {
			canned := func(w http.ResponseWriter, r *http.Request) {
				_, _ = w.Write([]byte(tc.body))
			}
			srv := httptest.NewServer(http.HandlerFunc(canned))
			defer srv.Close()

			cfg := config.EmbeddingsConfig{
				Provider:       ProviderOpenAICompatible,
				Model:          "model",
				BaseURL:        srv.URL,
				RequestTimeout: "5s",
			}
			provider, err := NewProvider(cfg)
			require.NoError(t, err)

			_, err = provider.Embed(context.Background(), tc.inputs)
			require.ErrorContains(t, err, tc.want)
		})
	}
}
func TestProviderOptionsAndProbeDecisions(t *testing.T) {
t.Parallel()
client := &http.Client{Timeout: time.Second}
settings, err := resolveProviderConfig(config.EmbeddingsConfig{
Provider: ProviderOllama,
BaseURL: "http://127.0.0.1:11434/",
RequestTimeout: "30s",
}, true, WithHTTPClient(client), WithRequestTimeout(50*time.Millisecond))
require.NoError(t, err)
require.Same(t, client, settings.HTTPClient)
require.Equal(t, 50*time.Millisecond, settings.Timeout)
require.Equal(t, "http://127.0.0.1:11434", settings.BaseURL)
require.True(t, shouldProbe(settings))
require.True(t, isLoopbackBaseURL("http://localhost:8080/v1"))
require.True(t, isLoopbackBaseURL("http://[::1]:8080/v1"))
require.False(t, isLoopbackBaseURL("https://api.example.com/v1"))
require.False(t, isLoopbackBaseURL("://bad"))
require.False(t, shouldProbe(providerSettings{Name: ProviderOpenAI}))
require.True(t, shouldProbe(providerSettings{Name: ProviderOpenAICompatible, BaseURL: "http://localhost:8080/v1"}))
require.False(t, shouldProbe(providerSettings{Name: ProviderOpenAICompatible, BaseURL: "https://api.example.com/v1"}))
}
func TestCheckProviderSkipsRemoteCompatibleProbe(t *testing.T) {
t.Parallel()
result := CheckProvider(context.Background(), config.EmbeddingsConfig{
Provider: ProviderOpenAICompatible,
Model: "remote-model",
BaseURL: "https://api.example.com/v1",
RequestTimeout: "5s",
})
require.Equal(t, "ok", result.Status)
require.False(t, result.Probed)
require.Empty(t, result.Warning)
}

View File

@ -7,7 +7,7 @@ import (
"strings"
"time"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
// DigestOptions controls how a Digest is built.

View File

@ -8,7 +8,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
func TestBuildDigest(t *testing.T) {

View File

@ -8,7 +8,7 @@ import (
"strings"
"time"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
// QuietOptions controls how a Quiet report is built.

View File

@ -8,7 +8,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
func TestBuildQuiet(t *testing.T) {

View File

@ -14,7 +14,7 @@ import (
"text/template"
"time"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
const (

View File

@ -10,7 +10,7 @@ import (
"github.com/stretchr/testify/require"
"github.com/steipete/discrawl/internal/store"
"github.com/openclaw/discrawl/internal/store"
)
func TestBuildRenderAndUpdateReadme(t *testing.T) {

Some files were not shown because too many files have changed in this diff Show More