Compare commits
No commits in common. "main" and "v0.6.3" have entirely different histories.
@ -1,59 +1,36 @@
|
||||
---
|
||||
name: discrawl
|
||||
description: Use for local Discord archive search, sync freshness, DMs, channel summaries, desktop/API/git-share sources, TUI browsing, and Discrawl repo/release work.
|
||||
description: Use for local Discord archive search, sync freshness, DMs, channel summaries, and Discrawl repo/release work.
|
||||
---
|
||||
|
||||
# Discrawl
|
||||
|
||||
Use local Discord archive data first for Discord questions. Hit Discord APIs
|
||||
only when the archive is stale, missing the requested scope, or the user asks
|
||||
for current external context.
|
||||
Use local archive data first for Discord questions. Browse or hit live APIs only when the local archive is stale or the user asks for current external context.
|
||||
|
||||
## Sources
|
||||
|
||||
- DB: `~/.discrawl/discrawl.db`
|
||||
- Config: `~/.discrawl/config.toml`
|
||||
- Cache: `~/.discrawl/cache`
|
||||
- Logs: `~/.discrawl/logs`
|
||||
- Git share repo: `~/.discrawl/share`
|
||||
- Repo: `openclaw/discrawl`; use `~/GIT/_Perso/discrawl` only after verifying
|
||||
its remote targets `openclaw/discrawl`, otherwise use a fresh checkout
|
||||
- Preferred CLI: `discrawl`; fall back to `go run ./cmd/discrawl` from the repo if the installed binary is stale
|
||||
- Repo: `~/Projects/discrawl`
|
||||
- Preferred CLI: `discrawl`; fall back to the repo binary if the installed binary is stale
|
||||
|
||||
## Freshness
|
||||
|
||||
For recent/current questions, check freshness before analysis:
|
||||
|
||||
```bash
|
||||
discrawl status --json
|
||||
```
|
||||
|
||||
For precise freshness from the default database:
|
||||
|
||||
```bash
|
||||
sqlite3 ~/.discrawl/discrawl.db \
|
||||
"select coalesce(max(updated_at),'') from sync_state where scope like 'channel:%';"
|
||||
```
|
||||
|
||||
Routine diagnostics:
|
||||
Routine refresh:
|
||||
|
||||
```bash
|
||||
discrawl doctor
|
||||
```
|
||||
|
||||
Desktop-local refresh:
|
||||
|
||||
```bash
|
||||
discrawl sync --source wiretap
|
||||
```
|
||||
|
||||
Bot API latest refresh, when credentials are available:
|
||||
|
||||
```bash
|
||||
discrawl sync
|
||||
```
|
||||
|
||||
Use `--full` only for deliberate historical backfills:
|
||||
Historical/backfill refresh:
|
||||
|
||||
```bash
|
||||
discrawl sync --full
|
||||
@ -65,7 +42,7 @@ If SQLite reports busy/locked, check for stray `discrawl` processes before retry
|
||||
|
||||
1. Resolve scope: guild, channel, DM, author, keyword, date range.
|
||||
2. Check freshness for recent/current requests.
|
||||
3. Prefer CLI search/messages for slices; use read-only SQL for exact counts.
|
||||
3. Use CLI for normal reads; use SQL for precise counts/rankings.
|
||||
4. Report absolute date spans, counts, channel/DM names, and known gaps.
|
||||
|
||||
Common commands:
|
||||
@ -73,52 +50,26 @@ Common commands:
|
||||
```bash
|
||||
discrawl search "query"
|
||||
discrawl messages --channel '#maintainers' --days 7 --all
|
||||
discrawl dms --last 20
|
||||
discrawl tui --dm
|
||||
discrawl sql "select count(*) from messages;"
|
||||
discrawl --json sql "select count(*) from messages;"
|
||||
```
|
||||
|
||||
## SQL
|
||||
When the installed CLI lacks a new feature, build or run from `~/Projects/discrawl` before concluding the feature is missing.
|
||||
|
||||
Use `discrawl sql` for exact counts, joins, and ranking queries when normal
|
||||
CLI reads are too coarse. The command is read-only by default, accepts SQL as
|
||||
args or stdin, and supports `--json` for agent parsing.
|
||||
## Discord DMs
|
||||
|
||||
Useful examples:
|
||||
|
||||
```bash
|
||||
discrawl --json sql "select count(*) as messages from messages;"
|
||||
discrawl --json sql "select coalesce(nullif(c.name, ''), m.channel_id) as channel, count(*) as messages from messages m left join channels c on c.id = m.channel_id group by m.channel_id order by messages desc limit 20;"
|
||||
discrawl --json sql "select coalesce(nullif(mm.display_name, ''), nullif(mm.global_name, ''), nullif(mm.username, ''), m.author_id) as author, count(*) as messages from messages m left join members mm on mm.guild_id = m.guild_id and mm.user_id = m.author_id group by m.guild_id, m.author_id order by messages desc limit 20;"
|
||||
```
|
||||
|
||||
Never use `--unsafe --confirm` unless the user explicitly asks for a database
|
||||
mutation and the write has been reviewed.
|
||||
|
||||
When the installed CLI lacks a new feature, build or run from a verified
|
||||
`openclaw/discrawl` checkout before concluding the feature is missing.
|
||||
|
||||
## Discord Boundaries
|
||||
|
||||
Bot API sync requires configured Discord bot credentials; do not invent token
|
||||
availability. Desktop wiretap mode reads local Discord Desktop artifacts and
|
||||
must not extract credentials, use user tokens, call Discord as the user, or
|
||||
write to Discord application storage. Wiretap/Desktop cache DMs are local-only
|
||||
and must not be described as part of the published Git snapshot. Git-share
|
||||
snapshots must not include secrets or `@me` DM rows.
|
||||
Wiretap/Desktop cache DMs are local-only. Do not imply they are in the published Git snapshot. For missing recent DMs, refresh first; a stale archive is a common cause.
|
||||
|
||||
## Verification
|
||||
|
||||
For repo edits, prefer existing Go gates:
|
||||
|
||||
```bash
|
||||
GOWORK=off go test ./...
|
||||
go test ./...
|
||||
```
|
||||
|
||||
Then run a targeted CLI smoke test for the touched surface, for example:
|
||||
|
||||
```bash
|
||||
discrawl doctor
|
||||
discrawl status --json
|
||||
discrawl search "test" --limit 5
|
||||
```
|
||||
|
||||
@ -1,12 +0,0 @@
|
||||
root = true
|
||||
|
||||
[*]
|
||||
charset = utf-8
|
||||
end_of_line = lf
|
||||
insert_final_newline = true
|
||||
indent_style = tab
|
||||
indent_size = 4
|
||||
|
||||
[*.{md,yml,yaml,json,toml}]
|
||||
indent_style = space
|
||||
indent_size = 2
|
||||
6
.gitattributes
vendored
6
.gitattributes
vendored
@ -1,6 +0,0 @@
|
||||
* text=auto
|
||||
*.go text eol=lf
|
||||
*.md text eol=lf
|
||||
*.toml text eol=lf
|
||||
*.yml text eol=lf
|
||||
*.yaml text eol=lf
|
||||
12
.github/CODEOWNERS
vendored
12
.github/CODEOWNERS
vendored
@ -1,12 +0,0 @@
|
||||
# Protect ownership and automation rules.
|
||||
/.github/CODEOWNERS @openclaw/openclaw-secops
|
||||
/.github/dependabot.yml @openclaw/openclaw-secops
|
||||
/.github/workflows/ @openclaw/openclaw-secops
|
||||
|
||||
# Release, backup, and package integrity surfaces.
|
||||
/.goreleaser.yaml @openclaw/openclaw-secops
|
||||
/go.mod @openclaw/openclaw-secops
|
||||
/go.sum @openclaw/openclaw-secops
|
||||
/scripts/*backup* @openclaw/openclaw-secops
|
||||
/scripts/*release* @openclaw/openclaw-secops
|
||||
/scripts/*publish* @openclaw/openclaw-secops
|
||||
20
.github/workflows/ci.yml
vendored
20
.github/workflows/ci.yml
vendored
@ -30,13 +30,13 @@ jobs:
|
||||
- name: Lint
|
||||
uses: golangci/golangci-lint-action@v9.2.0
|
||||
with:
|
||||
version: v2.12.1
|
||||
version: v2.11.1
|
||||
|
||||
- name: Install analyzers
|
||||
run: |
|
||||
go install honnef.co/go/tools/cmd/staticcheck@v0.7.0
|
||||
go install mvdan.cc/gofumpt@v0.9.2
|
||||
go install github.com/securego/gosec/v2/cmd/gosec@v2.26.1
|
||||
go install github.com/securego/gosec/v2/cmd/gosec@v2.25.0
|
||||
|
||||
- name: Vet
|
||||
run: go vet ./...
|
||||
@ -91,19 +91,7 @@ jobs:
|
||||
}'
|
||||
|
||||
- name: Build
|
||||
run: go build -o bin/discrawl ./cmd/discrawl
|
||||
|
||||
- name: Smoke test CLI control surface
|
||||
run: |
|
||||
set -euo pipefail
|
||||
output="$(./bin/discrawl help)"
|
||||
printf '%s\n' "$output"
|
||||
printf '%s' "$output" | grep -q "metadata"
|
||||
printf '%s' "$output" | grep -q "tui"
|
||||
test -n "$(./bin/discrawl --version)"
|
||||
./bin/discrawl metadata --json | grep -q '"schema_version"'
|
||||
./bin/discrawl status --json | grep -q '"databases"'
|
||||
./bin/discrawl tui --json | grep -q '^\['
|
||||
run: go build ./cmd/discrawl
|
||||
|
||||
deps:
|
||||
runs-on: ubuntu-latest
|
||||
@ -148,7 +136,7 @@ jobs:
|
||||
cache: true
|
||||
|
||||
- name: Snapshot release build
|
||||
uses: goreleaser/goreleaser-action@v7.2.1
|
||||
uses: goreleaser/goreleaser-action@v7.1.0
|
||||
with:
|
||||
distribution: goreleaser
|
||||
version: "~> v2"
|
||||
|
||||
37
.github/workflows/codeql.yml
vendored
37
.github/workflows/codeql.yml
vendored
@ -1,37 +0,0 @@
|
||||
name: CodeQL
|
||||
|
||||
on:
|
||||
pull_request:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
schedule:
|
||||
- cron: "29 4 * * 1"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
actions: read
|
||||
contents: read
|
||||
security-events: write
|
||||
|
||||
jobs:
|
||||
analyze:
|
||||
name: analyze
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Setup Go
|
||||
uses: actions/setup-go@v6
|
||||
with:
|
||||
go-version-file: go.mod
|
||||
cache: true
|
||||
|
||||
- name: Initialize CodeQL
|
||||
uses: github/codeql-action/init@v4
|
||||
with:
|
||||
languages: go
|
||||
|
||||
- name: Perform CodeQL Analysis
|
||||
uses: github/codeql-action/analyze@v4
|
||||
52
.github/workflows/pages.yml
vendored
52
.github/workflows/pages.yml
vendored
@ -1,52 +0,0 @@
|
||||
name: Pages
|
||||
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
paths:
|
||||
- "docs/**"
|
||||
- "scripts/build-docs-site.mjs"
|
||||
- ".github/workflows/pages.yml"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions:
|
||||
contents: read
|
||||
pages: write
|
||||
id-token: write
|
||||
|
||||
concurrency:
|
||||
group: pages
|
||||
cancel-in-progress: false
|
||||
|
||||
jobs:
|
||||
deploy:
|
||||
name: Deploy docs
|
||||
runs-on: ubuntu-latest
|
||||
timeout-minutes: 10
|
||||
environment:
|
||||
name: github-pages
|
||||
url: ${{ steps.deployment.outputs.page_url }}
|
||||
steps:
|
||||
- name: Check out
|
||||
uses: actions/checkout@v6
|
||||
|
||||
- name: Set up Node
|
||||
uses: actions/setup-node@v6
|
||||
with:
|
||||
node-version: 24
|
||||
|
||||
- name: Build site
|
||||
run: node scripts/build-docs-site.mjs
|
||||
|
||||
- name: Configure Pages
|
||||
uses: actions/configure-pages@v6
|
||||
|
||||
- name: Upload artifact
|
||||
uses: actions/upload-pages-artifact@v5
|
||||
with:
|
||||
path: dist/docs-site
|
||||
|
||||
- name: Deploy
|
||||
id: deployment
|
||||
uses: actions/deploy-pages@v5
|
||||
7
.github/workflows/publish-discord-backup.yml
vendored
7
.github/workflows/publish-discord-backup.yml
vendored
@ -76,12 +76,7 @@ jobs:
|
||||
git clone "$BACKUP_REMOTE" "$BACKUP_REPO"
|
||||
go run ./cmd/discrawl --config "$CONFIG" init --db "$DB" --guild "$DISCRAWL_GUILD_ID"
|
||||
if [ -f "$BACKUP_REPO/manifest.json" ]; then
|
||||
if [ -s "$DB" ]; then
|
||||
echo "Restored Discord DB cache at $DB; skipping pre-sync snapshot import."
|
||||
else
|
||||
echo "Discord DB cache missing; importing latest published snapshot before latest-only sync."
|
||||
go run ./cmd/discrawl --config "$CONFIG" update --repo "$BACKUP_REPO" --remote "$BACKUP_REMOTE"
|
||||
fi
|
||||
go run ./cmd/discrawl --config "$CONFIG" update --repo "$BACKUP_REPO" --remote "$BACKUP_REMOTE"
|
||||
fi
|
||||
go run ./cmd/discrawl --config "$CONFIG" sync --guild "$DISCRAWL_GUILD_ID" --skip-members --latest-only
|
||||
git -C "$BACKUP_REPO" pull --ff-only origin main
|
||||
|
||||
61
.github/workflows/release.yml
vendored
61
.github/workflows/release.yml
vendored
@ -37,69 +37,10 @@ jobs:
|
||||
run: git checkout ${{ inputs.tag }}
|
||||
|
||||
- name: GoReleaser
|
||||
uses: goreleaser/goreleaser-action@v7.2.1
|
||||
uses: goreleaser/goreleaser-action@v7.1.0
|
||||
with:
|
||||
distribution: goreleaser
|
||||
version: "~> v2"
|
||||
args: release --clean --config /tmp/.goreleaser.yaml
|
||||
env:
|
||||
GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
update-homebrew-tap:
|
||||
runs-on: ubuntu-latest
|
||||
needs: goreleaser
|
||||
steps:
|
||||
- name: Resolve release tag
|
||||
run: |
|
||||
if [ "${{ github.event_name }}" = "workflow_dispatch" ]; then
|
||||
echo "RELEASE_TAG=${{ inputs.tag }}" >> "$GITHUB_ENV"
|
||||
else
|
||||
echo "RELEASE_TAG=${{ github.ref_name }}" >> "$GITHUB_ENV"
|
||||
fi
|
||||
|
||||
- name: Dispatch tap formula update
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.HOMEBREW_TAP_TOKEN }}
|
||||
run: |
|
||||
if [ -z "$GH_TOKEN" ]; then
|
||||
echo "::error::Set HOMEBREW_TAP_TOKEN with workflow access to steipete/homebrew-tap"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
request_id="discrawl-${RELEASE_TAG}-${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
|
||||
expected_title="Update discrawl for ${RELEASE_TAG} (${request_id})"
|
||||
|
||||
gh workflow run update-formula.yml \
|
||||
--repo steipete/homebrew-tap \
|
||||
--ref main \
|
||||
-f formula=discrawl \
|
||||
-f tag="$RELEASE_TAG" \
|
||||
-f repository=openclaw/discrawl \
|
||||
-f artifact_template="{formula}_{version}_{target}.tar.gz" \
|
||||
-f request_id="$request_id"
|
||||
|
||||
run_id=""
|
||||
for _ in {1..30}; do
|
||||
run_id=$(gh run list \
|
||||
--repo steipete/homebrew-tap \
|
||||
--workflow update-formula.yml \
|
||||
--branch main \
|
||||
--event workflow_dispatch \
|
||||
--limit 20 \
|
||||
--json databaseId,displayTitle \
|
||||
--jq ".[] | select(.displayTitle == \"$expected_title\") | .databaseId" | head -n1)
|
||||
if [ -n "$run_id" ]; then
|
||||
break
|
||||
fi
|
||||
sleep 5
|
||||
done
|
||||
|
||||
if [ -z "$run_id" ]; then
|
||||
echo "::error::Could not find tap workflow run with title: $expected_title"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
gh run watch "$run_id" \
|
||||
--repo steipete/homebrew-tap \
|
||||
--exit-status \
|
||||
--interval 10
|
||||
|
||||
63
.github/workflows/secret-scan.yml
vendored
63
.github/workflows/secret-scan.yml
vendored
@ -1,63 +0,0 @@
|
||||
name: "Security Gate: Secret Scanning"
|
||||
|
||||
on:
|
||||
push:
|
||||
branches: ["**"]
|
||||
pull_request:
|
||||
branches: [main, master]
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
trufflehog:
|
||||
name: Scan for Verified Secrets
|
||||
runs-on: ubuntu-latest
|
||||
permissions:
|
||||
contents: read
|
||||
steps:
|
||||
- name: Checkout code
|
||||
uses: actions/checkout@v6
|
||||
with:
|
||||
fetch-depth: 0
|
||||
|
||||
- name: Resolve scan range
|
||||
id: scan_range
|
||||
env:
|
||||
EVENT_NAME: ${{ github.event_name }}
|
||||
PR_BASE_SHA: ${{ github.event.pull_request.base.sha }}
|
||||
PR_HEAD_SHA: ${{ github.event.pull_request.head.sha }}
|
||||
PUSH_BASE_SHA: ${{ github.event.before }}
|
||||
PUSH_HEAD_SHA: ${{ github.sha }}
|
||||
DEFAULT_BRANCH: ${{ github.event.repository.default_branch }}
|
||||
run: |
|
||||
set -euo pipefail
|
||||
zero_sha="0000000000000000000000000000000000000000"
|
||||
|
||||
if [[ "$EVENT_NAME" == "pull_request" ]]; then
|
||||
base="$PR_BASE_SHA"
|
||||
head="$PR_HEAD_SHA"
|
||||
else
|
||||
base="$PUSH_BASE_SHA"
|
||||
head="$PUSH_HEAD_SHA"
|
||||
if [[ -z "$base" || "$base" == "$zero_sha" ]]; then
|
||||
base="origin/$DEFAULT_BRANCH"
|
||||
fi
|
||||
fi
|
||||
|
||||
echo "base=$base" >> "$GITHUB_OUTPUT"
|
||||
echo "head=$head" >> "$GITHUB_OUTPUT"
|
||||
|
||||
- name: TruffleHog OSS
|
||||
id: trufflehog
|
||||
uses: trufflesecurity/trufflehog@v3.95.2
|
||||
with:
|
||||
path: ./
|
||||
base: ${{ steps.scan_range.outputs.base }}
|
||||
head: ${{ steps.scan_range.outputs.head }}
|
||||
extra_args: --only-verified --debug
|
||||
|
||||
- name: Notify on failure
|
||||
if: steps.trufflehog.outcome == 'failure'
|
||||
run: |
|
||||
echo "::error::Verified secrets found. Rotate the credential before merging."
|
||||
exit 1
|
||||
86
.github/workflows/stale.yml
vendored
86
.github/workflows/stale.yml
vendored
@ -1,86 +0,0 @@
|
||||
name: Stale
|
||||
|
||||
on:
|
||||
schedule:
|
||||
- cron: "25 4 * * *"
|
||||
workflow_dispatch:
|
||||
|
||||
permissions: {}
|
||||
|
||||
jobs:
|
||||
stale:
|
||||
permissions:
|
||||
issues: write
|
||||
pull-requests: write
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Mark stale unassigned issues and pull requests
|
||||
uses: actions/stale@v10
|
||||
with:
|
||||
days-before-issue-stale: 14
|
||||
days-before-issue-close: 7
|
||||
days-before-pr-stale: 14
|
||||
days-before-pr-close: 7
|
||||
stale-issue-label: stale
|
||||
stale-pr-label: stale
|
||||
exempt-issue-labels: enhancement,maintainer,pinned,security,no-stale
|
||||
exempt-pr-labels: maintainer,no-stale
|
||||
operations-per-run: 1000
|
||||
ascending: true
|
||||
exempt-all-assignees: true
|
||||
remove-stale-when-updated: true
|
||||
stale-issue-message: |
|
||||
This issue has been automatically marked as stale due to inactivity.
|
||||
Please add updated discrawl details or it will be closed.
|
||||
stale-pr-message: |
|
||||
This pull request has been automatically marked as stale due to inactivity.
|
||||
Please update it or it will be closed.
|
||||
close-issue-message: |
|
||||
Closing due to inactivity.
|
||||
If this still affects discrawl, open a new issue with current reproduction details.
|
||||
close-issue-reason: not_planned
|
||||
close-pr-message: |
|
||||
Closing due to inactivity.
|
||||
If this PR should be revived, reopen it with current context and validation.
|
||||
|
||||
- name: Mark stale assigned issues
|
||||
uses: actions/stale@v10
|
||||
with:
|
||||
days-before-issue-stale: 30
|
||||
days-before-issue-close: 10
|
||||
days-before-pr-stale: -1
|
||||
days-before-pr-close: -1
|
||||
stale-issue-label: stale
|
||||
exempt-issue-labels: enhancement,maintainer,pinned,security,no-stale
|
||||
operations-per-run: 1000
|
||||
ascending: true
|
||||
include-only-assigned: true
|
||||
remove-stale-when-updated: true
|
||||
stale-issue-message: |
|
||||
This assigned issue has been automatically marked as stale after 30 days of inactivity.
|
||||
Please add an update or it will be closed.
|
||||
close-issue-message: |
|
||||
Closing due to inactivity.
|
||||
If this still affects discrawl, reopen or file a new issue with current evidence.
|
||||
close-issue-reason: not_planned
|
||||
|
||||
- name: Mark stale assigned pull requests
|
||||
uses: actions/stale@v10
|
||||
with:
|
||||
days-before-issue-stale: -1
|
||||
days-before-issue-close: -1
|
||||
days-before-pr-stale: 27
|
||||
days-before-pr-close: 7
|
||||
stale-pr-label: stale
|
||||
exempt-pr-labels: maintainer,no-stale
|
||||
operations-per-run: 1000
|
||||
ascending: true
|
||||
include-only-assigned: true
|
||||
ignore-pr-updates: true
|
||||
remove-stale-when-updated: true
|
||||
stale-pr-message: |
|
||||
This assigned pull request has been automatically marked as stale after being open for 27 days.
|
||||
Please add an update or it will be closed.
|
||||
close-pr-message: |
|
||||
Closing due to inactivity.
|
||||
If this PR should be revived, reopen it with current context and validation.
|
||||
@ -12,7 +12,7 @@ builds:
|
||||
env:
|
||||
- CGO_ENABLED=0
|
||||
ldflags:
|
||||
- -s -w -X github.com/openclaw/discrawl/internal/cli.version={{ .Version }}
|
||||
- -s -w -X github.com/steipete/discrawl/internal/cli.version={{ .Version }}
|
||||
targets:
|
||||
- darwin_amd64
|
||||
- darwin_arm64
|
||||
|
||||
61
CHANGELOG.md
61
CHANGELOG.md
@ -1,65 +1,6 @@
|
||||
# Changelog
|
||||
|
||||
## 0.7.0 - 2026-05-08
|
||||
|
||||
### Changes
|
||||
|
||||
- Added `discrawl tui`, a terminal archive browser for stored guild messages and local `@me` wiretap DMs using the shared crawlkit pane browser.
|
||||
- Added crawlkit-backed `metadata --json`, `status --json`, and `doctor --json` control surfaces for launchers, automation, and CI checks.
|
||||
- Published the generated documentation site at `discrawl.sh`, including command pages, install/setup docs, configuration, security notes, guides, a contact page, and social cards.
|
||||
- Moved the Go module and release metadata to `github.com/openclaw/discrawl`.
|
||||
|
||||
### Fixes
|
||||
|
||||
- Kept documented command-local search flags working after the query, such as `discrawl search "term" --limit 5`. Thanks @PrinceOfEgypt.
|
||||
- Made the terminal browser more useful and accurate: default guild scoping, newest-message startup, compact panes, selected-message detail panes, count-header sorting, local/remote status labels, right-click actions, Discord message URLs, row labels, direct-message pane labels, mention rendering, inline mention resolution, attachment details, and reply-context hydration without broad thread scans.
|
||||
- Kept read-only commands such as `search`, `messages`, and safe `sql` usable while `tail` or another writer holds the sync lock. Thanks @PrinceOfEgypt.
|
||||
- Kept `tui --help`, status, and terminal-browser reads safe for fresh or missing local databases without triggering Git snapshot auto-update.
|
||||
- Kept local-only snapshot rows filtered during shared archive imports and forwarded snapshot import progress through the crawlkit import path.
|
||||
- Made stale Git snapshot imports plan shard deltas from crawlkit file fingerprints or Git object identity, so routine shared-archive refreshes import changed message tail shards instead of rebuilding every table and FTS index.
|
||||
- Included progress percentages in message-sync logs.
|
||||
- Fixed GoReleaser version stamping after the module path move.
|
||||
|
||||
### Documentation
|
||||
|
||||
- Documented the crawlkit-backed config/status/control, snapshot, mirror, sync-state, output, and shared TUI surfaces now used on `main`.
|
||||
- Clarified that Discord bot sync, desktop wiretap parsing, DM privacy filters, schema ownership, FTS/ranking, embeddings, and analytics remain app-owned.
|
||||
- Aligned terminal-browser docs with the gitcrawl-style shared TUI model: channel/person/thread groups, message rows, detail/thread panes, sorting, mouse selection, right-click actions, and local/remote status chrome.
|
||||
- Refreshed the repo-local `discrawl` agent skill for local Discord archive, freshness, query, boundary, TUI, verification, and read-only SQL workflows.
|
||||
|
||||
### Maintenance
|
||||
|
||||
- Migrated runtime paths, SQLite opening, archive mirror/export/import helpers, output/status wiring, and TUI plumbing onto the shared `crawlkit` infrastructure.
|
||||
- Moved reusable embedding providers and vector helpers onto `crawlkit` while keeping Discrawl-owned storage, FTS, queueing, and privacy filters local.
|
||||
- Updated crawlkit through `v0.4.1`, switched imports to `github.com/openclaw/crawlkit`, and added CI smoke coverage for the crawlkit control surface and merge behavior.
|
||||
- Added CodeQL, verified secret scanning, protected automation owners, stale issue automation, `.editorconfig`, and `.gitattributes`.
|
||||
- Added release workflow automation that dispatches the Homebrew tap formula update after GoReleaser publishes a tag.
|
||||
|
||||
## 0.6.6 - 2026-05-05
|
||||
|
||||
### Fixes
|
||||
|
||||
- `wiretap` now uses a fast default path for Discord Chromium cache imports: it scans cheap context files plus route-bearing HTTP cache entries, checkpoints file progress in batches, and leaves exhaustive historical cache archaeology behind `--full-cache` / `desktop.full_cache`.
|
||||
|
||||
## 0.6.5 - 2026-05-03
|
||||
|
||||
### Fixes
|
||||
|
||||
- Scheduled Discord backup publishing now skips redundant pre-sync snapshot imports when the workflow DB cache is warm, keeping fresh Git snapshots from getting delayed by a full archive reimport.
|
||||
- `discrawl sync` now keeps Git snapshot refreshes explicit by default; use `--update=auto` or `--update=force` when you want a sync run to pull/import the shared snapshot before live Discord or desktop-cache deltas.
|
||||
- Snapshot imports now emit phase/table/file progress and keep the sync lock file updated with the active phase, making long update/import runs diagnosable instead of looking hung.
|
||||
- Recent-message scans are backed by a plain `messages(created_at, id)` index so archive freshness and short-window analysis queries avoid full-table scans.
|
||||
|
||||
## 0.6.4 - 2026-05-03
|
||||
|
||||
### Fixes
|
||||
|
||||
- `discrawl` now handles SIGINT/SIGTERM by canceling active sync/import contexts so large SQLite and FTS writes can roll back and close cleanly instead of being terminated mid-transaction.
|
||||
|
||||
### Maintenance
|
||||
|
||||
- Refreshed dependency and CI tooling pins, including GoReleaser, `go-toml`, golangci-lint, and gosec.
|
||||
- Tightened CI compatibility with the latest linters and made signal-cancellation and sync fixture tests deterministic under the race detector.
|
||||
All notable changes to `discrawl` will be documented in this file.
|
||||
|
||||
## 0.6.3 - 2026-05-01
|
||||
|
||||
|
||||
49
README.md
49
README.md
@ -22,9 +22,6 @@ Wiretap DMs stay local and are never exported to the Git-backed snapshot mirror.
|
||||
- tails Gateway events for live updates, with periodic repair syncs
|
||||
- imports classifiable Discord Desktop cache messages with `wiretap`, including proven DMs under `@me`
|
||||
- publishes and imports private Git-backed archive snapshots for org-wide read access
|
||||
- browses stored messages and local DMs in a terminal archive UI
|
||||
- exposes `metadata --json`, `status --json`, and `doctor --json` for local
|
||||
launchers, automation, and CI
|
||||
- supports Git-only read mode with no Discord credentials on reader machines
|
||||
- generates backup README activity reports, with optional AI-written field notes
|
||||
- exposes read-only SQL for ad hoc analysis
|
||||
@ -117,7 +114,7 @@ discrawl --version
|
||||
Build from source:
|
||||
|
||||
```bash
|
||||
git clone https://github.com/openclaw/discrawl.git
|
||||
git clone https://github.com/steipete/discrawl.git
|
||||
cd discrawl
|
||||
go build -o bin/discrawl ./cmd/discrawl
|
||||
./bin/discrawl --version
|
||||
@ -162,25 +159,6 @@ discrawl messages --channel general --hours 24
|
||||
|
||||
## Commands
|
||||
|
||||
### `tui`
|
||||
|
||||
Opens the local terminal archive browser for stored messages.
|
||||
|
||||
```bash
|
||||
discrawl tui
|
||||
discrawl tui --guild 123456789012345678 --channel general
|
||||
discrawl tui --dm
|
||||
discrawl --json tui --limit 50
|
||||
```
|
||||
|
||||
The terminal browser uses the shared crawlkit explorer. The left pane groups
|
||||
channels, people, or threads; the middle pane lists messages; the right pane
|
||||
shows the selected message, surrounding conversation, and thread detail. Mouse
|
||||
selection, right-click actions, sortable headers, and the local/remote footer
|
||||
follow the same interaction model as `gitcrawl tui`. See
|
||||
[`docs/commands/tui.md`](docs/commands/tui.md) for flags and read-only/DM scope
|
||||
notes.
|
||||
|
||||
### `init`
|
||||
|
||||
Creates the local config and discovers accessible guilds.
|
||||
@ -195,20 +173,15 @@ discrawl init --db ~/data/discrawl.db
|
||||
|
||||
Refreshes SQLite from one or both archive sources.
|
||||
|
||||
By default, `sync` runs both live/local sources and does not import the Git snapshot first:
|
||||
By default, `sync` runs both sources:
|
||||
|
||||
- Discord bot-token sync for bot-visible guild data
|
||||
- local Discord Desktop cache import for classifiable cached messages and proven DMs
|
||||
|
||||
Use `discrawl update` when you want to pull/import the shared Git snapshot. If you intentionally want a sync run to import the snapshot before live deltas, pass `--update=auto` to import only when stale or `--update=force` to pull/import before syncing. `--no-update` is accepted as an explicit no-op alias for the default.
|
||||
|
||||
Run one explicit `--full` pass when you want a complete historical guild archive. Use plain `sync` afterward for frequent latest-message and desktop-cache refreshes.
|
||||
|
||||
```bash
|
||||
discrawl sync
|
||||
discrawl sync --update=auto
|
||||
discrawl sync --update=force
|
||||
discrawl sync --no-update
|
||||
discrawl sync --full
|
||||
discrawl sync --full --all
|
||||
discrawl sync --guild 123456789012345678
|
||||
@ -234,8 +207,7 @@ Bot sync modes:
|
||||
|
||||
| Command | Use when | Behavior |
|
||||
| --- | --- | --- |
|
||||
| `discrawl sync` | routine refresh | skips member refreshes, checks live top-level channels plus active threads, and only fetches new messages for channels with a stored latest cursor |
|
||||
| `discrawl sync --update=auto` | hybrid Git/live refresh | imports a stale Git snapshot first, then runs the routine live refresh |
|
||||
| `discrawl sync` | routine refresh | imports any stale Git snapshot first, skips member refreshes, checks live top-level channels plus active threads, and only fetches new messages for channels with a stored latest cursor |
|
||||
| `discrawl sync --all-channels` | repair pass | broad incremental sweep across every stored channel/thread, including archived threads |
|
||||
| `discrawl sync --full` | historical backfill | crawls older history until channels are complete; can take a long time on large servers |
|
||||
|
||||
@ -246,10 +218,9 @@ Bot sync modes:
|
||||
`--latest-only` is still accepted for explicit latest-only runs; it is now the default for untargeted `sync`. Use `--all-channels` to opt out of the fast default without doing a full historical crawl.
|
||||
When `--channels` includes a forum channel id, `discrawl` expands that forum's threads and syncs their messages as part of the targeted run.
|
||||
`--since` limits initial history/bootstrap and full-history backfill to messages at or after the given RFC3339 timestamp. It does not mark older history as complete, so a later `sync --full` without `--since` can continue the backfill.
|
||||
Long runs now emit periodic progress logs to stderr so large backfills and Git snapshot imports do not look hung.
|
||||
Long runs now emit periodic progress logs to stderr so large backfills do not look hung.
|
||||
If in-flight channels stop completing for a while, `discrawl` now emits `message sync waiting` heartbeat logs with the oldest active channel, per-channel page activity, and skip/defer counters, and every run ends with a `message sync finished` summary.
|
||||
Each channel crawl also has a bounded runtime budget, so a pathological channel is deferred and retried on the next sync instead of pinning a worker forever.
|
||||
Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl, and marker cleanup is best-effort so one missing local sync-state row cannot crash the run.
|
||||
Full sync member refresh is best-effort and currently gives up after five minutes without a caller-supplied deadline, so message sync completion is not held hostage by a slow guild member crawl.
|
||||
When the archive is already complete, `sync --full` now reuses the stored backlog markers and limits steady-state refresh to live top-level channels plus active threads instead of revisiting every stored archived thread.
|
||||
If a guild already has a local member snapshot, routine syncs reuse it and skip another full member crawl until that snapshot ages out.
|
||||
@ -276,7 +247,6 @@ This is the path for searchable DMs because bot tokens cannot read personal dire
|
||||
discrawl wiretap
|
||||
discrawl wiretap --path "$HOME/Library/Application Support/discord"
|
||||
discrawl wiretap --dry-run
|
||||
discrawl wiretap --full-cache
|
||||
discrawl wiretap --watch-every 2m
|
||||
```
|
||||
|
||||
@ -288,8 +258,7 @@ Notes:
|
||||
- preserves existing local `@me` guilds, channels, messages, and attachments when importing a Git snapshot, so a shared guild mirror refresh does not wipe local wiretap DM search
|
||||
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs; dropped rows are counted as `skipped_messages`
|
||||
- imports what Discord Desktop has cached locally, not complete live DM history
|
||||
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON, plus route-bearing Chromium HTTP cache entries by default
|
||||
- use `--full-cache` or `desktop.full_cache = true` for exhaustive Chromium cache import when you want slower historical guild-cache archaeology
|
||||
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON
|
||||
- does not extract, store, or print Discord auth tokens
|
||||
- `--max-file-bytes` skips unusually large files; default is 64 MiB
|
||||
|
||||
@ -485,9 +454,9 @@ discrawl subscribe --stale-after 15m https://github.com/example/discord-archive.
|
||||
discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git
|
||||
```
|
||||
|
||||
Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`). Imports are planned from crawlkit shard fingerprints, with a Git-object fallback for older manifests, so routine updates normally read only changed tail shards and preserve local FTS rows instead of rebuilding the whole archive. `discrawl update` forces the same pull/import step manually. `discrawl sync` does not auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast.
|
||||
Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`). `discrawl update` forces the same pull/import step manually.
|
||||
|
||||
Hybrid mode is supported too: keep normal Discord credentials configured and set `share.remote`. `discrawl sync --update=auto` and `discrawl messages --sync` import the Git snapshot first, usually as a changed-shard delta, then use live Discord for latest-message deltas. Use `sync --all-channels` or `sync --full` when you intentionally want a broader live repair/backfill pass.
|
||||
Hybrid mode is supported too: keep normal Discord credentials configured and set `share.remote`. `discrawl sync` and `discrawl messages --sync` import the Git snapshot first, then use live Discord for latest-message deltas. Use `sync --all-channels` or `sync --full` when you intentionally want a broader live repair/backfill pass.
|
||||
|
||||
Git snapshots publish non-DM archive tables by default. Embedding queue state stays local to each machine, and Git-only readers can use FTS immediately without an embedding provider.
|
||||
|
||||
@ -518,7 +487,7 @@ discrawl report --readme path/to/discord-backup/README.md
|
||||
|
||||
Every scheduled snapshot publish updates deterministic README stats: latest update time, latest archived message, archive totals, and day/week/month activity.
|
||||
|
||||
The backup workflows restore and save `.discrawl-ci/discrawl.db` with `actions/cache`. On a warm runner cache, scheduled publishers skip the pre-sync snapshot import and go straight to the live latest-message delta before publishing. Cache misses still import the latest published snapshot first so `--latest-only` has channel cursors to resume from.
|
||||
The backup workflows restore and save `.discrawl-ci/discrawl.db` with `actions/cache`. On a warm runner cache, `discrawl update` compares the cached DB's last imported snapshot timestamp with `manifest.json` and skips the full sharded import when they match. Cache misses and newer backup manifests still take the normal pull/import path.
|
||||
|
||||
### `digest`
|
||||
|
||||
@ -598,7 +567,6 @@ attachment_text = true
|
||||
[desktop]
|
||||
path = "~/.config/discord" # macOS default: "~/Library/Application Support/discord"
|
||||
max_file_bytes = 67108864
|
||||
full_cache = false
|
||||
|
||||
[search]
|
||||
default_mode = "fts"
|
||||
@ -706,7 +674,6 @@ go run github.com/golangci/golangci-lint/v2/cmd/golangci-lint@v2.11.1 run
|
||||
go test ./... -coverprofile=/tmp/discrawl.cover
|
||||
go tool cover -func=/tmp/discrawl.cover | tail -n 1
|
||||
go build ./cmd/discrawl
|
||||
go run ./cmd/discrawl help | grep tui
|
||||
```
|
||||
|
||||
Target coverage is `>= 85%`.
|
||||
|
||||
2
SPEC.md
2
SPEC.md
@ -465,14 +465,12 @@ Expected flags:
|
||||
- `--dry-run`
|
||||
- `--watch-every <duration>`
|
||||
- `--max-file-bytes <bytes>`
|
||||
- `--full-cache`
|
||||
|
||||
Requirements:
|
||||
|
||||
- never use Discord user tokens
|
||||
- never extract or persist auth tokens from desktop cache
|
||||
- scan bounded local files only
|
||||
- default to route-bearing HTTP cache entries; exhaustive Chromium cache scans require explicit full-cache mode
|
||||
- store sanitized raw metadata, not full arbitrary cache blobs
|
||||
|
||||
### `search`
|
||||
|
||||
@ -4,17 +4,12 @@ import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/signal"
|
||||
"syscall"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/cli"
|
||||
"github.com/steipete/discrawl/internal/cli"
|
||||
)
|
||||
|
||||
func main() {
|
||||
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
|
||||
err := cli.Run(ctx, os.Args[1:], os.Stdout, os.Stderr)
|
||||
stop()
|
||||
if err != nil {
|
||||
if err := cli.Run(context.Background(), os.Args[1:], os.Stdout, os.Stderr); err != nil {
|
||||
fmt.Fprintln(os.Stderr, err.Error())
|
||||
os.Exit(cli.ExitCode(err))
|
||||
}
|
||||
|
||||
@ -1,18 +1,10 @@
|
||||
package main
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"errors"
|
||||
"fmt"
|
||||
"os"
|
||||
"os/exec"
|
||||
"path/filepath"
|
||||
"syscall"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestMainHelpAndVersion(t *testing.T) {
|
||||
@ -46,137 +38,3 @@ func TestMainHelpAndVersion(t *testing.T) {
|
||||
}
|
||||
t.Fatalf("expected exit code 2, got %v", err)
|
||||
}
|
||||
|
||||
func TestMainCancelsWatchOnSIGTERM(t *testing.T) {
|
||||
if os.Getenv("DISCRAWL_MAIN_SIGNAL_CHILD") == "1" {
|
||||
dir := t.TempDir()
|
||||
cfgPath := filepath.Join(dir, "config.toml")
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = filepath.Join(dir, "discrawl.db")
|
||||
cfg.CacheDir = filepath.Join(dir, "cache")
|
||||
cfg.LogDir = filepath.Join(dir, "logs")
|
||||
cfg.Desktop.Path = filepath.Join(dir, "discord")
|
||||
requireNoError(t, os.MkdirAll(cfg.Desktop.Path, 0o755))
|
||||
requireNoError(t, config.Write(cfgPath, cfg))
|
||||
|
||||
oldArgs := os.Args
|
||||
t.Cleanup(func() { os.Args = oldArgs })
|
||||
os.Args = []string{"discrawl", "--config", cfgPath, "wiretap", "--dry-run", "--watch-every", "1s"}
|
||||
go func() {
|
||||
time.Sleep(50 * time.Millisecond)
|
||||
process, err := os.FindProcess(os.Getpid())
|
||||
if err == nil {
|
||||
_ = process.Signal(syscall.SIGTERM)
|
||||
}
|
||||
}()
|
||||
main()
|
||||
return
|
||||
}
|
||||
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
t.Fatalf("os.Executable: %v", err)
|
||||
}
|
||||
cmd := exec.CommandContext(t.Context(), exe, "-test.run=TestMainCancelsWatchOnSIGTERM")
|
||||
cmd.Env = append(os.Environ(), "DISCRAWL_MAIN_SIGNAL_CHILD=1")
|
||||
output, err := cmd.CombinedOutput()
|
||||
if isContextCanceledExit(err, output) {
|
||||
return
|
||||
}
|
||||
if err != nil {
|
||||
t.Fatalf("expected graceful SIGTERM cancellation, got %v", err)
|
||||
}
|
||||
}
|
||||
|
||||
func TestMainCancelsWiretapImportOnSIGTERMWithoutCorruptingDB(t *testing.T) {
|
||||
if dir := os.Getenv("DISCRAWL_MAIN_IMPORT_SIGNAL_DIR"); dir != "" {
|
||||
runWiretapImportSignalChild(t, dir)
|
||||
return
|
||||
}
|
||||
|
||||
dir := t.TempDir()
|
||||
exe, err := os.Executable()
|
||||
if err != nil {
|
||||
t.Fatalf("os.Executable: %v", err)
|
||||
}
|
||||
cmd := exec.CommandContext(t.Context(), exe, "-test.run=TestMainCancelsWiretapImportOnSIGTERMWithoutCorruptingDB")
|
||||
cmd.Env = append(os.Environ(), "DISCRAWL_MAIN_IMPORT_SIGNAL_DIR="+dir)
|
||||
output, err := cmd.CombinedOutput()
|
||||
if !isContextCanceledExit(err, output) {
|
||||
t.Fatalf("expected context-canceled exit from SIGTERM, got err=%v output=%s", err, output)
|
||||
}
|
||||
|
||||
ctx := t.Context()
|
||||
s, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
if err != nil {
|
||||
t.Fatalf("open db after SIGTERM: %v output=%s", err, output)
|
||||
}
|
||||
defer func() { _ = s.Close() }()
|
||||
_, rows, err := s.ReadOnlyQuery(ctx, "pragma quick_check")
|
||||
if err != nil {
|
||||
t.Fatalf("quick_check after SIGTERM: %v output=%s", err, output)
|
||||
}
|
||||
if len(rows) != 1 || len(rows[0]) != 1 || rows[0][0] != "ok" {
|
||||
t.Fatalf("quick_check after SIGTERM = %#v output=%s", rows, output)
|
||||
}
|
||||
}
|
||||
|
||||
func runWiretapImportSignalChild(t *testing.T, dir string) {
|
||||
t.Helper()
|
||||
|
||||
cfgPath := filepath.Join(dir, "config.toml")
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = filepath.Join(dir, "discrawl.db")
|
||||
cfg.CacheDir = filepath.Join(dir, "cache")
|
||||
cfg.LogDir = filepath.Join(dir, "logs")
|
||||
cfg.Desktop.Path = filepath.Join(dir, "discord")
|
||||
cfg.Discord.TokenSource = "none"
|
||||
cfg.Share.AutoUpdate = false
|
||||
cachePath := filepath.Join(cfg.Desktop.Path, "Local Storage", "leveldb")
|
||||
requireNoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
requireNoError(t, config.Write(cfgPath, cfg))
|
||||
writeLargeWiretapCache(t, filepath.Join(cachePath, "000001.log"), 50000)
|
||||
|
||||
oldArgs := os.Args
|
||||
t.Cleanup(func() { os.Args = oldArgs })
|
||||
os.Args = []string{"discrawl", "--config", cfgPath, "wiretap", "--path", cfg.Desktop.Path}
|
||||
go func() {
|
||||
time.Sleep(15 * time.Millisecond)
|
||||
process, err := os.FindProcess(os.Getpid())
|
||||
if err == nil {
|
||||
_ = process.Signal(syscall.SIGTERM)
|
||||
}
|
||||
}()
|
||||
main()
|
||||
}
|
||||
|
||||
func writeLargeWiretapCache(t *testing.T, path string, count int) {
|
||||
t.Helper()
|
||||
|
||||
file, err := os.Create(path)
|
||||
requireNoError(t, err)
|
||||
defer func() { requireNoError(t, file.Close()) }()
|
||||
_, err = fmt.Fprintln(file, `{"id":"111111111111111117","guild_id":"999999999999999997","type":0,"name":"sigterm-import"}`)
|
||||
requireNoError(t, err)
|
||||
for i := range count {
|
||||
_, err = fmt.Fprintf(
|
||||
file,
|
||||
`{"id":"3333333333%09d","channel_id":"111111111111111117","content":"sigterm import message %d","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222228","username":"alice"}}`+"\n",
|
||||
i,
|
||||
i,
|
||||
)
|
||||
requireNoError(t, err)
|
||||
}
|
||||
}
|
||||
|
||||
// isContextCanceledExit reports whether err is (or wraps) an *exec.ExitError
// with exit code 1 and output contains the text "context canceled".
func isContextCanceledExit(err error, output []byte) bool {
	var exitErr *exec.ExitError
	if !errors.As(err, &exitErr) {
		return false
	}
	if exitErr.ExitCode() != 1 {
		return false
	}
	return bytes.Contains(output, []byte("context canceled"))
}
|
||||
|
||||
// requireNoError stops the test immediately with t.Fatal when err is non-nil.
// It is marked as a helper so the failure is attributed to the caller's line.
func requireNoError(t *testing.T, err error) {
	t.Helper()
	if err == nil {
		return
	}
	t.Fatal(err)
}
|
||||
|
||||
@ -1 +0,0 @@
|
||||
discrawl.sh
|
||||
@ -1,46 +0,0 @@
|
||||
# Discrawl
|
||||
|
||||
Mirror Discord guilds into local SQLite. Search server history without depending on Discord search. Bring a bot token, or read everything offline from a Git snapshot.
|
||||
|
||||
## What it does
|
||||
|
||||
- discovers every guild a bot can access and syncs channels, threads, members, and message history into SQLite
|
||||
- maintains FTS5 indexes for fast literal search; optional embeddings for semantic and hybrid recall
|
||||
- imports classifiable Discord Desktop cache messages with `wiretap`, including proven DMs under `@me`
|
||||
- tails the Gateway for live updates with periodic repair sweeps
|
||||
- publishes the archive as sharded NDJSON snapshots in a private Git repo so readers can search offline with no Discord credentials
|
||||
- exposes read-only SQL, channel/member directories, mention queries, digests, and trend analytics
|
||||
|
||||
## Pick your path
|
||||
|
||||
- **New here?** Read [Install](install.html) and run `discrawl init`.
|
||||
- **Already have a bot?** Jump to [`sync`](commands/sync.html) and [`search`](commands/search.html).
|
||||
- **Just want to read a shared archive?** Use [`subscribe`](commands/subscribe.html) - no token needed.
|
||||
- **Need DM search?** [`wiretap`](commands/wiretap.html) imports local Discord Desktop cache.
|
||||
- **Want semantic search?** Configure [Embeddings](guides/embeddings.html), then run [`embed`](commands/embed.html).
|
||||
- **Wiring an agent or launcher?** `discrawl metadata --json`, `discrawl status --json`, and `discrawl doctor --json` expose the read-only crawlkit control surface.
|
||||
|
||||
## At a glance
|
||||
|
||||
```bash
|
||||
export DISCORD_BOT_TOKEN="..."
|
||||
discrawl init
|
||||
discrawl doctor
|
||||
discrawl sync --full
|
||||
discrawl search "panic: nil pointer"
|
||||
discrawl tail
|
||||
```
|
||||
|
||||
[`discrawl tui`](commands/tui.html) uses the shared crawlkit terminal explorer:
|
||||
channel/person/thread groups on the left, message rows in the middle, and
|
||||
readable message/thread detail on the right.
|
||||
|
||||
## Sections
|
||||
|
||||
- **[Start](install.html)** - install, configure, set up the Discord bot, security notes, contact
|
||||
- **[Guides](guides/)** - sync sources, wiretap internals, search modes, embeddings, Git snapshots, data layout
|
||||
- **[Commands](commands/)** - one page per CLI command
|
||||
|
||||
## Where to file issues
|
||||
|
||||
`https://github.com/openclaw/discrawl/issues`. See [contact](contact.html) for project links.
|
||||
@ -7,7 +7,7 @@ summary: "Release checklist for discrawl (GitHub release binaries via GoReleaser
|
||||
Always do all steps below. No partial releases.
|
||||
|
||||
Assumptions:
|
||||
- Repo: `openclaw/discrawl`
|
||||
- Repo: `steipete/discrawl`
|
||||
- Binary: `discrawl`
|
||||
- GoReleaser config: `.goreleaser.yaml`
|
||||
- Homebrew tap repo: `~/Projects/homebrew-tap`
|
||||
@ -82,7 +82,7 @@ After tagging a real release:
|
||||
Useful commands:
|
||||
|
||||
```sh
|
||||
curl -L -o /tmp/discrawl-darwin-arm64.tgz https://github.com/openclaw/discrawl/releases/download/vX.Y.Z/discrawl_X.Y.Z_darwin_arm64.tar.gz
|
||||
curl -L -o /tmp/discrawl-darwin-arm64.tgz https://github.com/steipete/discrawl/releases/download/vX.Y.Z/discrawl_X.Y.Z_darwin_arm64.tar.gz
|
||||
shasum -a 256 /tmp/discrawl-darwin-arm64.tgz
|
||||
brew uninstall discrawl || true
|
||||
brew install steipete/tap/discrawl
|
||||
@ -92,7 +92,7 @@ brew info steipete/tap/discrawl
|
||||
|
||||
## Notes
|
||||
|
||||
- Build-time version stamping comes from `-X github.com/openclaw/discrawl/internal/cli.version={{ .Version }}`
|
||||
- Build-time version stamping comes from `-X github.com/steipete/discrawl/internal/cli.version={{ .Version }}`
|
||||
- If release workflow needs a rerun:
|
||||
|
||||
```sh
|
||||
|
||||
@ -1,63 +0,0 @@
|
||||
# Discord bot setup
|
||||
|
||||
Discrawl needs a real Discord bot token to run `sync` or `tail`. Not a user token. The desktop `wiretap` import does not need any token.
|
||||
|
||||
## Minimum setup
|
||||
|
||||
1. Create or reuse a Discord application in the [Discord developer portal](https://discord.com/developers/applications).
|
||||
2. Add a bot user to that application.
|
||||
3. Invite the bot to the target guilds.
|
||||
4. Enable these intents for the bot:
|
||||
- **Server Members Intent**
|
||||
- **Message Content Intent**
|
||||
5. Ensure the bot can at least:
|
||||
- view channels
|
||||
- read message history
|
||||
|
||||
Without those intents/permissions, `sync`, `tail`, member snapshots, and message content archiving will be partial or fail outright.
|
||||
|
||||
## Provide the token
|
||||
|
||||
### Environment variable
|
||||
|
||||
```bash
|
||||
export DISCORD_BOT_TOKEN="your-bot-token"
|
||||
discrawl doctor
|
||||
```
|
||||
|
||||
If you keep shell secrets in `~/.profile`, add the export there and reload your shell.
|
||||
|
||||
### OS keyring
|
||||
|
||||
If you prefer the OS keyring, keep the token out of config and store it in the default keyring item:
|
||||
|
||||
```bash
|
||||
# macOS Keychain
|
||||
security add-generic-password -U -s discrawl -a discord_bot_token -w "$DISCORD_BOT_TOKEN"
|
||||
|
||||
# Linux Secret Service / libsecret
|
||||
printf %s "$DISCORD_BOT_TOKEN" | secret-tool store --label="discrawl Discord bot token" service discrawl username discord_bot_token
|
||||
|
||||
# Windows Credential Manager
|
||||
cmdkey /generic:discrawl:discord_bot_token /user:discord_bot_token /pass:%DISCORD_BOT_TOKEN%
|
||||
```
|
||||
|
||||
Set `discord.token_source = "keyring"` if you want to require the keyring and skip env entirely.
|
||||
|
||||
## Verify
|
||||
|
||||
```bash
|
||||
discrawl doctor
|
||||
```
|
||||
|
||||
`doctor` reports the token source (env or keyring), confirms bot auth, lists how many guilds the bot can access, and verifies the local DB plus FTS wiring. It does not print the token contents.
|
||||
|
||||
## Wiretap-only setup
|
||||
|
||||
If you only want to import local Discord Desktop cache messages and not run a bot, skip everything above and run:
|
||||
|
||||
```bash
|
||||
discrawl sync --source wiretap
|
||||
```
|
||||
|
||||
Or `discrawl wiretap` directly. See the [wiretap guide](guides/wiretap.html).
|
||||
@ -1,37 +0,0 @@
|
||||
# `analytics`
|
||||
|
||||
Groups activity-style queries under one namespace.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl analytics
|
||||
discrawl analytics quiet --since 30d
|
||||
discrawl analytics quiet --guild 123456789012345678
|
||||
discrawl analytics trends --weeks 8
|
||||
discrawl analytics trends --weeks 12 --channel general
|
||||
discrawl --json analytics quiet --since 60d
|
||||
discrawl --json analytics trends --weeks 4
|
||||
```
|
||||
|
||||
## Subcommands
|
||||
|
||||
### `quiet`
|
||||
|
||||
Top-level text/announcement channels with no messages in the lookback window, including never-active channels.
|
||||
|
||||
- `--since <duration>` - lookback window (e.g. `30d`, `60d`)
|
||||
- `--guild <id>` - scope to one guild; when omitted, `default_guild_id` is used if configured
|
||||
|
||||
### `trends`
|
||||
|
||||
Monday-start UTC weekly message counts per message-capable channel.
|
||||
|
||||
- `--weeks <n>` - number of weeks to include
|
||||
- `--channel <id|name>` - scope to one channel
|
||||
- `--guild <id>` - scope to one guild
|
||||
|
||||
## See also
|
||||
|
||||
- [`digest`](digest.html)
|
||||
- [`status`](status.html)
|
||||
@ -1,25 +0,0 @@
|
||||
# `channels`
|
||||
|
||||
Browse the offline channel directory.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl channels list
|
||||
discrawl channels show 123456789012345678
|
||||
```
|
||||
|
||||
## Subcommands
|
||||
|
||||
- `list` - dump every channel and thread in the local archive
|
||||
- `show <id>` - show metadata for one channel/thread
|
||||
|
||||
## Notes
|
||||
|
||||
- threads are stored as channels because that matches the Discord model
|
||||
- archived threads are part of the sync surface and appear here too
|
||||
|
||||
## See also
|
||||
|
||||
- [`members`](members.html)
|
||||
- [Data layout](../guides/data-storage.html)
|
||||
@ -1,29 +0,0 @@
|
||||
# `digest`
|
||||
|
||||
Summarizes per-channel activity for a lookback window.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl digest
|
||||
discrawl digest --since 30d
|
||||
discrawl digest --guild 123456789012345678
|
||||
discrawl digest --channel general
|
||||
discrawl --json digest --since 7d --top-n 5
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
- `--since <duration>` - Go durations (`72h`, `30m`) and `Nd` shorthand (`7d`, `30d`)
|
||||
- `--guild <id>` - scope to one guild; when omitted, `default_guild_id` is used if configured
|
||||
- `--channel <id|name>` - scope to one channel
|
||||
- `--top-n <n>` - how many top posters and mention targets per channel
|
||||
|
||||
## Output
|
||||
|
||||
For each channel in scope: message count, top posters, top mention targets, first/last activity in window.
|
||||
|
||||
## See also
|
||||
|
||||
- [`analytics`](analytics.html)
|
||||
- [`mentions`](mentions.html)
|
||||
@ -1,39 +0,0 @@
|
||||
# `dms`
|
||||
|
||||
Lists local wiretap DM conversations or reads one DM thread. Convenience layer over the synthetic `@me` guild id.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl dms
|
||||
discrawl dms --with Molty --last 20
|
||||
discrawl dms --with 1456464433768300635 --all
|
||||
discrawl dms --search "launch checklist"
|
||||
discrawl dms --with Molty --search "invoice"
|
||||
```
|
||||
|
||||
## Default output
|
||||
|
||||
`discrawl dms` (no flags) shows one row per local DM channel with:
|
||||
|
||||
- message count
|
||||
- author count
|
||||
- first/last cached message times
|
||||
|
||||
## Flags
|
||||
|
||||
- `--with <name|id>` - switches to message output for that DM conversation (unless `--list` is also set)
|
||||
- `--list` - keep the channel-summary listing even when `--with` is set
|
||||
- `--search <query>` - search only local DM messages
|
||||
- `--last <n>` / `--all` / `--limit <n>` - same slicing as [`messages`](messages.html)
|
||||
|
||||
## Notes
|
||||
|
||||
- only sees data imported by [`wiretap`](wiretap.html) - Discord Desktop cache, not live DM history
|
||||
- skips Git snapshot auto-update because DMs are never imported from the shared mirror
|
||||
- DMs are local-only and never published
|
||||
|
||||
## See also
|
||||
|
||||
- [Wiretap guide](../guides/wiretap.html)
|
||||
- [`messages --dm`](messages.html)
|
||||
@ -1,35 +0,0 @@
|
||||
# `doctor`
|
||||
|
||||
Checks config, auth, DB, and FTS wiring. The fastest sanity check.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl doctor
|
||||
```
|
||||
|
||||
## What it verifies
|
||||
|
||||
- config loads from the expected path
|
||||
- where the bot token was resolved from (env var or OS keyring)
|
||||
- bot auth succeeds against Discord
|
||||
- how many guilds the bot can access
|
||||
- local SQLite database exists and the schema version matches the binary
|
||||
- FTS5 index is wired up
|
||||
|
||||
## What it does not do
|
||||
|
||||
- does not print the token contents
|
||||
- does not run a sync; it only checks readiness
|
||||
|
||||
## Common outputs
|
||||
|
||||
- "token from env (DISCORD_BOT_TOKEN)" or "token from keyring (discrawl/discord_bot_token)"
|
||||
- "0 guilds visible" - bot is not invited to any guild yet, or intents/permissions are missing
|
||||
- "schema newer than binary" - update `discrawl` to a build that supports the local DB schema
|
||||
|
||||
## See also
|
||||
|
||||
- [Bot setup](../bot-setup.html)
|
||||
- [Configuration](../configuration.html)
|
||||
- [`status`](status.html)
|
||||
@ -1,42 +0,0 @@
|
||||
# `embed`
|
||||
|
||||
Drains pending `embedding_jobs` rows by calling the configured embedding provider and writing vectors to `message_embeddings`.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl embed
|
||||
discrawl embed --limit 1000
|
||||
discrawl embed --rebuild --limit 1000
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
- `--limit <n>` - cap how many jobs this run drains
|
||||
- `--batch-size <n>` - provider request batch size
|
||||
- `--rebuild` - regenerate vectors for the existing archive after a provider/model/input-version change
|
||||
|
||||
## Behavior
|
||||
|
||||
- claims jobs with a short lock so overlapping runs do not process the same batch
|
||||
- rate limits requeue the batch and stop that drain run cleanly
|
||||
- provider or validation failures retry up to three attempts before the job is marked failed
|
||||
- messages with no normalized text are marked done and any stale vector for that message is removed
|
||||
|
||||
## Identity
|
||||
|
||||
Provider, model, and input version are stored on each job and vector. Changing any of them retargets pending jobs to the new identity and resets prior attempts. Existing vectors for another identity remain in SQLite but are not used by semantic search.
|
||||
|
||||
## When to use `--rebuild`
|
||||
|
||||
After changing `[search.embeddings]` provider, model, or any input setting, when you want to regenerate vectors for messages already in the archive.
|
||||
|
||||
## Pairing with `sync`
|
||||
|
||||
`sync --with-embeddings` enqueues; `embed` drains. The two phases are intentionally separate so a slow provider does not block the hot sync path.
|
||||
|
||||
## See also
|
||||
|
||||
- [Embeddings guide](../guides/embeddings.html)
|
||||
- [Search modes](../guides/search-modes.html)
|
||||
- [`search`](search.html)
|
||||
@ -1,31 +0,0 @@
|
||||
# `init`
|
||||
|
||||
Creates the local config and discovers accessible guilds.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl init
|
||||
discrawl init --guild 123456789012345678
|
||||
discrawl init --db ~/data/discrawl.db
|
||||
discrawl init --with-embeddings
|
||||
```
|
||||
|
||||
## What it does
|
||||
|
||||
- writes `~/.discrawl/config.toml` (or whatever `--config` / `DISCRAWL_CONFIG` points to)
|
||||
- discovers guilds the configured bot can access
|
||||
- if exactly one guild is available, sets it as `default_guild_id` automatically
|
||||
- creates the SQLite database at `db_path`
|
||||
|
||||
## Flags
|
||||
|
||||
- `--guild <id>` - set a specific default guild instead of auto-picking
|
||||
- `--db <path>` - override `db_path`
|
||||
- `--with-embeddings` - enable `[search.embeddings]` in the generated config
|
||||
|
||||
## See also
|
||||
|
||||
- [Configuration](../configuration.html)
|
||||
- [Bot setup](../bot-setup.html)
|
||||
- [`doctor`](doctor.html)
|
||||
@ -1,72 +0,0 @@
|
||||
# `members`
|
||||
|
||||
Browse the offline member directory built from archived profile payloads.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl members list
|
||||
discrawl members show 123456789012345678
|
||||
discrawl members show --messages 10 steipete
|
||||
discrawl members search "peter"
|
||||
discrawl members search "github"
|
||||
discrawl members search "https://github.com/steipete"
|
||||
```
|
||||
|
||||
## Subcommands
|
||||
|
||||
- `list` - dump the local member directory
|
||||
- `show <id|query>` - show one member; if the query resolves to one match, also show recent messages
|
||||
- `search <query>` - match names plus any offline profile fields present in the archived member payload
|
||||
|
||||
## Flags
|
||||
|
||||
- `show --messages <n>` - include the most recent `n` messages from that member
|
||||
|
||||
## Profile fields
|
||||
|
||||
Extracted from the archived Discord member/user payload. May include:
|
||||
|
||||
- `bio`
|
||||
- `pronouns`
|
||||
- `location`
|
||||
- `website`
|
||||
- `x`
|
||||
- `github`
|
||||
- discovered URLs
|
||||
|
||||
If the bot cannot see a field from Discord, `discrawl` cannot invent it. This is strictly archive-based offline data.
|
||||
|
||||
## Typical workflow
|
||||
|
||||
```bash
|
||||
discrawl sync --full
|
||||
discrawl members search "design engineer"
|
||||
discrawl members search "github"
|
||||
discrawl members show --messages 25 steipete
|
||||
discrawl messages --author steipete --days 30 --all
|
||||
```
|
||||
|
||||
## Typical `members show` output
|
||||
|
||||
```text
|
||||
guild=1456350064065904867
|
||||
user=37658261826043904
|
||||
username=steipete
|
||||
display=Peter Steinberger
|
||||
joined=2026-03-08T16:03:14Z
|
||||
bot=false
|
||||
x=steipete
|
||||
github=steipete
|
||||
website=https://steipete.me
|
||||
bio=Builds native apps and tooling.
|
||||
urls=https://steipete.me, https://github.com/steipete
|
||||
message_count=1284
|
||||
first_message=2026-02-01T09:00:00Z
|
||||
last_message=2026-03-08T15:59:58Z
|
||||
```
|
||||
|
||||
## See also
|
||||
|
||||
- [`channels`](channels.html)
|
||||
- [Data layout](../guides/data-storage.html)
|
||||
@ -1,27 +0,0 @@
|
||||
# `mentions`
|
||||
|
||||
Lists structured user and role mentions extracted during sync.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl mentions --channel maintainers --days 7
|
||||
discrawl mentions --target steipete --type user --limit 50
|
||||
discrawl mentions --target 1456406468898197625
|
||||
discrawl --json mentions --type role --days 1
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
- `--target <id|name>` - user or role id, exact name, or partial match
|
||||
- `--type <user|role>` - filter by mention type
|
||||
- `--channel <id|name>` - same channel matching as [`messages`](messages.html)
|
||||
- `--guild <id>` / `--guilds <id,id>` - restrict the guild scope
|
||||
- `--days <n>` / `--since <RFC3339>` / `--before <RFC3339>` - time filters
|
||||
- `--limit <n>` - cap result count
|
||||
|
||||
## Notes
|
||||
|
||||
- mentions are recorded structurally during sync, so this is a direct row read - no FTS parsing
|
||||
- combine with `--type role` to find every mention of a role
|
||||
- combine with `--target steipete` to find everywhere your account got pinged
|
||||
@ -1,41 +0,0 @@
|
||||
# `messages`
|
||||
|
||||
Lists exact message slices by channel, author, and time range. Unlike [`search`](search.html), this does not query the FTS index - it pulls a slice of rows.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl messages --channel maintainers --days 7 --all
|
||||
discrawl messages --channel maintainers --hours 6 --all
|
||||
discrawl messages --channel "#maintainers" --since 2026-03-01T00:00:00Z
|
||||
discrawl messages --channel 1456744319972282449 --author steipete --limit 50
|
||||
discrawl messages --channel maintainers --last 100 --sync
|
||||
discrawl messages --dm --channel Molty --last 20
|
||||
discrawl messages --channel maintainers --days 7 --all --include-empty
|
||||
discrawl --json messages --channel maintainers --days 3
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
- `--channel <id|name|#name>` - id, exact name, `#name`, or partial name match
|
||||
- `--guild <id>` / `--guilds <id,id>` / `--dm` - restrict the guild scope (`--dm` is shorthand for `--guild @me`)
|
||||
- `--author <name>` - restrict to one author
|
||||
- `--hours <n>` - shorthand for "since now minus N hours"
|
||||
- `--days <n>` - shorthand for "since now minus N days"
|
||||
- `--since <RFC3339>` - explicit start timestamp
|
||||
- `--last <n>` - return the newest `N` matching messages, then print oldest-to-newest
|
||||
- `--limit <n>` - safety limit (default 200; `--all` removes it)
|
||||
- `--all` - removes the safety limit
|
||||
- `--sync` - blocking pre-query sync for the matching channel or guild scope
|
||||
- `--include-empty` - include rows with no displayable/searchable content
|
||||
|
||||
## Notes
|
||||
|
||||
- at least one filter is required
|
||||
- `--dm` skips Git snapshot auto-update because DMs are never imported from the shared mirror
|
||||
- use either `--last` for the newest matching rows or `--all` for an uncapped oldest-to-newest slice
|
||||
|
||||
## See also
|
||||
|
||||
- [`search`](search.html)
|
||||
- [`dms`](dms.html)
|
||||
@ -1,42 +0,0 @@
|
||||
# `publish`
|
||||
|
||||
Publishes the local SQLite archive as sharded, compressed NDJSON snapshots in a private Git repo.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl publish --remote https://github.com/example/discord-archive.git --push
|
||||
discrawl publish --readme path/to/discord-backup/README.md --push
|
||||
discrawl publish --message "sync: discord archive" --push
|
||||
discrawl publish --with-embeddings --push
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
- `--repo <path>` - local snapshot repo path (defaults to `[share].repo_path`)
|
||||
- `--remote <url>` - target Git remote (defaults to `[share].remote`)
|
||||
- `--branch <name>` - snapshot branch (defaults to `[share].branch`)
|
||||
- `--message <text>` - commit message (default: `sync: discord archive`)
|
||||
- `--no-commit` - write/export files without creating a Git commit
|
||||
- `--push` - push the snapshot commit after writing it
|
||||
- `--readme <path>` - update the activity block in this README file too
|
||||
- `--with-embeddings` - also export stored `message_embeddings` rows
|
||||
|
||||
## What is published
|
||||
|
||||
- non-DM archive tables (DM `@me` rows are always excluded)
|
||||
- README activity block (latest update, latest message, totals, day/week/month activity)
|
||||
- with `--with-embeddings`: vectors for the configured `[search.embeddings]` provider/model/input version, plus identity manifest
|
||||
|
||||
## What is not published
|
||||
|
||||
- `@me` DM guilds, channels, messages, events, attachments, mentions, wiretap sync state
|
||||
- `embedding_jobs`
|
||||
- raw bot tokens or any local secret
|
||||
|
||||
## See also
|
||||
|
||||
- [Git snapshots guide](../guides/git-snapshots.html)
|
||||
- [`subscribe`](subscribe.html)
|
||||
- [`update`](update.html)
|
||||
- [`report`](report.html)
|
||||
@ -1,35 +0,0 @@
|
||||
# `report`
|
||||
|
||||
Generates the Markdown activity block used by the shared backup repo README.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl report
|
||||
discrawl report --readme path/to/discord-backup/README.md
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
- `--readme <path>` - update the activity block in the given README file in place
|
||||
|
||||
## What gets rendered
|
||||
|
||||
Deterministic README stats:
|
||||
|
||||
- latest update time
|
||||
- latest archived message
|
||||
- archive totals
|
||||
- day / week / month activity
|
||||
|
||||
Every scheduled snapshot publish updates this block.
|
||||
|
||||
## CI integration
|
||||
|
||||
The backup workflows restore and save `.discrawl-ci/discrawl.db` with `actions/cache`. On a warm runner cache, scheduled publishers skip the pre-sync snapshot import and go straight to the live latest-message delta before publishing. Cache misses still import the latest published snapshot first so `--latest-only` has channel cursors to resume from.
|
||||
|
||||
## See also
|
||||
|
||||
- [`publish`](publish.html)
|
||||
- [Git snapshots](../guides/git-snapshots.html)
|
||||
- [`status`](status.html)
|
||||
@ -1,51 +0,0 @@
|
||||
# `search`
|
||||
|
||||
Searches archived messages. FTS is the default mode and works without embeddings.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl search "panic: nil pointer"
|
||||
discrawl search --mode fts "panic: nil pointer"
|
||||
discrawl search --mode semantic "missing launch checklist"
|
||||
discrawl search --mode hybrid "database timeout"
|
||||
discrawl search --guild 123456789012345678 "payment failed"
|
||||
discrawl search --dm "launch checklist"
|
||||
discrawl search --channel billing --author steipete --limit 50 "invoice"
|
||||
discrawl search --include-empty "GitHub"
|
||||
discrawl --json search "websocket closed"
|
||||
```
|
||||
|
||||
## Modes
|
||||
|
||||
- `fts` (default) - SQLite FTS5 with `unicode61` tokenizer; newest matches first
|
||||
- `semantic` - embeds the query, scores against locally stored vectors; errors out if embeddings are disabled or no compatible vectors exist
|
||||
- `hybrid` - runs both, deduplicates by message id, falls back to FTS when semantic is unavailable
|
||||
|
||||
## Flags
|
||||
|
||||
- `--mode <fts|semantic|hybrid>` - search mode
|
||||
- `--guild <id>` / `--guilds <id,id>` - restrict the guild scope
|
||||
- `--dm` - shorthand for `--guild @me`
|
||||
- `--channel <id|name|#name>` - restrict to one channel (id, exact name, `#name`, or partial match)
|
||||
- `--author <name>` - restrict to one author
|
||||
- `--limit <n>` - cap result count
|
||||
- `--include-empty` - include rows with no searchable content (attachment text/filenames, embeds, and replies still count as content)
|
||||
|
||||
## FTS behavior
|
||||
|
||||
User query terms are parameterized and quoted before `MATCH`, so tokens like `AND`, `OR`, `NOT`, `NEAR`, and `*` are searched as input terms instead of FTS operators. Punctuation still follows FTS5 tokenization rules.
|
||||
|
||||
## Semantic prerequisites
|
||||
|
||||
- `[search.embeddings]` configured in `~/.discrawl/config.toml`
|
||||
- local `message_embeddings` rows for the configured provider, model, and input version
|
||||
- input version is currently `message_normalized_v1`
|
||||
|
||||
Run `discrawl sync --with-embeddings` to enqueue, then `discrawl embed` to generate vectors.
|
||||
|
||||
## See also
|
||||
|
||||
- [Search modes](../guides/search-modes.html)
|
||||
- [Embeddings](../guides/embeddings.html)
|
||||
- [`messages`](messages.html) - exact slices, not search
|
||||
@ -1,25 +0,0 @@
|
||||
# `sql`
|
||||
|
||||
Runs read-only SQL against the local database.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl sql 'select count(*) as messages from messages'
|
||||
echo 'select guild_id, count(*) from messages group by guild_id' | discrawl sql -
|
||||
```
|
||||
|
||||
`-` reads SQL from stdin.
|
||||
|
||||
## Notes
|
||||
|
||||
- read-only - writes are blocked at the connection level
|
||||
- `--unsafe --confirm` opens the escape hatch for deliberate write/admin SQL
|
||||
- the schema is multi-guild ready; threads are stored as channels because that matches the Discord model
|
||||
- proven DMs use the synthetic guild id `@me`
|
||||
- SQLite schema migrations are versioned with `PRAGMA user_version`; startup fails fast when the local DB schema is newer than the binary supports
|
||||
|
||||
## See also
|
||||
|
||||
- [Data layout](../guides/data-storage.html) - what tables exist
|
||||
- [`status`](status.html) - high-level archive numbers without raw SQL
|
||||
@ -1,24 +0,0 @@
|
||||
# `status`
|
||||
|
||||
Shows local archive status.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl status
|
||||
```
|
||||
|
||||
## Reports
|
||||
|
||||
- where the local database lives
|
||||
- guild count and per-guild totals
|
||||
- channel and thread counts
|
||||
- message totals
|
||||
- latest archived message time
|
||||
- whether the Git share is configured and how stale the local import is
|
||||
- embeddings status if `[search.embeddings]` is enabled
|
||||
|
||||
## See also
|
||||
|
||||
- [`doctor`](doctor.html) - liveness check (config, auth, DB, FTS wiring)
|
||||
- [`report`](report.html) - Markdown activity block for the shared backup README
|
||||
@ -1,48 +0,0 @@
|
||||
# `subscribe`
|
||||
|
||||
Subscribes to a Git-backed snapshot repo. This is the Git-only setup path - no Discord bot token is required.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl subscribe https://github.com/example/discord-archive.git
|
||||
discrawl subscribe --repo ~/.discrawl/share https://github.com/example/discord-archive.git
|
||||
discrawl subscribe --branch main https://github.com/example/discord-archive.git
|
||||
discrawl subscribe --stale-after 15m https://github.com/example/discord-archive.git
|
||||
discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git
|
||||
discrawl subscribe --no-import https://github.com/example/discord-archive.git
|
||||
discrawl subscribe --with-embeddings https://github.com/example/discord-archive.git
|
||||
```
|
||||
|
||||
## What it does
|
||||
|
||||
- writes a config with `discord.token_source = "none"` (so no bot token is required)
|
||||
- imports the latest snapshot into the local SQLite archive
|
||||
- enables auto-refresh: read commands fetch and import when the local share import is older than `share.stale_after` (default `15m`)
|
||||
|
||||
## Flags
|
||||
|
||||
- `--repo <path>` - local snapshot repo path
|
||||
- `--branch <name>` - snapshot branch (default: `main`)
|
||||
- `--stale-after <duration>` - how stale the local import can get before read commands auto-refresh
|
||||
- `--no-auto-update` - disable auto-refresh (use [`update`](update.html) manually)
|
||||
- `--no-import` - write config only; skip the initial pull/import
|
||||
- `--with-embeddings` - import vectors that match your local `[search.embeddings]` identity
|
||||
|
||||
## Disabled in this mode
|
||||
|
||||
`sync` and `tail` are disabled when `discord.token_source = "none"` because they need live Discord access. Switch to a token-equipped config to re-enable them.
|
||||
|
||||
## After subscribing
|
||||
|
||||
```bash
|
||||
discrawl search "launch checklist"
|
||||
discrawl messages --channel general --hours 24
|
||||
discrawl status
|
||||
```
|
||||
|
||||
## See also
|
||||
|
||||
- [Git snapshots guide](../guides/git-snapshots.html)
|
||||
- [`publish`](publish.html)
|
||||
- [`update`](update.html)
|
||||
@ -1,82 +0,0 @@
|
||||
# `sync`
|
||||
|
||||
Refreshes SQLite from one or both archive sources.
|
||||
|
||||
By default, `sync` runs both live/local sources and does **not** import the Git snapshot first:
|
||||
|
||||
- Discord bot-token sync for bot-visible guild data
|
||||
- local Discord Desktop cache import for classifiable cached messages and proven DMs
|
||||
|
||||
Use [`update`](update.html) when you want to pull/import the shared Git snapshot. Snapshot imports normally use changed-shard deltas, but unsafe table changes fall back to a full import. If you intentionally want a sync run to import the snapshot before live deltas, pass `--update=auto` (only when stale) or `--update=force` (always). `--no-update` is accepted as an explicit no-op alias for the default.
|
||||
|
||||
Run one explicit `--full` pass when you want a complete historical guild archive. Use plain `sync` afterward for frequent latest-message and desktop-cache refreshes.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl sync
|
||||
discrawl sync --update=auto
|
||||
discrawl sync --update=force
|
||||
discrawl sync --no-update
|
||||
discrawl sync --full
|
||||
discrawl sync --full --all
|
||||
discrawl sync --guild 123456789012345678
|
||||
discrawl sync --guilds 123,456 --concurrency 8
|
||||
discrawl sync --source both # default: bot API + desktop cache
|
||||
discrawl sync --source discord # bot API only; aliases: key, bot, api
|
||||
discrawl sync --source wiretap # desktop cache only; aliases: desktop, cache
|
||||
discrawl sync --guild 123456789012345678 --all-channels
|
||||
discrawl sync --channels 111,222 --since 2026-03-01T00:00:00Z
|
||||
discrawl sync --with-embeddings
|
||||
```
|
||||
|
||||
## Sources
|
||||
|
||||
| Source | Reads from | Stores |
|
||||
| --- | --- | --- |
|
||||
| `both` | Discord bot API and local Discord Desktop cache | bot-visible guild data plus classifiable cached desktop messages |
|
||||
| `discord` / `key` | Discord bot API | guilds, channels, threads, members, and messages the bot can access |
|
||||
| `wiretap` | local Discord Desktop cache files | classifiable cached messages; proven DMs are stored under `@me` |
|
||||
|
||||
## Bot sync modes
|
||||
|
||||
| Command | Use when | Behavior |
|
||||
| --- | --- | --- |
|
||||
| `discrawl sync` | routine refresh | skips member refreshes, checks live top-level channels plus active threads, only fetches new messages for channels with a stored cursor |
|
||||
| `discrawl sync --update=auto` | hybrid Git/live refresh | imports a stale Git snapshot first, then runs the routine live refresh |
|
||||
| `discrawl sync --all-channels` | repair pass | broad incremental sweep across every stored channel/thread, including archived threads |
|
||||
| `discrawl sync --full` | historical backfill | crawls older history until channels are complete |
|
||||
|
||||
## Flags
|
||||
|
||||
- `--source <both|discord|wiretap>` - which archive sources to read
|
||||
- `--update <auto|force|none>` - whether to import the Git snapshot before live deltas
|
||||
- `--full` - historical backfill (slow on large guilds)
|
||||
- `--all-channels` - broader incremental sweep across every stored channel/thread
|
||||
- `--latest-only` - explicit latest-only run (also the default for untargeted `sync`)
|
||||
- `--all` - ignore `default_guild_id` and fan out across every discovered guild
|
||||
- `--guild <id>` / `--guilds <id,id>` - target specific guilds
|
||||
- `--channels <id,id>` - target specific channels (forum ids expand to threads)
|
||||
- `--since <RFC3339>` - limit initial history and `--full` backfill to messages at or after this timestamp
|
||||
- `--concurrency <n>` - override worker count (default auto-sized: floor 8, cap 32)
|
||||
- `--skip-members` - refresh guild/channel/message data without crawling members
|
||||
- `--with-embeddings` - also enqueue changed messages into `embedding_jobs`
|
||||
|
||||
## Notes
|
||||
|
||||
- `--latest-only` is the default for untargeted `sync`. Use `--all-channels` to opt out without doing a full historical crawl.
|
||||
- `--since` does not mark older history as complete, so a later `sync --full` without `--since` can continue the backfill.
|
||||
- Long runs emit periodic progress logs to stderr.
|
||||
- Heartbeat logs (`message sync waiting`) name the oldest active channel and per-channel page activity if in-flight channels stop completing for a while.
|
||||
- Every run ends with a `message sync finished` summary.
|
||||
- Each channel crawl has a bounded runtime budget; pathological channels are deferred and retried next sync.
|
||||
- Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl.
|
||||
- Marker cleanup is best-effort, so one missing local sync-state row cannot crash the run.
|
||||
- Full sync member refresh is best-effort; when the caller supplies no deadline, it gives up after five minutes.
|
||||
- When the archive is already complete, `sync --full` reuses backlog markers and limits steady-state refresh to live top-level channels plus active threads.
|
||||
|
||||
## See also
|
||||
|
||||
- [Sync sources](../guides/sync-sources.html)
|
||||
- [`tail`](tail.html)
|
||||
- [`update`](update.html)
|
||||
@ -1,33 +0,0 @@
|
||||
# `tail`
|
||||
|
||||
Runs the live Discord Gateway tail and a periodic repair loop.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl tail
|
||||
discrawl tail --guild 123456789012345678
|
||||
discrawl tail --repair-every 30m
|
||||
```
|
||||
|
||||
## What it does
|
||||
|
||||
- connects to the Discord Gateway with the configured bot token
|
||||
- writes new messages, edits, and deletes into the local archive as they arrive
|
||||
- periodically runs a repair pass to catch anything the live stream missed
|
||||
|
||||
## Flags
|
||||
|
||||
- `--guild <id>` / `--guilds <id,id>` - tail a specific guild scope (default: `default_guild_id`, or all discovered guilds if unset)
|
||||
- `--repair-every <duration>` - frequency of the repair sweep
|
||||
|
||||
## Notes
|
||||
|
||||
- requires a working Discord bot token
|
||||
- not available in Git-only mode (`discord.token_source = "none"`)
|
||||
- terminates cleanly on SIGINT / SIGTERM and treats cancellation as normal exit
|
||||
|
||||
## See also
|
||||
|
||||
- [`sync`](sync.html)
|
||||
- [Bot setup](../bot-setup.html)
|
||||
@ -1,47 +0,0 @@
|
||||
# `tui`
|
||||
|
||||
Opens the local terminal archive browser for stored messages.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl tui
|
||||
discrawl tui --guild 123456789012345678 --channel general
|
||||
discrawl tui --guilds 123,456 --author 1456464433768300635
|
||||
discrawl tui --dm
|
||||
discrawl --json tui --limit 50
|
||||
```
|
||||
|
||||
## What it shows
|
||||
|
||||
The browser uses the shared crawlkit explorer:
|
||||
|
||||
- left pane: channel, person, or thread groups
|
||||
- middle pane: newest matching message rows
|
||||
- right pane: selected message detail, attachments, replies, and thread context
|
||||
- footer: local DB or remote Git snapshot source
|
||||
|
||||
Mouse selection, right-click actions, sortable headers, refresh, and chat layout match the other crawlkit-backed archive tools.
|
||||
|
||||
## Flags
|
||||
|
||||
- `--guild <id>` / `--guilds <id,id>` - restrict the guild scope
|
||||
- `--dm` - browse local direct messages under the synthetic `@me` guild
|
||||
- `--channel <id|name|#name>` - restrict to one channel or DM conversation
|
||||
- `--author <id|name>` - restrict to one author
|
||||
- `--limit <n>` - newest rows to load (default 200)
|
||||
- `--include-empty` - include rows with no displayable/searchable content
|
||||
- `--json` - print crawlkit browser rows as JSON instead of opening the TUI
|
||||
|
||||
## Notes
|
||||
|
||||
- `tui` is read-only.
|
||||
- without `--guild`, `--guilds`, or `--dm`, it uses `default_guild_id` when configured; otherwise it can browse all stored guild rows
|
||||
- `--dm` only shows messages imported from the local Discord Desktop cache by [`wiretap`](wiretap.html)
|
||||
- `--json` is useful for launchers and agents that want the same row shape without an interactive terminal
|
||||
|
||||
## See also
|
||||
|
||||
- [`messages`](messages.html)
|
||||
- [`dms`](dms.html)
|
||||
- [`wiretap`](wiretap.html)
|
||||
@ -1,36 +0,0 @@
|
||||
# `update`
|
||||
|
||||
Forces a Git snapshot pull and import.
|
||||
|
||||
Routine imports are delta-planned from crawlkit shard fingerprints, with a Git-object fallback for older manifests. After a typical publish, only the changed tail shards are imported; unsafe table changes fall back to a full import.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl update
|
||||
discrawl update --repo ~/.discrawl/share --remote https://github.com/example/discord-archive.git
|
||||
discrawl update --with-embeddings
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
- `--repo <path>` - local snapshot repo path (defaults to `[share].repo_path`)
|
||||
- `--remote <url>` - target Git remote (defaults to `[share].remote`)
|
||||
- `--branch <name>` - snapshot branch (defaults to `[share].branch`)
|
||||
- `--with-embeddings` - also import vectors that match your local `[search.embeddings]` identity
|
||||
|
||||
## When to use it
|
||||
|
||||
- you have `share.remote` configured and want a fresh shard-delta import before running a command that does not auto-update (`sync` does not auto-import unless `--update=auto` is passed)
|
||||
- you set `--no-auto-update` when subscribing and want to refresh on demand
|
||||
- a CI job already imported the latest snapshot but read commands still consider it stale
|
||||
|
||||
## How `sync` interacts
|
||||
|
||||
`discrawl sync` does **not** auto-import the share unless `--update=auto` (only when stale) or `--update=force` (always) is passed. Routine live refreshes stay fast; explicit imports happen via `update`.
|
||||
|
||||
## See also
|
||||
|
||||
- [Git snapshots guide](../guides/git-snapshots.html)
|
||||
- [`subscribe`](subscribe.html)
|
||||
- [`sync`](sync.html)
|
||||
@ -1,47 +0,0 @@
|
||||
# `wiretap`
|
||||
|
||||
Imports classifiable Discord Desktop message payloads into the same local SQLite archive.
|
||||
|
||||
This is the path for searchable DMs because bot tokens cannot read personal direct messages.
|
||||
|
||||
`wiretap` is also available through `discrawl sync --source wiretap` and is included in the default `discrawl sync --source both` path.
|
||||
|
||||
## Usage
|
||||
|
||||
```bash
|
||||
discrawl wiretap
|
||||
discrawl wiretap --path "$HOME/Library/Application Support/discord"
|
||||
discrawl wiretap --dry-run
|
||||
discrawl wiretap --full-cache
|
||||
discrawl wiretap --watch-every 2m
|
||||
```
|
||||
|
||||
## Flags
|
||||
|
||||
- `--path <dir>` - override the desktop data directory (default: platform-specific Discord cache path)
|
||||
- `--dry-run` - report what would be imported without writing anything
|
||||
- `--full-cache` - exhaustive Chromium HTTP cache import for historical guild-cache archaeology (slower)
|
||||
- `--watch-every <duration>` - keep importing on a periodic loop
|
||||
- `--max-file-bytes <n>` - skip unusually large files (default 64 MiB)
|
||||
|
||||
## Notes
|
||||
|
||||
- stores classifiable cache messages in the same `guilds`, `channels`, and `messages` tables used by bot sync
|
||||
- stores proven DMs under the synthetic guild id `@me`
|
||||
- `@me` rows stay local-only: they are never exported by `publish` and never appear in Git snapshot imports or embedding snapshots
|
||||
- preserves existing local `@me` rows when importing a Git snapshot
|
||||
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs; dropped rows are counted as `skipped_messages`
|
||||
- imports what Discord Desktop has cached locally, not complete live DM history
|
||||
- scans local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON, plus route-bearing Chromium HTTP cache entries by default
|
||||
- does not extract, store, or print Discord auth tokens
|
||||
|
||||
## Default desktop paths
|
||||
|
||||
- macOS: `~/Library/Application Support/discord`
|
||||
- Linux: `~/.config/discord`
|
||||
|
||||
## See also
|
||||
|
||||
- [Wiretap guide](../guides/wiretap.html)
|
||||
- [`dms`](dms.html)
|
||||
- [`sync`](sync.html)
|
||||
@ -1,77 +0,0 @@
|
||||
# Configuration
|
||||
|
||||
`discrawl init` writes a complete config so most users do not hand-edit anything initially. This page documents the full shape and override rules for when you do.
|
||||
|
||||
## File layout
|
||||
|
||||
```toml
|
||||
version = 1
|
||||
default_guild_id = ""
|
||||
guild_ids = []
|
||||
db_path = "~/.discrawl/discrawl.db"
|
||||
cache_dir = "~/.discrawl/cache"
|
||||
log_dir = "~/.discrawl/logs"
|
||||
|
||||
[discord]
|
||||
token_source = "env" # use "none" for Git-only read access
|
||||
token_env = "DISCORD_BOT_TOKEN"
|
||||
token_keyring_service = "discrawl"
|
||||
token_keyring_account = "discord_bot_token"
|
||||
|
||||
[sync]
|
||||
source = "both" # "discord" for bot-only sync, "wiretap" for desktop-cache-only import
|
||||
concurrency = 16
|
||||
repair_every = "6h"
|
||||
full_history = true
|
||||
attachment_text = true
|
||||
|
||||
[desktop]
|
||||
path = "~/.config/discord" # macOS default: "~/Library/Application Support/discord"
|
||||
max_file_bytes = 67108864
|
||||
full_cache = false
|
||||
|
||||
[search]
|
||||
default_mode = "fts"
|
||||
|
||||
[search.embeddings]
|
||||
enabled = false
|
||||
provider = "openai"
|
||||
model = "text-embedding-3-small"
|
||||
api_key_env = "OPENAI_API_KEY"
|
||||
batch_size = 64
|
||||
|
||||
[share]
|
||||
remote = ""
|
||||
repo_path = "~/.discrawl/share"
|
||||
branch = "main"
|
||||
auto_update = true
|
||||
stale_after = "15m"
|
||||
```
|
||||
|
||||
`concurrency` is auto-sized at `init` to `min(32, max(8, GOMAXPROCS*2))`.
|
||||
|
||||
## Token resolution
|
||||
|
||||
In order:
|
||||
|
||||
1. `DISCORD_BOT_TOKEN`, or the env var named in `discord.token_env`
|
||||
2. OS keyring item `discrawl` / `discord_bot_token`, or the configured keyring service/account
|
||||
|
||||
`discrawl` accepts either raw token text or a value prefixed with `Bot `. Normalization is automatic.
|
||||
|
||||
Set `discord.token_source = "keyring"` if you want to require keyring lookup and skip env entirely. Set it to `"none"` for a Git-only reader.
|
||||
|
||||
## Override rules
|
||||
|
||||
- `--config <path>` beats everything
|
||||
- `DISCRAWL_CONFIG=<path>` overrides the default config path
|
||||
- `discord.token_source = "none"` disables live Discord access for Git-only readers
|
||||
- `discord.token_source = "keyring"` skips env lookup
|
||||
- `DISCRAWL_NO_AUTO_UPDATE=1` disables Git snapshot auto-update for read commands in one process
|
||||
|
||||
## Notes
|
||||
|
||||
- `default_guild_id` is the implicit scope for `sync`, `tail`, `digest`, and `analytics` when `--guild` is not passed
|
||||
- `guild_ids` is reserved for explicit multi-guild fan-out; usually you do not set this directly
|
||||
- changing `[search.embeddings]` provider/model/input version retargets pending jobs and resets prior attempts; existing vectors for another identity remain in SQLite but are not used for semantic search
|
||||
- changing `db_path` does not migrate existing data; copy the file yourself if you want to keep history
|
||||
@ -1,6 +0,0 @@
|
||||
# Contact
|
||||
|
||||
Discord archive search and analysis tooling.
|
||||
|
||||
- Source: [github.com/openclaw/discrawl](https://github.com/openclaw/discrawl)
|
||||
- Issues: [github.com/openclaw/discrawl/issues](https://github.com/openclaw/discrawl/issues)
|
||||
@ -1,51 +0,0 @@
|
||||
# Data layout
|
||||
|
||||
Everything lives in one local SQLite file. Default path: `~/.discrawl/discrawl.db`.
|
||||
|
||||
## What is stored
|
||||
|
||||
- guild metadata
|
||||
- channels and threads in one table (Discord models threads as channels)
|
||||
- current member snapshot
|
||||
- canonical message rows
|
||||
- append-only message event records
|
||||
- FTS5 index rows
|
||||
- optional local embedding queue metadata and vectors
|
||||
|
||||
Messages imported from Discord Desktop use the same message, attachment, mention, and FTS paths as bot-synced messages.
|
||||
|
||||
## DMs
|
||||
|
||||
Proven DMs use the synthetic guild id `@me`. Unclassifiable desktop-cache payloads are skipped instead of being stored as unknown synthetic data.
|
||||
|
||||
## Attachments
|
||||
|
||||
Attachment binaries are not stored in SQLite. Only attachment metadata, filenames, and (optionally) extracted text are stored.
|
||||
|
||||
Set `sync.attachment_text = false` if you want to keep attachment metadata and filenames but disable attachment body fetches for text indexing.
|
||||
|
||||
## Multi-guild ready
|
||||
|
||||
The schema is multi-guild ready even when the common UX stays single-guild simple. Threads are stored as channels because that matches the Discord model. Archived threads are part of the sync surface.
|
||||
|
||||
## Schema migrations
|
||||
|
||||
SQLite schema migrations are versioned with `PRAGMA user_version`. Startup fails fast when the local DB schema is newer than the binary supports - that means your binary is older than the database.
|
||||
|
||||
## Querying directly
|
||||
|
||||
Query anything you want with read-only SQL:
|
||||
|
||||
```bash
|
||||
discrawl sql 'select count(*) as messages from messages'
|
||||
echo 'select guild_id, count(*) from messages group by guild_id' | discrawl sql -
|
||||
```
|
||||
|
||||
See [`sql`](../commands/sql.html).
|
||||
|
||||
## See also
|
||||
|
||||
- [`status`](../commands/status.html) - high-level archive status
|
||||
- [`channels`](../commands/channels.html) - channel directory
|
||||
- [`members`](../commands/members.html) - member directory
|
||||
- [Security](../security.html)
|
||||
@ -1,68 +0,0 @@
|
||||
# Embeddings
|
||||
|
||||
Embeddings are optional. FTS is the default search path and the primary verification target. Embeddings enrich recall in background batches; they do not block the hot sync path.
|
||||
|
||||
## Quick path
|
||||
|
||||
```bash
|
||||
export OPENAI_API_KEY="..."
|
||||
discrawl init --with-embeddings
|
||||
discrawl sync --with-embeddings
|
||||
discrawl embed --limit 1000
|
||||
discrawl search --mode semantic "launch checklist"
|
||||
discrawl search --mode hybrid "launch checklist"
|
||||
```
|
||||
|
||||
## Two-phase pipeline
|
||||
|
||||
1. **Queue** - `sync --with-embeddings` writes `embedding_jobs` rows for new messages, changed normalized text, and messages without an existing job. The embedding provider is **not** called in this phase.
|
||||
2. **Drain** - `discrawl embed` claims pending jobs with a short lock so overlapping runs do not process the same batch. It calls the configured provider, writes vectors to `message_embeddings` with provider, model, input version, dimensions, and binary vector data.
|
||||
|
||||
Behavior during drain:
|
||||
|
||||
- rate limits requeue the batch and stop that drain run cleanly
|
||||
- provider or validation failures retry up to three attempts before marking the job failed
|
||||
- messages with no normalized text are marked done and any stale vector for that message is removed
|
||||
|
||||
## Identity (provider, model, input version)
|
||||
|
||||
Stored on each job and vector. If you change provider or model:
|
||||
|
||||
- pending jobs are retargeted to the new identity
|
||||
- prior attempts are reset
|
||||
- existing vectors for another identity remain in SQLite but are not used for semantic search
|
||||
|
||||
Use `--rebuild` when you want to regenerate vectors for the existing archive after a config change:
|
||||
|
||||
```bash
|
||||
discrawl embed --rebuild --limit 1000
|
||||
```
|
||||
|
||||
## Local provider example
|
||||
|
||||
```toml
|
||||
[search.embeddings]
|
||||
enabled = true
|
||||
provider = "ollama"
|
||||
model = "nomic-embed-text"
|
||||
```
|
||||
|
||||
With local providers, message and query embedding both happen on the same machine. With remote providers, message text is sent during `discrawl embed`, and search query text is sent during `--mode semantic` or `--mode hybrid` calls.
|
||||
|
||||
## Git snapshot interaction
|
||||
|
||||
By default, `publish` does not export embeddings. Use `--with-embeddings`:
|
||||
|
||||
```bash
|
||||
discrawl publish --with-embeddings --push
|
||||
discrawl subscribe --with-embeddings https://github.com/example/discord-archive.git
|
||||
discrawl update --with-embeddings
|
||||
```
|
||||
|
||||
The snapshot stores vectors under `embeddings/<provider>/<model>/<input_version>/...` and records that identity in `manifest.json`. Only vectors for non-DM messages are exported. Import only restores matching embedding manifests, so an Ollama/nomic subscriber does not accidentally import OpenAI/text-embedding vectors. `embedding_jobs` is never exported; subscribers that want fresh local vectors run `discrawl embed --rebuild`. Publishing without `--with-embeddings` omits embedding manifests instead of carrying forward an older bundle.
|
||||
|
||||
## See also
|
||||
|
||||
- [Search modes](search-modes.html)
|
||||
- [`embed`](../commands/embed.html)
|
||||
- [Configuration](../configuration.html)
|
||||
@ -1,84 +0,0 @@
|
||||
# Git-backed snapshots
|
||||
|
||||
Discrawl can publish the SQLite archive as sharded, compressed NDJSON snapshots in a private Git repo, then auto-import that repo before local read commands. This gives readers org memory without Discord credentials.
|
||||
|
||||
Snapshot packing/import and git mirror mechanics are shared through
|
||||
`crawlkit`. Discrawl still owns Discord-specific privacy policy: `@me` direct
|
||||
messages, wiretap sync state, and local-only desktop rows are excluded from
|
||||
published snapshots and are preserved locally on import.
|
||||
|
||||
## Publisher
|
||||
|
||||
```bash
|
||||
discrawl publish --remote https://github.com/example/discord-archive.git --push
|
||||
discrawl publish --readme path/to/discord-backup/README.md --push
|
||||
```
|
||||
|
||||
The publisher uses your existing bot-synced archive. It exports non-DM tables only.
|
||||
|
||||
## Subscriber
|
||||
|
||||
```bash
|
||||
discrawl subscribe https://github.com/example/discord-archive.git
|
||||
discrawl search "launch checklist"
|
||||
discrawl messages --channel general --hours 24
|
||||
```
|
||||
|
||||
`subscribe` is the Git-only setup path. It writes a config with `discord.token_source = "none"`, imports the snapshot, and does not require a Discord bot token. `sync` and `tail` remain disabled in this mode because they need live Discord access.
|
||||
|
||||
## Auto-update
|
||||
|
||||
Once `share.remote` is configured, read commands auto-fetch and import when the local share import is older than `share.stale_after` (default `15m`):
|
||||
|
||||
```bash
|
||||
discrawl subscribe --stale-after 15m https://github.com/example/discord-archive.git
|
||||
discrawl subscribe --no-auto-update https://github.com/example/discord-archive.git
|
||||
```
|
||||
|
||||
`discrawl update` forces the same pull/import step manually. Snapshot imports are delta-planned from crawlkit shard fingerprints. Older manifests without those fields fall back to Git blob identity, so the common publish shape only imports the changed message tail shard plus small cursor tables. Unsafe table-shape changes still fall back to a full import.
|
||||
|
||||
`discrawl sync` does **not** auto-import the share unless `--update=auto` or `--update=force` is provided, so routine live refreshes stay fast.
|
||||
|
||||
## Hybrid mode
|
||||
|
||||
Keep normal Discord credentials configured **and** set `share.remote`:
|
||||
|
||||
```bash
|
||||
discrawl sync --update=auto # import snapshot delta first, then live deltas
|
||||
discrawl messages --sync # blocking pre-query sync for matched scope
|
||||
discrawl sync --all-channels # broader live repair
|
||||
discrawl sync --full # historical backfill
|
||||
```
|
||||
|
||||
## What is published
|
||||
|
||||
- non-DM archive tables (DM `@me` rows are always excluded)
|
||||
- README activity block - latest update time, latest archived message, archive totals, day/week/month activity
|
||||
- `embedding_jobs` is never exported
|
||||
|
||||
## Backing up vectors
|
||||
|
||||
```bash
|
||||
discrawl publish --with-embeddings --push
|
||||
discrawl subscribe --with-embeddings https://github.com/example/discord-archive.git
|
||||
discrawl update --with-embeddings
|
||||
```
|
||||
|
||||
Vectors are stored under `embeddings/<provider>/<model>/<input_version>/...`. Import only restores matching identities; Ollama/nomic subscribers do not accidentally pick up OpenAI/text-embedding vectors. Publishing without `--with-embeddings` omits embedding manifests instead of carrying forward an older bundle.
|
||||
|
||||
## CI
|
||||
|
||||
The Docker smoke test installs `discrawl` in a clean Go container, subscribes to a Git snapshot repo, then checks `search`, `messages`, `sql`, and `report`:
|
||||
|
||||
```bash
|
||||
DISCRAWL_DOCKER_TEST=1 go test ./internal/cli -run TestDockerGitSourceSmoke -count=1
|
||||
```
|
||||
|
||||
The backup workflows restore and save `.discrawl-ci/discrawl.db` with `actions/cache`. On a warm runner cache, scheduled publishers skip the pre-sync snapshot import and go straight to the live latest-message delta before publishing. Cache misses still import the latest published snapshot first so `--latest-only` has channel cursors to resume from.
|
||||
|
||||
## See also
|
||||
|
||||
- [`publish`](../commands/publish.html)
|
||||
- [`subscribe`](../commands/subscribe.html)
|
||||
- [`update`](../commands/update.html)
|
||||
- [`report`](../commands/report.html)
|
||||
@ -1,57 +0,0 @@
|
||||
# Search modes
|
||||
|
||||
`discrawl search` has three modes. FTS is the default and works with no embeddings.
|
||||
|
||||
## Modes
|
||||
|
||||
- **`fts`** (default) - searches the local SQLite FTS5 index, returns newest matching messages first
|
||||
- **`semantic`** - embeds the query, scores against locally stored message vectors; errors out cleanly if embeddings are disabled or no compatible vectors exist
|
||||
- **`hybrid`** - runs FTS and semantic, deduplicates by message id, falls back to FTS when semantic is unavailable
|
||||
|
||||
## FTS details
|
||||
|
||||
- backed by SQLite FTS5 with the default `unicode61` tokenizer
|
||||
- user query terms are parameterized and quoted before `MATCH`, so tokens like `AND`, `OR`, `NOT`, `NEAR`, and `*` are searched as input terms instead of FTS operators
|
||||
- punctuation still follows FTS5 tokenization rules
|
||||
- by default, `search` skips rows with no searchable content (attachment text, attachment filenames, embeds, and replies still count as content); use `--include-empty` to opt back in
|
||||
|
||||
## Semantic and hybrid prerequisites
|
||||
|
||||
- `[search.embeddings]` configured in `~/.discrawl/config.toml`
|
||||
- local `message_embeddings` rows for the configured provider, model, and input version
|
||||
- input version is currently `message_normalized_v1`, so vectors are tied to normalized message text rather than raw Discord payloads
|
||||
|
||||
Two-phase embedding creation:
|
||||
|
||||
1. `discrawl sync --with-embeddings` queues changed messages by writing `embedding_jobs` rows. New messages, changed normalized text, and messages without an existing job are queued. This phase does not call the embedding provider.
|
||||
2. `discrawl embed` drains pending jobs in bounded batches, calls the configured provider, and writes vectors to `message_embeddings`.
|
||||
|
||||
## Provider/model identity
|
||||
|
||||
The provider/model/input-version identity is stored on each job and vector. If you change provider or model, pending jobs are retargeted to the new identity and prior attempts are reset. Existing vectors for another identity remain in SQLite, but semantic search only reads vectors compatible with the current config.
|
||||
|
||||
Use `--rebuild` when changing provider, model, or input settings and you want to regenerate vectors for the existing archive.
|
||||
|
||||
## Local vs remote providers
|
||||
|
||||
Local providers like Ollama keep both message and query embedding on the same machine. With remote providers (OpenAI, etc.), message text is sent during `discrawl embed`, and search query text is sent when using `--mode semantic` or `--mode hybrid`. Stored message text is not sent during local vector scoring.
|
||||
|
||||
## Examples
|
||||
|
||||
```bash
|
||||
discrawl search "panic: nil pointer"
|
||||
discrawl search --mode fts "panic: nil pointer"
|
||||
discrawl search --mode semantic "missing launch checklist"
|
||||
discrawl search --mode hybrid "database timeout"
|
||||
discrawl search --guild 123456789012345678 "payment failed"
|
||||
discrawl search --dm "launch checklist"
|
||||
discrawl search --channel billing --author steipete --limit 50 "invoice"
|
||||
discrawl search --include-empty "GitHub"
|
||||
discrawl --json search "websocket closed"
|
||||
```
|
||||
|
||||
## See also
|
||||
|
||||
- [`search`](../commands/search.html)
|
||||
- [`embed`](../commands/embed.html)
|
||||
- [Embeddings](embeddings.html)
|
||||
@ -1,57 +0,0 @@
|
||||
# Sync sources
|
||||
|
||||
Discrawl fills the local archive from two sources: the Discord bot API and the local Discord Desktop cache. Either or both can run in a single `sync`.
|
||||
|
||||
## Sources
|
||||
|
||||
| Source | Reads from | Stores |
|
||||
| --- | --- | --- |
|
||||
| `both` | Discord bot API and local Discord Desktop cache | bot-visible guild data plus classifiable cached desktop messages |
|
||||
| `discord` / `key` / `bot` / `api` | Discord bot API | guilds, channels, threads, members, and messages the bot can access |
|
||||
| `wiretap` / `desktop` / `cache` | local Discord Desktop cache files | classifiable cached messages; proven DMs are stored under `@me` |
|
||||
|
||||
The default is `both`. Pick one with `--source` or by setting `[sync].source` in config.
|
||||
|
||||
## Bot sync modes
|
||||
|
||||
Sync modes control the Discord bot API side of a run. When the run also includes the `wiretap` source (as the default `both` does), the desktop cache import runs once alongside the chosen bot sync mode.
|
||||
|
||||
| Command | Use when | Behavior |
|
||||
| --- | --- | --- |
|
||||
| `discrawl sync` | routine refresh | skips member refreshes, checks live top-level channels plus active threads, only fetches new messages for channels with a stored latest cursor |
|
||||
| `discrawl sync --update=auto` | hybrid Git/live refresh | imports a stale Git snapshot first, usually as a changed-shard delta, then runs the routine live refresh |
|
||||
| `discrawl sync --all-channels` | repair pass | broad incremental sweep across every stored channel/thread, including archived threads |
|
||||
| `discrawl sync --full` | historical backfill | crawls older history until channels are complete; can take a long time on large servers |
|
||||
|
||||
Run one explicit `--full` pass when you want a complete historical guild archive. Use plain `sync` afterward for frequent latest-message and desktop-cache refreshes.
|
||||
|
||||
## Concurrency
|
||||
|
||||
`sync` already uses parallel channel workers for bot API message crawling. The default is auto-sized from `GOMAXPROCS` with a floor of `8` and a cap of `32`. Override with `--concurrency`.
|
||||
|
||||
## Targeting
|
||||
|
||||
- `--guild <id>` runs only that guild
|
||||
- `--guilds 123,456` runs an explicit set
|
||||
- `--all` ignores `default_guild_id` and fans out across every discovered guild
|
||||
- `--channels 111,222` targets specific channels (forum ids expand to their threads)
|
||||
- `--since <RFC3339>` limits initial history and `--full` backfill to messages at or after the timestamp; older history is not marked complete, so a later `sync --full` without `--since` can continue the backfill
|
||||
|
||||
## Performance and resilience
|
||||
|
||||
- Long runs emit periodic progress logs to stderr.
|
||||
- If in-flight channels stop completing for a while, `discrawl` emits `message sync waiting` heartbeat logs with the oldest active channel, per-channel page activity, and skip/defer counters.
|
||||
- Every run ends with a `message sync finished` summary.
|
||||
- Each channel crawl has a bounded runtime budget; pathological channels are deferred and retried on the next sync.
|
||||
- Retryable failures and unavailable-channel markers are tracked per channel; stale unavailable markers are cleared after a later successful crawl.
|
||||
- Marker cleanup is best-effort, so one missing local sync-state row cannot crash the run.
|
||||
- Full sync member refresh is best-effort and gives up after five minutes without a caller-supplied deadline, so message sync completion is not held hostage by a slow guild member crawl.
|
||||
- When the archive is already complete, `sync --full` reuses backlog markers and limits steady-state refresh to live top-level channels plus active threads instead of revisiting every stored archived thread.
|
||||
- If a guild already has a local member snapshot, routine syncs reuse it and skip another full member crawl until that snapshot ages out.
|
||||
|
||||
## See also
|
||||
|
||||
- [`sync`](../commands/sync.html)
|
||||
- [`tail`](../commands/tail.html)
|
||||
- [Wiretap](wiretap.html)
|
||||
- [Git snapshots](git-snapshots.html)
|
||||
@ -1,61 +0,0 @@
|
||||
# Desktop wiretap
|
||||
|
||||
`wiretap` imports classifiable Discord Desktop message payloads into the same local SQLite archive used by bot sync. It is the path for searchable DMs because bot tokens cannot read personal direct messages.
|
||||
|
||||
`wiretap` is also available through `discrawl sync --source wiretap` and is included in the default `discrawl sync --source both` path.
|
||||
|
||||
## What it does
|
||||
|
||||
- stores classifiable cache messages in the same `guilds`, `channels`, and `messages` tables used by bot sync
|
||||
- stores proven DMs under the synthetic guild id `@me`
|
||||
- preserves existing local `@me` guilds, channels, messages, and attachments when importing a Git snapshot, so a shared guild mirror refresh does not wipe local wiretap DM search
|
||||
- drops message payloads whose channel cannot be classified from cached channel metadata or Discord route URLs; dropped rows are counted as `skipped_messages`
|
||||
- imports what Discord Desktop has cached locally - not complete live DM history
|
||||
|
||||
## What it does not do
|
||||
|
||||
- does not extract, store, or print Discord auth tokens
|
||||
- does not use a user token
|
||||
- does not call the Discord API as your user
|
||||
- does not run as a selfbot
|
||||
|
||||
## DM privacy: `@me` stays local
|
||||
|
||||
`@me` rows are local-only. They are excluded from:
|
||||
|
||||
- `publish` (Git snapshot output)
|
||||
- `subscribe` / Git snapshot import
|
||||
- `--with-embeddings` snapshot export
|
||||
|
||||
Excluded categories: DM guilds, channels, messages, events, attachments, mentions, wiretap sync state, and vectors for DM messages.
|
||||
|
||||
## What gets scanned
|
||||
|
||||
- local `.ldb`, `.log`, `.json`, and `.txt` artifacts for Discord message JSON
|
||||
- route-bearing Chromium HTTP cache entries by default
|
||||
- `--full-cache` (or `desktop.full_cache = true`) enables exhaustive Chromium cache import for slower historical guild-cache archaeology
|
||||
- `--max-file-bytes` skips unusually large files (default 64 MiB)
|
||||
|
||||
## Flags
|
||||
|
||||
```bash
|
||||
discrawl wiretap
|
||||
discrawl wiretap --path "$HOME/Library/Application Support/discord"
|
||||
discrawl wiretap --dry-run
|
||||
discrawl wiretap --full-cache
|
||||
discrawl wiretap --watch-every 2m
|
||||
```
|
||||
|
||||
`--watch-every` keeps the import running on a periodic loop. `--dry-run` reports what would be imported without writing anything.
|
||||
|
||||
## Default desktop paths
|
||||
|
||||
- macOS: `~/Library/Application Support/discord`
|
||||
- Linux: `~/.config/discord`
|
||||
- override via `--path` or `[desktop].path`
|
||||
|
||||
## See also
|
||||
|
||||
- [`wiretap`](../commands/wiretap.html)
|
||||
- [`dms`](../commands/dms.html) - convenience layer over `@me`
|
||||
- [Sync sources](sync-sources.html)
|
||||
@ -1,13 +0,0 @@
|
||||
<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<meta http-equiv="refresh" content="0; url=README.html">
|
||||
<link rel="canonical" href="README.html">
|
||||
<title>Discrawl docs</title>
|
||||
</head>
|
||||
<body>
|
||||
<p><a href="README.html">Discrawl docs</a></p>
|
||||
</body>
|
||||
</html>
|
||||
@ -1,66 +0,0 @@
|
||||
# Install
|
||||
|
||||
Discrawl is a single Go binary. Install via Homebrew or build from source.
|
||||
|
||||
## Homebrew
|
||||
|
||||
```bash
|
||||
brew install steipete/tap/discrawl
|
||||
discrawl --version
|
||||
```
|
||||
|
||||
Homebrew adds the `steipete/tap` tap automatically when you install with the fully qualified formula name.
|
||||
|
||||
## From source
|
||||
|
||||
Requires Go `1.26+`.
|
||||
|
||||
```bash
|
||||
git clone https://github.com/openclaw/discrawl.git
|
||||
cd discrawl
|
||||
go build -o bin/discrawl ./cmd/discrawl
|
||||
./bin/discrawl --version
|
||||
```
|
||||
|
||||
If you do not put `discrawl` on `PATH`, replace `discrawl` with `./bin/discrawl` in any example below.
|
||||
|
||||
## Quick start (with bot token)
|
||||
|
||||
```bash
|
||||
export DISCORD_BOT_TOKEN="your-bot-token"
|
||||
discrawl init
|
||||
discrawl doctor
|
||||
discrawl sync --full
|
||||
discrawl sync
|
||||
discrawl search "panic: nil pointer"
|
||||
discrawl tail
|
||||
```
|
||||
|
||||
`init` discovers accessible guilds and writes `~/.discrawl/config.toml`. If exactly one guild is available, it becomes the default automatically.
|
||||
|
||||
`doctor` verifies the config loads, the token resolves, the bot can reach the Gateway, and the local DB and FTS index are wired up.
|
||||
|
||||
## Quick start (Git-only reader)
|
||||
|
||||
No Discord credentials required. You read a private Git snapshot that another machine published.
|
||||
|
||||
```bash
|
||||
discrawl subscribe https://github.com/example/discord-archive.git
|
||||
discrawl search "launch checklist"
|
||||
discrawl messages --channel general --hours 24
|
||||
```
|
||||
|
||||
`subscribe` writes a token-free config (`discord.token_source = "none"`) and imports the snapshot. Read commands auto-refresh when the local snapshot is older than `15m`.
|
||||
|
||||
## Default runtime paths
|
||||
|
||||
- config: `~/.discrawl/config.toml`
|
||||
- database: `~/.discrawl/discrawl.db`
|
||||
- cache: `~/.discrawl/cache/`
|
||||
- logs: `~/.discrawl/logs/`
|
||||
|
||||
## Next steps
|
||||
|
||||
- [Bot setup](bot-setup.html) - intents, permissions, token sources
|
||||
- [Configuration](configuration.html) - the full TOML shape and override rules
|
||||
- [`sync`](commands/sync.html) - the main archive command
|
||||
@ -1,49 +0,0 @@
|
||||
# Security
|
||||
|
||||
## Tokens and credentials
|
||||
|
||||
- Do not commit bot tokens or API keys.
|
||||
- Default config lives in your home directory, not inside the repo.
|
||||
- Prefer env vars or the OS keyring for bot tokens.
|
||||
- `discrawl doctor` reports the token source (env or keyring), not token contents.
|
||||
|
||||
## Wiretap is local-only
|
||||
|
||||
`wiretap` reads local Discord Desktop cache files only. It does not:
|
||||
|
||||
- extract, store, or print Discord auth tokens
|
||||
- use a user token
|
||||
- call the Discord API as your user
|
||||
- run as a selfbot
|
||||
|
||||
Wiretap DM messages stay local. They are stored under the synthetic guild id `@me` and are excluded from:
|
||||
|
||||
- `publish` (Git snapshot output)
|
||||
- `subscribe` / Git snapshot import
|
||||
- the optional `--with-embeddings` snapshot export
|
||||
|
||||
A shared guild mirror refresh does not wipe local wiretap DM search either - import preserves existing local `@me` guilds, channels, messages, and attachments.
|
||||
|
||||
## CI
|
||||
|
||||
CI runs secret scanning with `gitleaks` against git history and the working tree.
|
||||
|
||||
## What is stored locally
|
||||
|
||||
- guild metadata
|
||||
- channels and threads (one table)
|
||||
- current member snapshot
|
||||
- canonical message rows
|
||||
- append-only message event records
|
||||
- FTS index rows
|
||||
- optional local embedding queue metadata and vectors
|
||||
|
||||
Attachment binaries are not stored in SQLite; only attachment metadata and (optionally) extracted text are stored.
|
||||
|
||||
Set `sync.attachment_text = false` if you want to keep attachment metadata and filenames but disable attachment body fetches for text indexing.
|
||||
|
||||
## What is sent over the wire
|
||||
|
||||
With remote embedding providers, message text is sent during `discrawl embed`, and search query text is sent when using `--mode semantic` or `--mode hybrid`. Stored message text is not sent during local vector scoring.
|
||||
|
||||
Local providers like Ollama keep both message and query embedding on the same machine.
|
||||
Binary file not shown.
|
Before Width: | Height: | Size: 146 KiB |
@ -1,79 +0,0 @@
|
||||
<svg xmlns="http://www.w3.org/2000/svg" width="1200" height="630" viewBox="0 0 1200 630" role="img" aria-labelledby="title desc">
|
||||
<title id="title">Discrawl social card</title>
|
||||
<desc id="desc">Discrawl mirrors Discord into SQLite for local search and analysis.</desc>
|
||||
<defs>
|
||||
<linearGradient id="bg" x1="0" y1="0" x2="1" y2="1">
|
||||
<stop offset="0" stop-color="#0b0f16"/>
|
||||
<stop offset="0.58" stop-color="#111723"/>
|
||||
<stop offset="1" stop-color="#151827"/>
|
||||
</linearGradient>
|
||||
<linearGradient id="accent" x1="0" y1="0" x2="1" y2="0">
|
||||
<stop offset="0" stop-color="#5fe3d4"/>
|
||||
<stop offset="0.56" stop-color="#a594ff"/>
|
||||
<stop offset="1" stop-color="#f364a2"/>
|
||||
</linearGradient>
|
||||
<linearGradient id="terminal" x1="0" y1="0" x2="0" y2="1">
|
||||
<stop offset="0" stop-color="#161d2a"/>
|
||||
<stop offset="1" stop-color="#0f141d"/>
|
||||
</linearGradient>
|
||||
<filter id="shadow" x="-10%" y="-15%" width="120%" height="130%">
|
||||
<feDropShadow dx="0" dy="22" stdDeviation="22" flood-color="#000000" flood-opacity="0.45"/>
|
||||
</filter>
|
||||
<filter id="softGlow" x="-40%" y="-40%" width="180%" height="180%">
|
||||
<feGaussianBlur stdDeviation="36"/>
|
||||
</filter>
|
||||
</defs>
|
||||
|
||||
<rect width="1200" height="630" fill="url(#bg)"/>
|
||||
<circle cx="1030" cy="92" r="210" fill="#5fe3d4" opacity="0.11" filter="url(#softGlow)"/>
|
||||
<circle cx="104" cy="568" r="240" fill="#f364a2" opacity="0.10" filter="url(#softGlow)"/>
|
||||
<path d="M0 515 C190 438 330 548 512 472 S874 330 1200 410 L1200 630 L0 630 Z" fill="#0a0d13" opacity="0.55"/>
|
||||
<path d="M0 534 C206 456 338 570 520 492 S884 360 1200 438" fill="none" stroke="url(#accent)" stroke-width="3" opacity="0.44"/>
|
||||
|
||||
<g transform="translate(72 70)">
|
||||
<rect x="0" y="0" width="112" height="112" rx="22" fill="#0c0f14" stroke="#253244" stroke-width="2"/>
|
||||
<rect x="23" y="28" width="66" height="47" rx="5" fill="none" stroke="#5fe3d4" stroke-width="4"/>
|
||||
<line x1="23" y1="43" x2="89" y2="43" stroke="#5fe3d4" stroke-width="3"/>
|
||||
<circle cx="33" cy="36" r="2.8" fill="#f364a2"/>
|
||||
<circle cx="43" cy="36" r="2.8" fill="#f7c177"/>
|
||||
<circle cx="53" cy="36" r="2.8" fill="#5fe3d4"/>
|
||||
<text x="31" y="59" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="10" font-weight="800" fill="#5fe3d4">SELECT</text>
|
||||
<text x="31" y="71" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="10" font-weight="800" fill="#aab3c1">msgs</text>
|
||||
<rect x="23" y="84" width="66" height="6" rx="3" fill="#161b24"/>
|
||||
<rect x="23" y="84" width="42" height="6" rx="3" fill="#5fe3d4"/>
|
||||
</g>
|
||||
|
||||
<text x="205" y="126" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="28" font-weight="800" letter-spacing="2" fill="#5fe3d4">discrawl.sh</text>
|
||||
<text x="72" y="248" font-family="Inter, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="96" font-weight="800" letter-spacing="-3" fill="#edf4fb">Discord history,</text>
|
||||
<text x="72" y="346" font-family="Inter, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="96" font-weight="800" letter-spacing="-3" fill="#edf4fb">local answers.</text>
|
||||
<text x="74" y="410" font-family="Inter, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="30" font-weight="560" fill="#aab3c1">Mirror Discord into SQLite.</text>
|
||||
<text x="74" y="450" font-family="Inter, -apple-system, BlinkMacSystemFont, Segoe UI, sans-serif" font-size="30" font-weight="560" fill="#aab3c1">Search, query, tail, and analyze locally.</text>
|
||||
|
||||
<g transform="translate(72 505)">
|
||||
<rect x="0" y="0" width="210" height="54" rx="10" fill="#5fe3d4"/>
|
||||
<text x="28" y="35" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="20" font-weight="900" fill="#081016">discrawl sync</text>
|
||||
<rect x="230" y="0" width="228" height="54" rx="10" fill="#151d29" stroke="#263448" stroke-width="2"/>
|
||||
<text x="258" y="35" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="20" font-weight="800" fill="#f364a2">discrawl search</text>
|
||||
</g>
|
||||
|
||||
<g transform="translate(742 135)" filter="url(#shadow)">
|
||||
<rect x="0" y="0" width="386" height="330" rx="20" fill="url(#terminal)" stroke="#263448" stroke-width="2"/>
|
||||
<rect x="0" y="0" width="386" height="54" rx="20" fill="#121925"/>
|
||||
<path d="M0 34 Q0 0 34 0 H352 Q386 0 386 34 V54 H0 Z" fill="#121925"/>
|
||||
<circle cx="30" cy="27" r="7" fill="#f364a2"/>
|
||||
<circle cx="54" cy="27" r="7" fill="#f7c177"/>
|
||||
<circle cx="78" cy="27" r="7" fill="#5fe3d4"/>
|
||||
<text x="112" y="34" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="16" font-weight="800" fill="#657287">sqlite archive</text>
|
||||
<text x="28" y="95" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="20" font-weight="800" fill="#5fe3d4">$ discrawl wiretap</text>
|
||||
<text x="28" y="132" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="650" fill="#6f7b8d">dm cache imported: 814</text>
|
||||
<text x="28" y="180" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="20" font-weight="800" fill="#5fe3d4">$ discrawl sql</text>
|
||||
<text x="28" y="218" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="650" fill="#edf4fb">312k messages</text>
|
||||
<text x="28" y="255" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="650" fill="#edf4fb">14k attachments</text>
|
||||
<text x="28" y="292" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="650" fill="#edf4fb">FTS5 ready</text>
|
||||
<rect x="286" y="260" width="72" height="10" rx="5" fill="#263448"/>
|
||||
<rect x="312" y="282" width="46" height="10" rx="5" fill="#263448"/>
|
||||
<rect x="298" y="304" width="60" height="10" rx="5" fill="#263448"/>
|
||||
</g>
|
||||
|
||||
<text x="72" y="600" font-family="JetBrains Mono, Menlo, Consolas, monospace" font-size="18" font-weight="800" fill="#657287">bot sync + desktop wiretap + FTS5 + semantic search</text>
|
||||
</svg>
|
||||
|
Before Width: | Height: | Size: 5.9 KiB |
31
go.mod
31
go.mod
@ -1,53 +1,30 @@
|
||||
module github.com/openclaw/discrawl
|
||||
module github.com/steipete/discrawl
|
||||
|
||||
go 1.26.3
|
||||
go 1.26.2
|
||||
|
||||
require (
|
||||
github.com/bwmarrin/discordgo v0.29.0
|
||||
github.com/gorilla/websocket v1.5.3
|
||||
github.com/pelletier/go-toml/v2 v2.3.0
|
||||
github.com/stretchr/testify v1.11.1
|
||||
github.com/zalando/go-keyring v0.2.8
|
||||
golang.org/x/sys v0.43.0
|
||||
golang.org/x/text v0.36.0
|
||||
modernc.org/sqlite v1.50.0
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/charmbracelet/bubbles v1.0.0 // indirect
|
||||
github.com/clipperhouse/displaywidth v0.11.0 // indirect
|
||||
github.com/clipperhouse/uax29/v2 v2.7.0 // indirect
|
||||
github.com/pelletier/go-toml/v2 v2.3.1 // indirect
|
||||
modernc.org/sqlite v1.50.0 // indirect
|
||||
)
|
||||
|
||||
require (
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 // indirect
|
||||
github.com/charmbracelet/bubbletea v1.3.10 // indirect
|
||||
github.com/charmbracelet/colorprofile v0.4.1 // indirect
|
||||
github.com/charmbracelet/lipgloss v1.1.0 // indirect
|
||||
github.com/charmbracelet/x/ansi v0.11.7 // indirect
|
||||
github.com/charmbracelet/x/cellbuf v0.0.15 // indirect
|
||||
github.com/charmbracelet/x/term v0.2.2 // indirect
|
||||
github.com/danieljoos/wincred v1.2.3 // indirect
|
||||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/dustin/go-humanize v1.0.1 // indirect
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f // indirect
|
||||
github.com/godbus/dbus/v5 v5.2.2 // indirect
|
||||
github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 // indirect
|
||||
github.com/google/uuid v1.6.0 // indirect
|
||||
github.com/kr/pretty v0.3.1 // indirect
|
||||
github.com/lucasb-eyer/go-colorful v1.4.0 // indirect
|
||||
github.com/mattn/go-isatty v0.0.22 // indirect
|
||||
github.com/mattn/go-localereader v0.0.1 // indirect
|
||||
github.com/mattn/go-runewidth v0.0.23 // indirect
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 // indirect
|
||||
github.com/muesli/cancelreader v0.2.2 // indirect
|
||||
github.com/muesli/termenv v0.16.0 // indirect
|
||||
github.com/ncruces/go-strftime v1.0.0 // indirect
|
||||
github.com/openclaw/crawlkit v0.5.0
|
||||
github.com/pmezard/go-difflib v1.0.0 // indirect
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect
|
||||
github.com/rivo/uniseg v0.4.7 // indirect
|
||||
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e // indirect
|
||||
golang.org/x/crypto v0.50.0 // indirect
|
||||
golang.org/x/tools v0.44.0 // indirect
|
||||
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
|
||||
|
||||
47
go.sum
47
go.sum
@ -1,25 +1,5 @@
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1 h1:HwpRHbFMcZLEVr42D4p7XBqjyuxQH5SMiErDT4WkJ2k=
|
||||
github.com/aymanbagabas/go-osc52/v2 v2.0.1/go.mod h1:uYgXzlJ7ZpABp8OJ+exZzJJhRNQ2ASbcXHWsFqH8hp8=
|
||||
github.com/bwmarrin/discordgo v0.29.0 h1:FmWeXFaKUwrcL3Cx65c20bTRW+vOb6k8AnaP+EgjDno=
|
||||
github.com/bwmarrin/discordgo v0.29.0/go.mod h1:NJZpH+1AfhIcyQsPeuBKsUtYrRnjkyu0kIVMCHkZtRY=
|
||||
github.com/charmbracelet/bubbles v1.0.0 h1:12J8/ak/uCZEMQ6KU7pcfwceyjLlWsDLAxB5fXonfvc=
|
||||
github.com/charmbracelet/bubbles v1.0.0/go.mod h1:9d/Zd5GdnauMI5ivUIVisuEm3ave1XwXtD1ckyV6r3E=
|
||||
github.com/charmbracelet/bubbletea v1.3.10 h1:otUDHWMMzQSB0Pkc87rm691KZ3SWa4KUlvF9nRvCICw=
|
||||
github.com/charmbracelet/bubbletea v1.3.10/go.mod h1:ORQfo0fk8U+po9VaNvnV95UPWA1BitP1E0N6xJPlHr4=
|
||||
github.com/charmbracelet/colorprofile v0.4.1 h1:a1lO03qTrSIRaK8c3JRxJDZOvhvIeSco3ej+ngLk1kk=
|
||||
github.com/charmbracelet/colorprofile v0.4.1/go.mod h1:U1d9Dljmdf9DLegaJ0nGZNJvoXAhayhmidOdcBwAvKk=
|
||||
github.com/charmbracelet/lipgloss v1.1.0 h1:vYXsiLHVkK7fp74RkV7b2kq9+zDLoEU4MZoFqR/noCY=
|
||||
github.com/charmbracelet/lipgloss v1.1.0/go.mod h1:/6Q8FR2o+kj8rz4Dq0zQc3vYf7X+B0binUUBwA0aL30=
|
||||
github.com/charmbracelet/x/ansi v0.11.7 h1:kzv1kJvjg2S3r9KHo8hDdHFQLEqn4RBCb39dAYC84jI=
|
||||
github.com/charmbracelet/x/ansi v0.11.7/go.mod h1:9qGpnAVYz+8ACONkZBUWPtL7lulP9No6p1epAihUZwQ=
|
||||
github.com/charmbracelet/x/cellbuf v0.0.15 h1:ur3pZy0o6z/R7EylET877CBxaiE1Sp1GMxoFPAIztPI=
|
||||
github.com/charmbracelet/x/cellbuf v0.0.15/go.mod h1:J1YVbR7MUuEGIFPCaaZ96KDl5NoS0DAWkskup+mOY+Q=
|
||||
github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
|
||||
github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
|
||||
github.com/clipperhouse/displaywidth v0.11.0 h1:lBc6kY44VFw+TDx4I8opi/EtL9m20WSEFgwIwO+UVM8=
|
||||
github.com/clipperhouse/displaywidth v0.11.0/go.mod h1:bkrFNkf81G8HyVqmKGxsPufD3JhNl3dSqnGhOoSD/o0=
|
||||
github.com/clipperhouse/uax29/v2 v2.7.0 h1:+gs4oBZ2gPfVrKPthwbMzWZDaAFPGYK72F0NJv2v7Vk=
|
||||
github.com/clipperhouse/uax29/v2 v2.7.0/go.mod h1:EFJ2TJMRUaplDxHKj1qAEhCtQPW2tJSwu5BF98AuoVM=
|
||||
github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E=
|
||||
github.com/danieljoos/wincred v1.2.3 h1:v7dZC2x32Ut3nEfRH+vhoZGvN72+dQ/snVXo/vMFLdQ=
|
||||
github.com/danieljoos/wincred v1.2.3/go.mod h1:6qqX0WNrS4RzPZ1tnroDzq9kY3fu1KwE7MRLQK4X0bs=
|
||||
@ -27,8 +7,6 @@ github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c
|
||||
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
|
||||
github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY=
|
||||
github.com/dustin/go-humanize v1.0.1/go.mod h1:Mu1zIs6XwVuF/gI1OepvI0qD18qycQx+mFykh5fBlto=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f h1:Y/CXytFA4m6baUTXGLOoWe4PQhGxaX0KpnayAqC48p4=
|
||||
github.com/erikgeiser/coninput v0.0.0-20211004153227-1c3628e74d0f/go.mod h1:vw97MGsxSvLiUE2X8qFplwetxpGLQrlU1Q9AUEIzCaM=
|
||||
github.com/godbus/dbus/v5 v5.2.2 h1:TUR3TgtSVDmjiXOgAAyaZbYmIeP3DPkld3jgKGV8mXQ=
|
||||
github.com/godbus/dbus/v5 v5.2.2/go.mod h1:3AAv2+hPq5rdnr5txxxRwiGjPXamgoIHgz9FPBfOp3c=
|
||||
github.com/google/pprof v0.0.0-20260402051712-545e8a4df936 h1:EwtI+Al+DeppwYX2oXJCETMO23COyaKGP6fHVpkpWpg=
|
||||
@ -47,55 +25,34 @@ github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
|
||||
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
|
||||
github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY=
|
||||
github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE=
|
||||
github.com/lucasb-eyer/go-colorful v1.4.0 h1:UtrWVfLdarDgc44HcS7pYloGHJUjHV/4FwW4TvVgFr4=
|
||||
github.com/lucasb-eyer/go-colorful v1.4.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0=
|
||||
github.com/mattn/go-isatty v0.0.22 h1:j8l17JJ9i6VGPUFUYoTUKPSgKe/83EYU2zBC7YNKMw4=
|
||||
github.com/mattn/go-isatty v0.0.22/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4=
|
||||
github.com/mattn/go-localereader v0.0.1 h1:ygSAOl7ZXTx4RdPYinUpg6W99U8jWvWi9Ye2JC/oIi4=
|
||||
github.com/mattn/go-localereader v0.0.1/go.mod h1:8fBrzywKY7BI3czFoHkuzRoWE9C+EiG4R1k4Cjx5p88=
|
||||
github.com/mattn/go-runewidth v0.0.23 h1:7ykA0T0jkPpzSvMS5i9uoNn2Xy3R383f9HDx3RybWcw=
|
||||
github.com/mattn/go-runewidth v0.0.23/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6 h1:ZK8zHtRHOkbHy6Mmr5D264iyp3TiX5OmNcI5cIARiQI=
|
||||
github.com/muesli/ansi v0.0.0-20230316100256-276c6243b2f6/go.mod h1:CJlz5H+gyd6CUWT45Oy4q24RdLyn7Md9Vj2/ldJBSIo=
|
||||
github.com/muesli/cancelreader v0.2.2 h1:3I4Kt4BQjOR54NavqnDogx/MIoWBFa0StPA8ELUXHmA=
|
||||
github.com/muesli/cancelreader v0.2.2/go.mod h1:3XuTXfFS2VjM+HTLZY9Ak0l6eUKfijIfMUZ4EgX0QYo=
|
||||
github.com/muesli/termenv v0.16.0 h1:S5AlUN9dENB57rsbnkPyfdGuWIlkmzJjbFf0Tf5FWUc=
|
||||
github.com/muesli/termenv v0.16.0/go.mod h1:ZRfOIKPFDYQoDFF4Olj7/QJbW60Ol/kL1pU3VfY/Cnk=
|
||||
github.com/ncruces/go-strftime v1.0.0 h1:HMFp8mLCTPp341M/ZnA4qaf7ZlsbTc+miZjCLOFAw7w=
|
||||
github.com/ncruces/go-strftime v1.0.0/go.mod h1:Fwc5htZGVVkseilnfgOVb9mKy6w1naJmn9CehxcKcls=
|
||||
github.com/openclaw/crawlkit v0.5.0 h1:sVqIbQ5v6LiOf+NXcVj93UhfoaJqMbBlrd1lU6uhO9M=
|
||||
github.com/openclaw/crawlkit v0.5.0/go.mod h1:/AI8o/DeRqXPZJPHq/9mGUjNzLPskm/wTjikRPxEdHY=
|
||||
github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc=
|
||||
github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
|
||||
github.com/pelletier/go-toml/v2 v2.3.0 h1:k59bC/lIZREW0/iVaQR8nDHxVq8OVlIzYCOJf421CaM=
|
||||
github.com/pelletier/go-toml/v2 v2.3.0/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY=
|
||||
github.com/pkg/diff v0.0.0-20210226163009-20ebb0f2a09e/go.mod h1:pJLUxLENpZxwdsKMEsNbx1VGcRFpLqf3715MtcvvzbA=
|
||||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
|
||||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE=
|
||||
github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo=
|
||||
github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ=
|
||||
github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
|
||||
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
|
||||
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
|
||||
github.com/stretchr/objx v0.5.2 h1:xuMeJ0Sdp5ZMRXx/aWO6RZxdr3beISkG5/G/aIRr3pY=
|
||||
github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA=
|
||||
github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U=
|
||||
github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U=
|
||||
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e h1:JVG44RsyaB9T2KIHavMF/ppJZNG9ZpyihvCd0w101no=
|
||||
github.com/xo/terminfo v0.0.0-20220910002029-abceb7e1c41e/go.mod h1:RbqR21r5mrJuqunuUZ/Dhy/avygyECGrLceyNeo4LiM=
|
||||
github.com/zalando/go-keyring v0.2.8 h1:6sD/Ucpl7jNq10rM2pgqTs0sZ9V3qMrqfIIy5YPccHs=
|
||||
github.com/zalando/go-keyring v0.2.8/go.mod h1:tsMo+VpRq5NGyKfxoBVjCuMrG47yj8cmakZDO5QGii0=
|
||||
golang.org/x/crypto v0.0.0-20210421170649-83a5a9bb288b/go.mod h1:T9bdIzuCu7OtxOm1hfPfRQxPLYneinmdGuTeoZ9dtd4=
|
||||
golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI=
|
||||
golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q=
|
||||
golang.org/x/exp v0.0.0-20231006140011-7918f672742d h1:jtJma62tbqLibJ5sFQz8bKtEM8rJBtfilJ2qTU199MI=
|
||||
golang.org/x/exp v0.0.0-20231006140011-7918f672742d/go.mod h1:ldy0pHrwJyGW56pPQzzkH36rKxoZW1tw7ZJpeKx+hdo=
|
||||
golang.org/x/mod v0.35.0 h1:Ww1D637e6Pg+Zb2KrWfHQUnH2dQRLBQyAtpr/haaJeM=
|
||||
golang.org/x/mod v0.35.0/go.mod h1:+GwiRhIInF8wPm+4AoT6L0FA1QWAad3OMdTRx4tFYlU=
|
||||
golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg=
|
||||
golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4=
|
||||
golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0=
|
||||
golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
|
||||
golang.org/x/sys v0.0.0-20210809222454-d867a43fc93e/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
|
||||
golang.org/x/sys v0.43.0 h1:Rlag2XtaFTxp19wS8MXlJwTvoh8ArU6ezoyFsMyCTNI=
|
||||
golang.org/x/sys v0.43.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw=
|
||||
golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo=
|
||||
|
||||
@ -13,13 +13,12 @@ import (
|
||||
"syscall"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/crawlkit/embed"
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/discord"
|
||||
"github.com/openclaw/discrawl/internal/discorddesktop"
|
||||
"github.com/openclaw/discrawl/internal/share"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/openclaw/discrawl/internal/syncer"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/discord"
|
||||
"github.com/steipete/discrawl/internal/discorddesktop"
|
||||
"github.com/steipete/discrawl/internal/embed"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/syncer"
|
||||
)
|
||||
|
||||
type syncSources struct {
|
||||
@ -114,19 +113,9 @@ func (r *runtime) runSync(args []string) error {
|
||||
latestOnly := fs.Bool("latest-only", false, "")
|
||||
guildsFlag := fs.String("guilds", "", "")
|
||||
guildFlag := fs.String("guild", "", "")
|
||||
updateMode := fs.String("update", "", "")
|
||||
noUpdate := fs.Bool("no-update", false, "")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
if *noUpdate && strings.TrimSpace(*updateMode) != "" && !strings.EqualFold(strings.TrimSpace(*updateMode), string(shareUpdateNever)) {
|
||||
return usageErr(errors.New("use either --no-update or --update, not both"))
|
||||
}
|
||||
if strings.TrimSpace(*updateMode) != "" {
|
||||
if _, err := parseShareUpdateMode(*updateMode); err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
}
|
||||
sources, err := parseSyncSources(*source)
|
||||
if err != nil {
|
||||
return usageErr(err)
|
||||
@ -162,7 +151,6 @@ func (r *runtime) runSync(args []string) error {
|
||||
func (r *runtime) runSyncLocked(sources syncSources, opts syncer.SyncOptions) error {
|
||||
var apiStats *syncer.SyncStats
|
||||
if sources.discord {
|
||||
r.setSyncLockPhase("discord sync")
|
||||
shouldClose := r.client == nil
|
||||
if err := r.ensureDiscordServices(); err != nil {
|
||||
return err
|
||||
@ -178,11 +166,9 @@ func (r *runtime) runSyncLocked(sources syncSources, opts syncer.SyncOptions) er
|
||||
}
|
||||
var wiretapStats *discorddesktop.Stats
|
||||
if sources.wiretap {
|
||||
r.setSyncLockPhase("wiretap import")
|
||||
stats, err := discorddesktop.Import(r.ctx, r.store, discorddesktop.Options{
|
||||
Path: r.cfg.Desktop.Path,
|
||||
MaxFileBytes: r.cfg.Desktop.MaxFileBytes,
|
||||
FullCache: r.cfg.Desktop.FullCache,
|
||||
Now: r.now,
|
||||
})
|
||||
if err != nil {
|
||||
@ -264,7 +250,6 @@ func (r *runtime) runWiretap(args []string) error {
|
||||
fs.SetOutput(io.Discard)
|
||||
path := fs.String("path", r.cfg.Desktop.Path, "")
|
||||
maxFileBytes := fs.Int64("max-file-bytes", r.cfg.Desktop.MaxFileBytes, "")
|
||||
fullCache := fs.Bool("full-cache", r.cfg.Desktop.FullCache, "")
|
||||
dryRun := fs.Bool("dry-run", false, "")
|
||||
watchEvery := fs.Duration("watch-every", 0, "")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
@ -280,7 +265,6 @@ func (r *runtime) runWiretap(args []string) error {
|
||||
stats, err := discorddesktop.Import(ctx, r.store, discorddesktop.Options{
|
||||
Path: *path,
|
||||
MaxFileBytes: *maxFileBytes,
|
||||
FullCache: *fullCache,
|
||||
DryRun: *dryRun,
|
||||
Now: r.now,
|
||||
})
|
||||
@ -315,37 +299,16 @@ func (r *runtime) runWiretap(args []string) error {
|
||||
}
|
||||
|
||||
func (r *runtime) runStatus(args []string) error {
|
||||
fs := flag.NewFlagSet("status", flag.ContinueOnError)
|
||||
fs.SetOutput(io.Discard)
|
||||
jsonOut := fs.Bool("json", false, "")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
if len(args) != 0 {
|
||||
return usageErr(errors.New("status takes no arguments"))
|
||||
}
|
||||
if *jsonOut {
|
||||
r.json = true
|
||||
}
|
||||
dbPath, err := config.ExpandPath(r.cfg.DBPath)
|
||||
if err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
status := store.Status{DBPath: dbPath, DefaultGuildID: r.cfg.EffectiveDefaultGuildID()}
|
||||
if r.store != nil {
|
||||
status, err = r.store.Status(r.ctx, dbPath, r.cfg.EffectiveDefaultGuildID())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
if r.json {
|
||||
needsUpdate := false
|
||||
if r.store != nil && r.cfg.ShareEnabled() {
|
||||
if staleAfter, err := time.ParseDuration(r.cfg.Share.StaleAfter); err == nil {
|
||||
needsUpdate = share.NeedsImport(r.ctx, r.store, staleAfter)
|
||||
}
|
||||
}
|
||||
return r.print(controlStatus(r.configPath, r.cfg, status, needsUpdate))
|
||||
status, err := r.store.Status(r.ctx, dbPath, r.cfg.EffectiveDefaultGuildID())
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return r.print(status)
|
||||
}
|
||||
@ -374,7 +337,7 @@ func (r *runtime) runEmbed(args []string) error {
|
||||
providerFactory := r.newEmbed
|
||||
if providerFactory == nil {
|
||||
providerFactory = func(cfg config.EmbeddingsConfig) (embed.Provider, error) {
|
||||
return embed.NewProvider(crawlkitEmbeddingConfig(cfg))
|
||||
return embed.NewProvider(cfg)
|
||||
}
|
||||
}
|
||||
provider, err := providerFactory(r.cfg.Search.Embeddings)
|
||||
@ -406,18 +369,9 @@ func (r *runtime) runEmbed(args []string) error {
|
||||
}
|
||||
|
||||
func (r *runtime) runDoctor(args []string) error {
|
||||
fs := flag.NewFlagSet("doctor", flag.ContinueOnError)
|
||||
fs.SetOutput(io.Discard)
|
||||
jsonOut := fs.Bool("json", false, "")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
if len(args) != 0 {
|
||||
return usageErr(errors.New("doctor takes no arguments"))
|
||||
}
|
||||
if *jsonOut {
|
||||
r.json = true
|
||||
}
|
||||
report := map[string]any{
|
||||
"config_path": r.configPath,
|
||||
}
|
||||
@ -435,7 +389,7 @@ func (r *runtime) runDoctor(args []string) error {
|
||||
report["share_stale_after"] = cfg.Share.StaleAfter
|
||||
}
|
||||
if cfg.Search.Embeddings.Enabled {
|
||||
check := embed.CheckProvider(r.ctx, crawlkitEmbeddingConfig(cfg.Search.Embeddings))
|
||||
check := embed.CheckProvider(r.ctx, cfg.Search.Embeddings)
|
||||
report["embeddings"] = check.Status
|
||||
report["embeddings_provider"] = check.Provider
|
||||
report["embeddings_model"] = check.Model
|
||||
|
||||
@ -7,7 +7,7 @@ import (
|
||||
"io"
|
||||
"strings"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/report"
|
||||
"github.com/steipete/discrawl/internal/report"
|
||||
)
|
||||
|
||||
func (r *runtime) runAnalytics(args []string) error {
|
||||
@ -20,11 +20,11 @@ func (r *runtime) runAnalytics(args []string) error {
|
||||
subArgs := args[1:]
|
||||
switch subcommand {
|
||||
case "quiet":
|
||||
return r.withLocalStoreRead(true, func() error {
|
||||
return r.withLocalStoreDefaultLocked(true, true, func() error {
|
||||
return r.runAnalyticsQuiet(subArgs)
|
||||
})
|
||||
case "trends":
|
||||
return r.withLocalStoreRead(true, func() error {
|
||||
return r.withLocalStoreDefaultLocked(true, true, func() error {
|
||||
return r.runAnalyticsTrends(subArgs)
|
||||
})
|
||||
default:
|
||||
|
||||
@ -10,8 +10,8 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestAnalyticsCommand(t *testing.T) {
|
||||
|
||||
@ -11,12 +11,12 @@ import (
|
||||
"time"
|
||||
|
||||
"github.com/bwmarrin/discordgo"
|
||||
"github.com/openclaw/crawlkit/embed"
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/discord"
|
||||
"github.com/openclaw/discrawl/internal/share"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/openclaw/discrawl/internal/syncer"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/discord"
|
||||
"github.com/steipete/discrawl/internal/embed"
|
||||
"github.com/steipete/discrawl/internal/share"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/syncer"
|
||||
)
|
||||
|
||||
type cliError struct {
|
||||
@ -28,17 +28,10 @@ func (e *cliError) Error() string {
|
||||
return e.err.Error()
|
||||
}
|
||||
|
||||
func (e *cliError) Unwrap() error {
|
||||
return e.err
|
||||
}
|
||||
|
||||
func ExitCode(err error) int {
|
||||
if err == nil {
|
||||
return 0
|
||||
}
|
||||
if errors.Is(err, context.Canceled) {
|
||||
return 1
|
||||
}
|
||||
var codeErr *cliError
|
||||
if errors.As(err, &codeErr) {
|
||||
return codeErr.code
|
||||
@ -47,10 +40,6 @@ func ExitCode(err error) int {
|
||||
}
|
||||
|
||||
func Run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
|
||||
if len(args) == 0 || args[0] == "help" || args[0] == "--help" || args[0] == "-h" {
|
||||
printUsage(stdout)
|
||||
return nil
|
||||
}
|
||||
global := flag.NewFlagSet("discrawl", flag.ContinueOnError)
|
||||
global.SetOutput(io.Discard)
|
||||
configPath := global.String("config", "", "")
|
||||
@ -70,14 +59,10 @@ func Run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
|
||||
return nil
|
||||
}
|
||||
rest := global.Args()
|
||||
if len(rest) == 0 || rest[0] == "help" || rest[0] == "--help" || rest[0] == "-h" {
|
||||
if len(rest) == 0 || rest[0] == "help" {
|
||||
printUsage(stdout)
|
||||
return nil
|
||||
}
|
||||
if rest[0] == "version" {
|
||||
_, _ = io.WriteString(stdout, version+"\n")
|
||||
return nil
|
||||
}
|
||||
level := slog.LevelInfo
|
||||
if *quiet {
|
||||
level = slog.LevelError
|
||||
@ -98,35 +83,23 @@ func Run(ctx context.Context, args []string, stdout, stderr io.Writer) error {
|
||||
}
|
||||
|
||||
type runtime struct {
|
||||
ctx context.Context
|
||||
configPath string
|
||||
cfg config.Config
|
||||
stdout io.Writer
|
||||
stderr io.Writer
|
||||
json bool
|
||||
plain bool
|
||||
logger *slog.Logger
|
||||
store *store.Store
|
||||
client discordClient
|
||||
syncer syncService
|
||||
dbLockHeld bool
|
||||
lockStarted time.Time
|
||||
openStore func(context.Context, string) (*store.Store, error)
|
||||
newDiscord func(config.Config) (discordClient, error)
|
||||
newSyncer func(syncer.Client, *store.Store, *slog.Logger) syncService
|
||||
newEmbed func(config.EmbeddingsConfig) (embed.Provider, error)
|
||||
now func() time.Time
|
||||
}
|
||||
|
||||
func crawlkitEmbeddingConfig(cfg config.EmbeddingsConfig) embed.Config {
|
||||
return embed.Config{
|
||||
Provider: cfg.Provider,
|
||||
Model: cfg.Model,
|
||||
BaseURL: cfg.BaseURL,
|
||||
APIKeyEnv: cfg.APIKeyEnv,
|
||||
RequestTimeout: cfg.RequestTimeout,
|
||||
MaxInputChars: cfg.MaxInputChars,
|
||||
}
|
||||
ctx context.Context
|
||||
configPath string
|
||||
cfg config.Config
|
||||
stdout io.Writer
|
||||
stderr io.Writer
|
||||
json bool
|
||||
plain bool
|
||||
logger *slog.Logger
|
||||
store *store.Store
|
||||
client discordClient
|
||||
syncer syncService
|
||||
dbLockHeld bool
|
||||
openStore func(context.Context, string) (*store.Store, error)
|
||||
newDiscord func(config.Config) (discordClient, error)
|
||||
newSyncer func(syncer.Client, *store.Store, *slog.Logger) syncService
|
||||
newEmbed func(config.EmbeddingsConfig) (embed.Provider, error)
|
||||
now func() time.Time
|
||||
}
|
||||
|
||||
type discordClient interface {
|
||||
@ -148,59 +121,43 @@ type attachmentTextConfigurer interface {
|
||||
|
||||
func (r *runtime) dispatch(rest []string) error {
|
||||
switch rest[0] {
|
||||
case "metadata":
|
||||
return r.runMetadata(rest[1:])
|
||||
case "init":
|
||||
return r.runInit(rest[1:])
|
||||
case "sync":
|
||||
updateMode, err := syncShareUpdateMode(rest[1:])
|
||||
if err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
return r.withLocalStoreUpdateLocked(updateMode, true, func() error { return r.runSync(rest[1:]) })
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runSync(rest[1:]) })
|
||||
case "tail":
|
||||
return r.withServicesLocked(true, func() error { return r.runTail(rest[1:]) })
|
||||
case "wiretap":
|
||||
return r.withLocalStoreLocked(false, func() error { return r.runWiretap(rest[1:]) })
|
||||
case "tap", "cache-import":
|
||||
return r.withLocalStoreLocked(false, func() error { return r.runWiretap(rest[1:]) })
|
||||
case "search":
|
||||
autoShareUpdate := !hasBoolFlag(rest[1:], "--dm")
|
||||
return r.withLocalStoreRead(autoShareUpdate, func() error { return r.runSearch(rest[1:]) })
|
||||
case "tui":
|
||||
if hasHelpArg(rest[1:]) {
|
||||
return r.runTUI(rest[1:])
|
||||
}
|
||||
return r.withLocalStoreReadOnly(func() error { return r.runTUI(rest[1:]) })
|
||||
return r.withLocalStoreDefaultLocked(autoShareUpdate, autoShareUpdate, func() error { return r.runSearch(rest[1:]) })
|
||||
case "messages":
|
||||
if hasBoolFlag(rest[1:], "--sync") && !hasBoolFlag(rest[1:], "--dm") {
|
||||
return r.withServicesAutoLocked(true, true, true, func() error { return r.runMessages(rest[1:]) })
|
||||
}
|
||||
autoShareUpdate := !hasBoolFlag(rest[1:], "--dm")
|
||||
return r.withLocalStoreRead(autoShareUpdate, func() error { return r.runMessages(rest[1:]) })
|
||||
return r.withLocalStoreDefaultLocked(autoShareUpdate, autoShareUpdate, func() error { return r.runMessages(rest[1:]) })
|
||||
case "digest":
|
||||
return r.withLocalStoreRead(true, func() error { return r.runDigest(rest[1:]) })
|
||||
return r.withLocalStoreDefaultLocked(true, true, func() error { return r.runDigest(rest[1:]) })
|
||||
case "analytics":
|
||||
return r.runAnalytics(rest[1:])
|
||||
case "dms":
|
||||
return r.withLocalStoreRead(false, func() error { return r.runDirectMessages(rest[1:]) })
|
||||
return r.withLocalStoreDefault(false, func() error { return r.runDirectMessages(rest[1:]) })
|
||||
case "mentions":
|
||||
return r.withLocalStoreRead(true, func() error { return r.runMentions(rest[1:]) })
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runMentions(rest[1:]) })
|
||||
case "embed":
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runEmbed(rest[1:]) })
|
||||
case "sql":
|
||||
if boolFlagEnabled(rest[1:], "--unsafe") {
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runSQL(rest[1:]) })
|
||||
}
|
||||
return r.withLocalStoreRead(true, func() error { return r.runSQL(rest[1:]) })
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runSQL(rest[1:]) })
|
||||
case "members":
|
||||
return r.withLocalStoreRead(true, func() error { return r.runMembers(rest[1:]) })
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runMembers(rest[1:]) })
|
||||
case "channels":
|
||||
return r.withLocalStoreRead(true, func() error { return r.runChannels(rest[1:]) })
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runChannels(rest[1:]) })
|
||||
case "status":
|
||||
return r.withLocalStoreReadOnly(func() error { return r.runStatus(rest[1:]) })
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runStatus(rest[1:]) })
|
||||
case "report":
|
||||
return r.withLocalStoreRead(true, func() error { return r.runReport(rest[1:]) })
|
||||
return r.withLocalStoreLocked(true, func() error { return r.runReport(rest[1:]) })
|
||||
case "publish":
|
||||
return r.withServicesAutoLocked(false, false, true, func() error { return r.runPublish(rest[1:]) })
|
||||
case "subscribe":
|
||||
@ -223,41 +180,14 @@ func (r *runtime) withServicesLocked(withDiscord bool, fn func() error) error {
|
||||
}
|
||||
|
||||
func (r *runtime) withLocalStoreLocked(autoShareUpdate bool, fn func() error) error {
|
||||
return r.withLocalStoreUpdateLocked(boolShareUpdateMode(autoShareUpdate), true, fn)
|
||||
return r.withLocalStoreDefaultLocked(autoShareUpdate, true, fn)
|
||||
}
|
||||
|
||||
func (r *runtime) withLocalStoreRead(autoShareUpdate bool, fn func() error) error {
|
||||
return r.withLocalStoreReadUpdate(boolShareUpdateMode(autoShareUpdate), fn)
|
||||
func (r *runtime) withLocalStoreDefault(autoShareUpdate bool, fn func() error) error {
|
||||
return r.withLocalStoreDefaultLocked(autoShareUpdate, false, fn)
|
||||
}
|
||||
|
||||
func (r *runtime) withLocalStoreReadUpdate(updateMode shareUpdateMode, fn func() error) error {
|
||||
cfg, err := config.Load(r.configPath)
|
||||
if err != nil {
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
return configErr(err)
|
||||
}
|
||||
cfg = config.Default()
|
||||
if err := cfg.Normalize(); err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
}
|
||||
if err := config.EnsureRuntimeDirs(cfg); err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
dbPath, err := config.ExpandPath(cfg.DBPath)
|
||||
if err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
r.cfg = cfg
|
||||
if r.shouldAutoUpdateShare(updateMode) {
|
||||
if err := r.autoUpdateShareIfLockAvailable(dbPath, updateMode); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return r.openLocalStoreReadOnly(dbPath, fn)
|
||||
}
|
||||
|
||||
func (r *runtime) withLocalStoreUpdateLocked(updateMode shareUpdateMode, lockDB bool, fn func() error) error {
|
||||
func (r *runtime) withLocalStoreDefaultLocked(autoShareUpdate, lockDB bool, fn func() error) error {
|
||||
cfg, err := config.Load(r.configPath)
|
||||
if err != nil {
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
@ -278,45 +208,13 @@ func (r *runtime) withLocalStoreUpdateLocked(updateMode shareUpdateMode, lockDB
|
||||
r.cfg = cfg
|
||||
if lockDB {
|
||||
return r.withSyncLock(func() error {
|
||||
return r.openLocalStore(dbPath, updateMode, fn)
|
||||
return r.openLocalStore(dbPath, autoShareUpdate, fn)
|
||||
})
|
||||
}
|
||||
return r.openLocalStore(dbPath, updateMode, fn)
|
||||
return r.openLocalStore(dbPath, autoShareUpdate, fn)
|
||||
}
|
||||
|
||||
func (r *runtime) shouldAutoUpdateShare(mode shareUpdateMode) bool {
|
||||
return os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" &&
|
||||
r.cfg.ShareEnabled() &&
|
||||
(mode == shareUpdateForce || mode == shareUpdateAuto || (mode == shareUpdateConfigured && r.cfg.Share.AutoUpdate))
|
||||
}
|
||||
|
||||
func (r *runtime) autoUpdateShareIfLockAvailable(dbPath string, updateMode shareUpdateMode) error {
|
||||
locked, err := r.tryWithSyncLock(func() error {
|
||||
storeFactory := r.openStore
|
||||
if storeFactory == nil {
|
||||
storeFactory = store.Open
|
||||
}
|
||||
var openErr error
|
||||
r.store, openErr = storeFactory(r.ctx, dbPath)
|
||||
if openErr != nil {
|
||||
return dbErr(openErr)
|
||||
}
|
||||
defer func() {
|
||||
_ = r.store.Close()
|
||||
r.store = nil
|
||||
}()
|
||||
return r.autoUpdateShare(updateMode)
|
||||
})
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if !locked {
|
||||
r.logger.Info("share update skipped; sync lock is held")
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *runtime) openLocalStore(dbPath string, updateMode shareUpdateMode, fn func() error) error {
|
||||
func (r *runtime) openLocalStore(dbPath string, autoShareUpdate bool, fn func() error) error {
|
||||
storeFactory := r.openStore
|
||||
if storeFactory == nil {
|
||||
storeFactory = store.Open
|
||||
@ -327,96 +225,19 @@ func (r *runtime) openLocalStore(dbPath string, updateMode shareUpdateMode, fn f
|
||||
return dbErr(err)
|
||||
}
|
||||
defer func() { _ = r.store.Close() }()
|
||||
if updateMode != shareUpdateNever && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
|
||||
if err := r.autoUpdateShare(updateMode); err != nil {
|
||||
if autoShareUpdate && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
|
||||
if err := r.autoUpdateShare(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
return fn()
|
||||
}
|
||||
|
||||
func (r *runtime) withLocalStoreReadOnly(fn func() error) error {
|
||||
cfg, err := config.Load(r.configPath)
|
||||
if err != nil {
|
||||
if !errors.Is(err, os.ErrNotExist) {
|
||||
return configErr(err)
|
||||
}
|
||||
cfg = config.Default()
|
||||
if err := cfg.Normalize(); err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
}
|
||||
dbPath, err := config.ExpandPath(cfg.DBPath)
|
||||
if err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
r.cfg = cfg
|
||||
var openErr error
|
||||
r.store, openErr = store.OpenReadOnly(r.ctx, dbPath)
|
||||
if openErr != nil {
|
||||
if errors.Is(openErr, os.ErrNotExist) {
|
||||
r.store = nil
|
||||
return fn()
|
||||
}
|
||||
return dbErr(openErr)
|
||||
}
|
||||
defer func() { _ = r.store.Close() }()
|
||||
return fn()
|
||||
}
|
||||
|
||||
func (r *runtime) openLocalStoreReadOnly(dbPath string, fn func() error) error {
|
||||
var openErr error
|
||||
r.store, openErr = store.OpenReadOnly(r.ctx, dbPath)
|
||||
if openErr != nil {
|
||||
if errors.Is(openErr, os.ErrNotExist) {
|
||||
storeFactory := r.openStore
|
||||
if storeFactory == nil {
|
||||
storeFactory = store.Open
|
||||
}
|
||||
r.store, openErr = storeFactory(r.ctx, dbPath)
|
||||
if openErr == nil {
|
||||
defer func() { _ = r.store.Close() }()
|
||||
return fn()
|
||||
}
|
||||
}
|
||||
if errors.Is(openErr, store.ErrSchemaVersionMismatch) {
|
||||
if err := r.withSyncLock(func() error {
|
||||
storeFactory := r.openStore
|
||||
if storeFactory == nil {
|
||||
storeFactory = store.Open
|
||||
}
|
||||
var migrateErr error
|
||||
r.store, migrateErr = storeFactory(r.ctx, dbPath)
|
||||
if migrateErr != nil {
|
||||
return dbErr(migrateErr)
|
||||
}
|
||||
closeErr := r.store.Close()
|
||||
r.store = nil
|
||||
return closeErr
|
||||
}); err != nil {
|
||||
return err
|
||||
}
|
||||
r.store, openErr = store.OpenReadOnly(r.ctx, dbPath)
|
||||
if openErr == nil {
|
||||
defer func() { _ = r.store.Close() }()
|
||||
return fn()
|
||||
}
|
||||
}
|
||||
return dbErr(openErr)
|
||||
}
|
||||
defer func() { _ = r.store.Close() }()
|
||||
return fn()
|
||||
}
|
||||
|
||||
func (r *runtime) withServicesAuto(withDiscord, autoShareUpdate bool, fn func() error) error {
|
||||
return r.withServicesAutoLocked(withDiscord, autoShareUpdate, false, fn)
|
||||
}
|
||||
|
||||
func (r *runtime) withServicesAutoLocked(withDiscord, autoShareUpdate, lockDB bool, fn func() error) error {
|
||||
return r.withServicesUpdateLocked(withDiscord, boolShareUpdateMode(autoShareUpdate), lockDB, fn)
|
||||
}
|
||||
|
||||
func (r *runtime) withServicesUpdateLocked(withDiscord bool, updateMode shareUpdateMode, lockDB bool, fn func() error) error {
|
||||
cfg, err := config.Load(r.configPath)
|
||||
if err != nil {
|
||||
return configErr(err)
|
||||
@ -431,13 +252,13 @@ func (r *runtime) withServicesUpdateLocked(withDiscord bool, updateMode shareUpd
|
||||
r.cfg = cfg
|
||||
if lockDB {
|
||||
return r.withSyncLock(func() error {
|
||||
return r.openServices(dbPath, withDiscord, updateMode, fn)
|
||||
return r.openServices(dbPath, withDiscord, autoShareUpdate, fn)
|
||||
})
|
||||
}
|
||||
return r.openServices(dbPath, withDiscord, updateMode, fn)
|
||||
return r.openServices(dbPath, withDiscord, autoShareUpdate, fn)
|
||||
}
|
||||
|
||||
func (r *runtime) openServices(dbPath string, withDiscord bool, updateMode shareUpdateMode, fn func() error) error {
|
||||
func (r *runtime) openServices(dbPath string, withDiscord, autoShareUpdate bool, fn func() error) error {
|
||||
storeFactory := r.openStore
|
||||
if storeFactory == nil {
|
||||
storeFactory = store.Open
|
||||
@ -448,8 +269,8 @@ func (r *runtime) openServices(dbPath string, withDiscord bool, updateMode share
|
||||
return dbErr(err)
|
||||
}
|
||||
defer func() { _ = r.store.Close() }()
|
||||
if updateMode != shareUpdateNever && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
|
||||
if err := r.autoUpdateShare(updateMode); err != nil {
|
||||
if autoShareUpdate && os.Getenv("DISCRAWL_NO_AUTO_UPDATE") != "1" {
|
||||
if err := r.autoUpdateShare(); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
@ -493,27 +314,24 @@ func (r *runtime) ensureDiscordServices() error {
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *runtime) autoUpdateShare(mode shareUpdateMode) error {
|
||||
if !r.cfg.ShareEnabled() || (mode == shareUpdateConfigured && !r.cfg.Share.AutoUpdate) {
|
||||
func (r *runtime) autoUpdateShare() error {
|
||||
if !r.cfg.ShareEnabled() || !r.cfg.Share.AutoUpdate {
|
||||
return nil
|
||||
}
|
||||
staleAfter, err := time.ParseDuration(r.cfg.Share.StaleAfter)
|
||||
if err != nil {
|
||||
return configErr(fmt.Errorf("invalid share.stale_after: %w", err))
|
||||
}
|
||||
if mode != shareUpdateForce && !share.NeedsImport(r.ctx, r.store, staleAfter) {
|
||||
if !share.NeedsImport(r.ctx, r.store, staleAfter) {
|
||||
return nil
|
||||
}
|
||||
opts, err := r.shareOptions()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
r.setSyncLockPhase("share pull")
|
||||
r.logger.Info("share update pulling", "repo_path", opts.RepoPath, "remote", opts.Remote)
|
||||
if err := share.Pull(r.ctx, opts); err != nil {
|
||||
return err
|
||||
}
|
||||
r.setSyncLockPhase("share import")
|
||||
_, _, err = share.ImportIfChanged(r.ctx, r.store, opts)
|
||||
if errors.Is(err, share.ErrNoManifest) {
|
||||
return nil
|
||||
@ -530,6 +348,5 @@ func (r *runtime) shareOptions() (share.Options, error) {
|
||||
RepoPath: repoPath,
|
||||
Remote: r.cfg.Share.Remote,
|
||||
Branch: r.cfg.Share.Branch,
|
||||
Progress: r.shareProgress,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@ -4,8 +4,6 @@ import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"errors"
|
||||
"io"
|
||||
"log/slog"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
@ -20,13 +18,11 @@ import (
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
discordclient "github.com/openclaw/discrawl/internal/discord"
|
||||
"github.com/openclaw/discrawl/internal/discorddesktop"
|
||||
"github.com/openclaw/discrawl/internal/report"
|
||||
"github.com/openclaw/discrawl/internal/share"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/openclaw/discrawl/internal/syncer"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
discordclient "github.com/steipete/discrawl/internal/discord"
|
||||
"github.com/steipete/discrawl/internal/share"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/syncer"
|
||||
)
|
||||
|
||||
func TestHelpAndVersion(t *testing.T) {
|
||||
@ -38,196 +34,10 @@ func TestHelpAndVersion(t *testing.T) {
|
||||
|
||||
out.Reset()
|
||||
require.NoError(t, Run(context.Background(), []string{"--version"}, &out, &bytes.Buffer{}))
|
||||
require.Contains(t, out.String(), "0.7.0")
|
||||
require.Contains(t, out.String(), "0.6.3")
|
||||
|
||||
err := Run(context.Background(), []string{"bogus"}, &out, &bytes.Buffer{})
|
||||
require.Equal(t, 2, ExitCode(err))
|
||||
require.Equal(t, 1, ExitCode(context.Canceled))
|
||||
require.Equal(t, 7, ExitCode(&cliError{code: 7, err: errors.New("custom")}))
|
||||
}
|
||||
|
||||
func TestCommandValidationEdges(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cfgPath := filepath.Join(dir, "config.toml")
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = dbPath
|
||||
cfg.Discord.TokenSource = "none"
|
||||
require.NoError(t, config.Write(cfgPath, cfg))
|
||||
s, err := store.Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, s.Close())
|
||||
|
||||
cases := [][]string{
|
||||
{"--config", cfgPath, "--bogus"},
|
||||
{"--config", cfgPath, "search"},
|
||||
{"--config", cfgPath, "search", "--mode", "bogus", "term"},
|
||||
{"--config", cfgPath, "messages"},
|
||||
{"--config", cfgPath, "messages", "--hours", "-1", "--channel", "general"},
|
||||
{"--config", cfgPath, "messages", "--hours", "1", "--days", "1", "--channel", "general"},
|
||||
{"--config", cfgPath, "messages", "--all", "--last", "1", "--channel", "general"},
|
||||
{"--config", cfgPath, "messages", "--dm", "--sync", "--channel", "alice"},
|
||||
{"--config", cfgPath, "dms", "--hours", "-1"},
|
||||
{"--config", cfgPath, "dms", "--limit", "1", "--last", "1", "--with", "alice"},
|
||||
{"--config", cfgPath, "mentions"},
|
||||
{"--config", cfgPath, "mentions", "--days", "-1", "--target", "u1"},
|
||||
{"--config", cfgPath, "mentions", "--type", "channel", "--target", "u1"},
|
||||
{"--config", cfgPath, "digest", "--since", "-1d"},
|
||||
{"--config", cfgPath, "analytics", "wat"},
|
||||
{"--config", cfgPath, "analytics", "quiet", "extra"},
|
||||
{"--config", cfgPath, "analytics", "trends", "--weeks", "-1"},
|
||||
{"--config", cfgPath, "channels"},
|
||||
{"--config", cfgPath, "channels", "wat"},
|
||||
{"--config", cfgPath, "channels", "show"},
|
||||
{"--config", cfgPath, "status", "extra"},
|
||||
{"--config", cfgPath, "report", "extra"},
|
||||
{"--config", cfgPath, "wiretap", "extra"},
|
||||
{"--config", cfgPath, "wiretap", "--max-file-bytes", "0"},
|
||||
{"--config", cfgPath, "sync", "--source", "bogus"},
|
||||
{"--config", cfgPath, "sync", "--since", "not-time"},
|
||||
{"--config", cfgPath, "sync", "--no-update", "--update", "force"},
|
||||
{"--config", cfgPath, "publish", "--remote", ""},
|
||||
{"--config", cfgPath, "subscribe"},
|
||||
{"--config", cfgPath, "update", "extra"},
|
||||
{"--config", cfgPath, "sql", "--confirm", "select 1"},
|
||||
{"--config", cfgPath, "sql", "--unsafe", "select 1"},
|
||||
{"--config", cfgPath, "members"},
|
||||
{"--config", cfgPath, "members", "wat"},
|
||||
}
|
||||
for _, args := range cases {
|
||||
var stdout, stderr bytes.Buffer
|
||||
err := Run(ctx, args, &stdout, &stderr)
|
||||
require.Error(t, err, args)
|
||||
}
|
||||
}
|
||||
|
||||
func TestOutputBranches(t *testing.T) {
|
||||
now := time.Date(2026, 5, 8, 12, 0, 0, 0, time.UTC)
|
||||
values := []any{
|
||||
syncRunStats{
|
||||
Source: "both",
|
||||
Discord: &syncer.SyncStats{Guilds: 1, Channels: 2, Threads: 3, Members: 4, Messages: 5},
|
||||
Wiretap: &discorddesktop.Stats{
|
||||
Path: "/tmp/discord",
|
||||
FilesVisited: 1,
|
||||
FilesScanned: 2,
|
||||
FilesSkipped: 3,
|
||||
FilesUnchanged: 4,
|
||||
CacheFilesFastSkipped: 5,
|
||||
JSONObjects: 6,
|
||||
Guilds: 7,
|
||||
Channels: 8,
|
||||
Messages: 9,
|
||||
DMMessages: 10,
|
||||
DMChannels: 11,
|
||||
GuildMessages: 12,
|
||||
SkippedMessages: 13,
|
||||
SkippedChannels: 14,
|
||||
Checkpoints: 15,
|
||||
FullCache: true,
|
||||
DryRun: true,
|
||||
},
|
||||
},
|
||||
syncer.SyncStats{Guilds: 1, Channels: 2, Threads: 3, Members: 4, Messages: 5},
|
||||
discorddesktop.Stats{Path: "/tmp/discord", FilesVisited: 1, FullCache: true, DryRun: true},
|
||||
store.EmbeddingDrainStats{
|
||||
Processed: 3,
|
||||
Succeeded: 2,
|
||||
Failed: 1,
|
||||
Requeued: 4,
|
||||
RateLimited: true,
|
||||
RemainingBacklog: 5,
|
||||
Provider: "openai",
|
||||
Model: "model",
|
||||
InputVersion: "v1",
|
||||
},
|
||||
[]store.DirectMessageConversationRow{{
|
||||
ChannelID: "c1",
|
||||
Name: "Alice",
|
||||
MessageCount: 2,
|
||||
AuthorCount: 1,
|
||||
FirstMessageAt: now.Add(-time.Hour),
|
||||
LastMessageAt: now,
|
||||
}},
|
||||
store.MemberProfile{
|
||||
Member: store.MemberRow{
|
||||
GuildID: "g1",
|
||||
UserID: "u1",
|
||||
Username: "peter",
|
||||
DisplayName: "Peter",
|
||||
JoinedAt: now,
|
||||
XHandle: "steipete",
|
||||
GitHubLogin: "steipete",
|
||||
Website: "https://steipete.me",
|
||||
Pronouns: "he/him",
|
||||
Location: "Vienna",
|
||||
Bio: "Maintainer",
|
||||
URLs: []string{"https://example.com"},
|
||||
},
|
||||
MessageCount: 1,
|
||||
FirstMessageAt: now.Add(-time.Hour),
|
||||
LastMessageAt: now,
|
||||
RecentMessages: []store.MessageRow{{ChannelName: "general", CreatedAt: now, Content: "hello"}},
|
||||
},
|
||||
report.Digest{
|
||||
Since: now.Add(-24 * time.Hour),
|
||||
Until: now,
|
||||
WindowLabel: "1d",
|
||||
Channels: []report.ChannelDigest{{
|
||||
ChannelID: "c1",
|
||||
ChannelName: "general",
|
||||
Kind: "text",
|
||||
GuildID: "g1",
|
||||
Messages: 3,
|
||||
Replies: 1,
|
||||
ActiveAuthors: 2,
|
||||
TopPosters: []report.RankedCount{{Name: "Peter", Count: 2}},
|
||||
TopMentions: []report.RankedCount{{Count: 1}},
|
||||
}},
|
||||
Totals: report.DigestTotals{Messages: 3, Replies: 1, Channels: 1, ActiveAuthors: 2},
|
||||
},
|
||||
report.Quiet{
|
||||
Since: now.Add(-24 * time.Hour),
|
||||
Until: now,
|
||||
Channels: []report.QuietChannel{{
|
||||
ChannelID: "c1",
|
||||
ChannelName: "general",
|
||||
Kind: "text",
|
||||
LastMessage: "",
|
||||
DaysSilent: -1,
|
||||
}},
|
||||
Totals: report.QuietTotals{Channels: 1},
|
||||
},
|
||||
report.Trends{
|
||||
Since: now.AddDate(0, 0, -14),
|
||||
Until: now,
|
||||
Weeks: 2,
|
||||
Rows: []report.TrendsRow{{
|
||||
ChannelID: "c1",
|
||||
ChannelName: "general",
|
||||
Kind: "text",
|
||||
GuildID: "g1",
|
||||
Weekly: []report.WeeklyCount{
|
||||
{WeekStart: now.AddDate(0, 0, -14), Messages: 1},
|
||||
{WeekStart: now.AddDate(0, 0, -7), Messages: 2},
|
||||
},
|
||||
}},
|
||||
},
|
||||
map[string]any{"b": 2, "a": 1},
|
||||
}
|
||||
for _, value := range values {
|
||||
var out bytes.Buffer
|
||||
require.NoError(t, printHuman(&out, value))
|
||||
require.NotEmpty(t, out.String())
|
||||
}
|
||||
|
||||
var plain bytes.Buffer
|
||||
require.NoError(t, printPlain(&plain, report.Quiet{Channels: []report.QuietChannel{{ChannelID: "c1", ChannelName: "general", Kind: "text", GuildID: "g1", LastMessage: "now", DaysSilent: 0}}}))
|
||||
require.NoError(t, printPlain(&plain, report.Trends{Rows: []report.TrendsRow{{GuildID: "g1", ChannelID: "c1", ChannelName: "general", Kind: "text", Weekly: []report.WeeklyCount{{WeekStart: now, Messages: 2}}}}}))
|
||||
require.Error(t, printPlain(io.Discard, struct{}{}))
|
||||
require.Error(t, printHuman(io.Discard, struct{}{}))
|
||||
require.Equal(t, "this is a profile field with a very l...", trimForTable("this is a profile field with a very long text value"))
|
||||
}
|
||||
|
||||
func TestStatusSearchSQLAndListings(t *testing.T) {
|
||||
@ -266,21 +76,6 @@ func TestStatusSearchSQLAndListings(t *testing.T) {
|
||||
NormalizedContent: "panic locked database",
|
||||
RawJSON: `{}`,
|
||||
}))
|
||||
require.NoError(t, s.UpsertGuild(ctx, store.GuildRecord{ID: "g2", Name: "Other Guild", RawJSON: `{}`}))
|
||||
require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "c2", GuildID: "g2", Kind: "text", Name: "random", RawJSON: `{}`}))
|
||||
require.NoError(t, s.UpsertMessage(ctx, store.MessageRecord{
|
||||
ID: "m-other",
|
||||
GuildID: "g2",
|
||||
ChannelID: "c2",
|
||||
ChannelName: "random",
|
||||
AuthorID: "u2",
|
||||
AuthorName: "Outside",
|
||||
MessageType: 0,
|
||||
CreatedAt: time.Now().UTC().Add(-time.Hour).Format(time.RFC3339Nano),
|
||||
Content: "outside default guild",
|
||||
NormalizedContent: "outside default guild",
|
||||
RawJSON: `{}`,
|
||||
}))
|
||||
require.NoError(t, s.UpsertMessage(ctx, store.MessageRecord{
|
||||
ID: "m2",
|
||||
GuildID: "g1",
|
||||
@ -325,7 +120,6 @@ func TestStatusSearchSQLAndListings(t *testing.T) {
|
||||
tests := [][]string{
|
||||
{"--config", cfgPath, "status"},
|
||||
{"--config", cfgPath, "search", "panic"},
|
||||
{"--config", cfgPath, "search", "panic", "--limit", "1"},
|
||||
{"--config", cfgPath, "search", "stack"},
|
||||
{"--config", cfgPath, "search", "--include-empty", "Peter"},
|
||||
{"--config", cfgPath, "messages", "--channel", "general", "--days", "7", "--all"},
|
||||
@ -343,100 +137,6 @@ func TestStatusSearchSQLAndListings(t *testing.T) {
|
||||
require.NoError(t, Run(ctx, args, &out, &bytes.Buffer{}))
|
||||
require.NotEmpty(t, out.String())
|
||||
}
|
||||
|
||||
for _, args := range [][]string{
|
||||
{"--config", cfgPath, "metadata", "--json"},
|
||||
{"--config", cfgPath, "status", "--json"},
|
||||
} {
|
||||
var out bytes.Buffer
|
||||
require.NoError(t, Run(ctx, args, &out, &bytes.Buffer{}))
|
||||
var payload map[string]any
|
||||
require.NoError(t, json.Unmarshal(out.Bytes(), &payload))
|
||||
require.NotEmpty(t, payload)
|
||||
}
|
||||
|
||||
before, err := os.ReadFile(dbPath)
|
||||
require.NoError(t, err)
|
||||
var out bytes.Buffer
|
||||
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "--json", "tui", "--limit", "5"}, &out, &bytes.Buffer{}))
|
||||
var rows []map[string]any
|
||||
require.NoError(t, json.Unmarshal(out.Bytes(), &rows))
|
||||
require.NotEmpty(t, rows)
|
||||
require.Equal(t, "panic locked database", rows[0]["title"])
|
||||
require.Equal(t, "discord", rows[0]["source"])
|
||||
require.Equal(t, "message", rows[0]["kind"])
|
||||
require.Equal(t, "Guild", rows[0]["scope"])
|
||||
require.Equal(t, "general", rows[0]["container"])
|
||||
require.Equal(t, "https://discord.com/channels/g1/c1/m1", rows[0]["url"])
|
||||
after, err := os.ReadFile(dbPath)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, before, after, "tui --json should not mutate the database")
|
||||
}
|
||||
|
||||
func TestTUIHelpReturnsUsage(t *testing.T) {
|
||||
var stdout bytes.Buffer
|
||||
var stderr bytes.Buffer
|
||||
|
||||
require.NoError(t, Run(context.Background(), []string{"tui", "--help"}, &stdout, &stderr))
|
||||
require.Contains(t, stdout.String(), "Usage of tui:")
|
||||
require.Contains(t, stdout.String(), "-limit")
|
||||
require.Contains(t, stdout.String(), "right-click")
|
||||
require.Contains(t, stdout.String(), "# jump")
|
||||
require.Empty(t, stderr.String())
|
||||
}
|
||||
|
||||
func TestControlStatusIncludesShareAndFileSizes(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
dbPath := filepath.Join(dir, "discrawl.db")
|
||||
require.NoError(t, os.WriteFile(dbPath, []byte("db"), 0o600))
|
||||
require.NoError(t, os.WriteFile(dbPath+"-wal", []byte("wal"), 0o600))
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = dbPath
|
||||
cfg.Share.Remote = "https://github.com/openclaw/discrawl-share.git"
|
||||
cfg.Share.RepoPath = filepath.Join(dir, "share")
|
||||
status := store.Status{
|
||||
DBPath: dbPath,
|
||||
MessageCount: 5,
|
||||
ChannelCount: 2,
|
||||
}
|
||||
|
||||
out := controlStatus(filepath.Join(dir, "config.toml"), cfg, status, true)
|
||||
require.Equal(t, int64(2), out.DatabaseBytes)
|
||||
require.Equal(t, int64(3), out.WALBytes)
|
||||
require.Zero(t, fileSize(filepath.Join(dir, "missing.db")))
|
||||
require.NotNil(t, out.Share)
|
||||
require.True(t, out.Share.Enabled)
|
||||
require.True(t, out.Share.NeedsUpdate)
|
||||
require.Contains(t, out.Summary, "5 messages")
|
||||
}
|
||||
|
||||
func TestFormattingAndTUISourceBranches(t *testing.T) {
|
||||
require.Equal(t, "-", formatDaysSilent(-1))
|
||||
require.Equal(t, "4", formatDaysSilent(4))
|
||||
require.Equal(t, "0", formatWindowDuration(0))
|
||||
require.Equal(t, "2d", formatWindowDuration(48*time.Hour))
|
||||
require.Equal(t, "3h", formatWindowDuration(3*time.Hour))
|
||||
require.Equal(t, "1h30m0s", formatWindowDuration(90*time.Minute))
|
||||
require.Equal(t, 6*time.Hour, mustDuration("bogus"))
|
||||
require.Equal(t, 15*time.Minute, mustDuration("15m"))
|
||||
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = "/tmp/discrawl.db"
|
||||
r := &runtime{cfg: cfg}
|
||||
require.Equal(t, "local", r.archiveSourceKind())
|
||||
require.Equal(t, cfg.DBPath, r.archiveSourceLocation())
|
||||
guilds, err := r.resolveTUIGuilds(false, "", "")
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, guilds)
|
||||
|
||||
r.cfg.DefaultGuildID = "guild-one"
|
||||
guilds, err = r.resolveTUIGuilds(false, "", "")
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, []string{"guild-one"}, guilds)
|
||||
|
||||
r.cfg.Share.Remote = "https://github.com/openclaw/discrawl-share.git"
|
||||
require.Equal(t, "remote", r.archiveSourceKind())
|
||||
require.Equal(t, r.cfg.Share.Remote, r.archiveSourceLocation())
|
||||
}
|
||||
|
||||
func TestWiretapImportsDesktopDirectMessages(t *testing.T) {
|
||||
@ -483,53 +183,6 @@ func TestWiretapImportsDesktopDirectMessages(t *testing.T) {
|
||||
require.Contains(t, out.String(), "secret DM launch plan")
|
||||
}
|
||||
|
||||
func TestDiscordTUIRowsIncludePaneMetadata(t *testing.T) {
|
||||
rows := discordTUIRows([]store.MessageRow{{
|
||||
MessageID: "m1",
|
||||
GuildID: "@me",
|
||||
GuildName: "Discord Direct Messages",
|
||||
ChannelID: "c1",
|
||||
ChannelName: "Vincent K",
|
||||
AuthorID: "u1",
|
||||
AuthorName: "Peter",
|
||||
Content: "hello from desktop",
|
||||
DisplayContent: "hello from Vincent",
|
||||
CreatedAt: time.Date(2026, 5, 2, 12, 0, 0, 0, time.UTC),
|
||||
ReplyToMessage: "m0",
|
||||
HasAttachments: true,
|
||||
AttachmentNames: "trace.txt",
|
||||
AttachmentText: "stack trace line one",
|
||||
Pinned: true,
|
||||
}})
|
||||
require.Len(t, rows, 1)
|
||||
require.Equal(t, "hello from Vincent", rows[0].Title)
|
||||
require.Contains(t, rows[0].Detail, "hello from Vincent")
|
||||
require.Contains(t, rows[0].Detail, "Attachments")
|
||||
require.Contains(t, rows[0].Detail, "stack trace line one")
|
||||
require.Equal(t, "hello from Vincent", rows[0].Text)
|
||||
require.Equal(t, "Direct messages", rows[0].Scope)
|
||||
require.Equal(t, "Vincent K", rows[0].Container)
|
||||
require.Contains(t, rows[0].Tags, "dm")
|
||||
require.Equal(t, "true", rows[0].Fields["attachments"])
|
||||
require.Equal(t, "trace.txt", rows[0].Fields["attachment_names"])
|
||||
require.Equal(t, "true", rows[0].Fields["pinned"])
|
||||
require.Equal(t, "m0", rows[0].Fields["reply_to"])
|
||||
require.Equal(t, "@me", rows[0].Fields["guild_id"])
|
||||
|
||||
rows = discordTUIRows([]store.MessageRow{{
|
||||
MessageID: "m2",
|
||||
GuildID: "g1",
|
||||
ChannelID: "c2",
|
||||
AuthorID: "439223656200273932",
|
||||
Content: "desktop-only author",
|
||||
CreatedAt: time.Date(2026, 5, 2, 12, 0, 0, 0, time.UTC),
|
||||
Source: "discord_desktop",
|
||||
}})
|
||||
require.Equal(t, "user:439223...3932", rows[0].Author)
|
||||
require.Equal(t, "DM c2", discordContainerLabel(store.MessageRow{GuildID: "@me", ChannelID: "c2"}))
|
||||
require.Contains(t, rows[0].Tags, "discord_desktop")
|
||||
}
|
||||
|
||||
func TestParseMessageWindow(t *testing.T) {
|
||||
rt := &runtime{now: func() time.Time {
|
||||
return time.Date(2026, 4, 24, 12, 0, 0, 0, time.UTC)
|
||||
@ -947,7 +600,7 @@ func TestShareUpdateImportsNewRemoteSnapshot(t *testing.T) {
|
||||
require.Contains(t, out.String(), "newer git snapshot arrived")
|
||||
}
|
||||
|
||||
func TestSyncSkipsGitShareByDefaultAndCanImportBeforeLiveDiscord(t *testing.T) {
|
||||
func TestSyncImportsGitShareBeforeLiveDiscord(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
remoteRepo := filepath.Join(dir, "remote.git")
|
||||
@ -968,8 +621,6 @@ func TestSyncSkipsGitShareByDefaultAndCanImportBeforeLiveDiscord(t *testing.T) {
|
||||
cfg.Share.RepoPath = filepath.Join(dir, "reader-share")
|
||||
cfg.Share.AutoUpdate = true
|
||||
cfg.Share.StaleAfter = "15m"
|
||||
cfg.Desktop.Path = filepath.Join(dir, "empty-discord")
|
||||
require.NoError(t, os.MkdirAll(cfg.Desktop.Path, 0o755))
|
||||
require.NoError(t, config.Write(cfgPath, cfg))
|
||||
|
||||
hybrid := &hybridSyncService{}
|
||||
@ -990,33 +641,17 @@ func TestSyncSkipsGitShareByDefaultAndCanImportBeforeLiveDiscord(t *testing.T) {
|
||||
}
|
||||
|
||||
require.NoError(t, rt.dispatch([]string{"sync", "--all"}))
|
||||
require.False(t, hybrid.sawGitMessage)
|
||||
require.True(t, hybrid.sawGitMessage)
|
||||
|
||||
reader, err := store.Open(ctx, cfg.DBPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = reader.Close() }()
|
||||
rows, err := reader.ListMessages(ctx, store.MessageListOptions{Channel: "general", IncludeEmpty: true})
|
||||
require.NoError(t, err)
|
||||
contents := make([]string, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
contents = append(contents, row.Content)
|
||||
}
|
||||
require.NotContains(t, contents, "automatic updates work")
|
||||
require.Contains(t, contents, "live discord filled the delta")
|
||||
require.NoError(t, reader.Close())
|
||||
|
||||
hybrid.sawGitMessage = false
|
||||
require.NoError(t, rt.dispatch([]string{"sync", "--all", "--update=auto"}))
|
||||
require.True(t, hybrid.sawGitMessage)
|
||||
|
||||
reader, err = store.Open(ctx, cfg.DBPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = reader.Close() }()
|
||||
rows, err = reader.ListMessages(ctx, store.MessageListOptions{Channel: "general", IncludeEmpty: true})
|
||||
require.NoError(t, err)
|
||||
contents = contents[:0]
|
||||
for _, row := range rows {
|
||||
contents = append(contents, row.Content)
|
||||
}
|
||||
require.Contains(t, contents, "automatic updates work")
|
||||
require.Contains(t, contents, "live discord filled the delta")
|
||||
}
|
||||
@ -1054,63 +689,6 @@ func TestSyncLockSerializesConcurrentRuns(t *testing.T) {
|
||||
require.ErrorIs(t, err, context.DeadlineExceeded)
|
||||
}
|
||||
|
||||
func TestReadCommandsDoNotWaitForSyncLock(t *testing.T) {
|
||||
if goruntime.GOOS == "windows" {
|
||||
t.Skip("sync lock timing is flaky on Windows")
|
||||
}
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = filepath.Join(dir, "discrawl.db")
|
||||
cfgPath := filepath.Join(dir, "config.toml")
|
||||
require.NoError(t, config.Write(cfgPath, cfg))
|
||||
|
||||
s := seedCLIStore(t, cfg.DBPath)
|
||||
require.NoError(t, s.Close())
|
||||
|
||||
firstRelease, err := acquireSyncLock(ctx, filepath.Join(dir, ".discrawl-sync.lock"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = firstRelease() }()
|
||||
|
||||
for _, args := range [][]string{
|
||||
{"--config", cfgPath, "search", "automatic"},
|
||||
{"--config", cfgPath, "messages", "--channel", "general", "--last", "1"},
|
||||
{"--config", cfgPath, "sql", "select count(*) as total from messages"},
|
||||
} {
|
||||
runCtx, cancel := context.WithTimeout(ctx, 100*time.Millisecond)
|
||||
var out bytes.Buffer
|
||||
err := Run(runCtx, args, &out, &bytes.Buffer{})
|
||||
cancel()
|
||||
require.NoError(t, err, args)
|
||||
require.NotEmpty(t, out.String(), args)
|
||||
}
|
||||
}
|
||||
|
||||
func TestReadCommandsMigrateOlderLocalStore(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cfg := config.Default()
|
||||
cfg.DBPath = filepath.Join(dir, "discrawl.db")
|
||||
cfgPath := filepath.Join(dir, "config.toml")
|
||||
require.NoError(t, config.Write(cfgPath, cfg))
|
||||
|
||||
s := seedCLIStore(t, cfg.DBPath)
|
||||
_, err := s.DB().ExecContext(ctx, `pragma user_version = 1`)
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, s.Close())
|
||||
|
||||
var out bytes.Buffer
|
||||
require.NoError(t, Run(ctx, []string{"--config", cfgPath, "search", "automatic"}, &out, &bytes.Buffer{}))
|
||||
require.Contains(t, out.String(), "automatic updates work")
|
||||
|
||||
reader, err := store.OpenReadOnly(ctx, cfg.DBPath)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = reader.Close() }()
|
||||
var version int
|
||||
require.NoError(t, reader.DB().QueryRowContext(ctx, `pragma user_version`).Scan(&version))
|
||||
require.Equal(t, 2, version)
|
||||
}
|
||||
|
||||
func seedCLIStore(t *testing.T, path string) *store.Store {
|
||||
t.Helper()
|
||||
ctx := context.Background()
|
||||
@ -1963,17 +1541,6 @@ func TestHelpers(t *testing.T) {
|
||||
|
||||
require.Equal(t, []string{"a", "b"}, csvList("a,b,a"))
|
||||
require.Equal(t, "x", (&cliError{code: 2, err: assertErr("x")}).Error())
|
||||
mode, err := syncShareUpdateMode([]string{"--all"})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, shareUpdateNever, mode)
|
||||
mode, err = syncShareUpdateMode([]string{"--update=auto"})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, shareUpdateAuto, mode)
|
||||
mode, err = syncShareUpdateMode([]string{"--update", "force"})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, shareUpdateForce, mode)
|
||||
_, err = syncShareUpdateMode([]string{"--update"})
|
||||
require.Error(t, err)
|
||||
require.Equal(t, 2, ExitCode(usageErr(assertErr("x"))))
|
||||
require.Equal(t, 4, ExitCode(authErr(assertErr("x"))))
|
||||
require.Equal(t, 5, ExitCode(dbErr(assertErr("x"))))
|
||||
@ -2015,49 +1582,7 @@ func TestRuntimeHelpersAndSubcommands(t *testing.T) {
|
||||
s, err := store.Open(ctx, dbPath)
|
||||
require.NoError(t, err)
|
||||
require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "c1", GuildID: "g1", Kind: "text", Name: "general", RawJSON: `{}`}))
|
||||
require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "dm1", GuildID: store.DirectMessageGuildID, Kind: "dm", Name: "Alice", RawJSON: `{}`}))
|
||||
require.NoError(t, s.UpsertMember(ctx, store.MemberRecord{GuildID: "g1", UserID: "u1", Username: "peter", RoleIDsJSON: `[]`, RawJSON: `{}`}))
|
||||
base := time.Date(2026, 3, 8, 10, 0, 0, 0, time.UTC)
|
||||
require.NoError(t, s.UpsertMessages(ctx, []store.MessageMutation{
|
||||
{
|
||||
Record: store.MessageRecord{
|
||||
ID: "m1",
|
||||
GuildID: "g1",
|
||||
ChannelID: "c1",
|
||||
ChannelName: "general",
|
||||
AuthorID: "u1",
|
||||
AuthorName: "peter",
|
||||
CreatedAt: base.Format(time.RFC3339Nano),
|
||||
Content: "hello <@u1> in <#c1>",
|
||||
NormalizedContent: "hello <@u1> in <#c1>",
|
||||
RawJSON: `{"author":{"username":"peter"}}`,
|
||||
},
|
||||
Mentions: []store.MentionEventRecord{{
|
||||
MessageID: "m1",
|
||||
GuildID: "g1",
|
||||
ChannelID: "c1",
|
||||
AuthorID: "u1",
|
||||
TargetType: "user",
|
||||
TargetID: "u1",
|
||||
TargetName: "peter",
|
||||
EventAt: base.Format(time.RFC3339Nano),
|
||||
}},
|
||||
},
|
||||
{
|
||||
Record: store.MessageRecord{
|
||||
ID: "dm-msg",
|
||||
GuildID: store.DirectMessageGuildID,
|
||||
ChannelID: "dm1",
|
||||
ChannelName: "Alice",
|
||||
AuthorID: "u2",
|
||||
AuthorName: "Alice",
|
||||
CreatedAt: base.Add(time.Minute).Format(time.RFC3339Nano),
|
||||
Content: "private hello",
|
||||
NormalizedContent: "private hello",
|
||||
RawJSON: `{"source":"discord_desktop"}`,
|
||||
},
|
||||
},
|
||||
}))
|
||||
require.NoError(t, s.Close())
|
||||
|
||||
rt := &runtime{
|
||||
@ -2077,23 +1602,11 @@ func TestRuntimeHelpersAndSubcommands(t *testing.T) {
|
||||
require.NoError(t, rt.runMessages([]string{"--channel", "#general", "--hours", "6", "--last", "1"}))
|
||||
require.NoError(t, rt.runMessages([]string{"--channel", "#general", "--days", "7", "--all"}))
|
||||
require.NoError(t, rt.runMessages([]string{"--channel", "#general", "--days", "7", "--all", "--include-empty"}))
|
||||
require.NoError(t, rt.runMessages([]string{"--channel", "#general", "--since", "2026-03-08T00:00:00Z", "--before", "2026-03-09T00:00:00Z", "--limit", "1"}))
|
||||
require.NoError(t, rt.runMessages([]string{"--dm", "--channel", "Alice", "--last", "1"}))
|
||||
require.NoError(t, rt.runDirectMessages([]string{"--list"}))
|
||||
require.NoError(t, rt.runDirectMessages([]string{"--with", "Alice", "--search", "private", "--limit", "1"}))
|
||||
require.NoError(t, rt.runDirectMessages([]string{"--with", "Alice", "--since", "2026-03-08T00:00:00Z", "--before", "2026-03-09T00:00:00Z", "--all"}))
|
||||
require.NoError(t, rt.runMentions([]string{"--channel", "#general", "--target", "u2"}))
|
||||
require.NoError(t, rt.runMentions([]string{"--channel", "#general", "--days", "7", "--type", "user"}))
|
||||
require.NoError(t, rt.runDigest([]string{"--since", "12h", "--channel", "general", "--top-n", "2"}))
|
||||
require.NoError(t, rt.runReport([]string{"--readme", filepath.Join(dir, "README.md")}))
|
||||
require.NoError(t, rt.runSearch([]string{"--include-empty", "Peter"}))
|
||||
require.NoError(t, rt.runChannels([]string{"show", "c1"}))
|
||||
require.NoError(t, rt.runChannels([]string{"list"}))
|
||||
require.NoError(t, rt.runStatus(nil))
|
||||
require.NoError(t, rt.runAnalytics([]string{}))
|
||||
require.NoError(t, rt.runTUI([]string{"--json", "--limit", "1", "--include-empty"}))
|
||||
require.NoError(t, rt.runAnalytics([]string{"quiet", "--since", "1d"}))
|
||||
require.NoError(t, rt.runAnalytics([]string{"trends", "--weeks", "1", "--channel", "general"}))
|
||||
return nil
|
||||
}))
|
||||
}
|
||||
@ -2411,8 +1924,6 @@ func TestCommandUsageErrors(t *testing.T) {
|
||||
require.Equal(t, 2, ExitCode(rt.runMessages([]string{"--days", "-1"})))
|
||||
require.Equal(t, 2, ExitCode(rt.runMessages([]string{"--days", "1", "--since", "2026-03-01T00:00:00Z"})))
|
||||
require.Equal(t, 2, ExitCode(rt.runSync([]string{"--all", "--guild", "g1"})))
|
||||
require.Equal(t, 2, ExitCode(rt.runSync([]string{"--update", "bogus"})))
|
||||
require.Equal(t, 2, ExitCode(rt.runSync([]string{"--update=force", "--no-update"})))
|
||||
require.Equal(t, 2, ExitCode(rt.runChannels(nil)))
|
||||
require.Equal(t, 2, ExitCode(rt.runStatus([]string{"extra"})))
|
||||
require.NoError(t, (&runtime{stdout: &bytes.Buffer{}}).runDoctor(nil))
|
||||
|
||||
@ -1,96 +0,0 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"io"
|
||||
"os"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/crawlkit/control"
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func (r *runtime) runMetadata(args []string) error {
|
||||
fs := flag.NewFlagSet("metadata", flag.ContinueOnError)
|
||||
fs.SetOutput(io.Discard)
|
||||
jsonOut := fs.Bool("json", false, "")
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
return usageErr(errors.New("metadata takes flags only"))
|
||||
}
|
||||
if *jsonOut {
|
||||
r.json = true
|
||||
}
|
||||
cfg := config.Default()
|
||||
manifest := control.NewManifest("discrawl", "Discord Crawl", "discrawl")
|
||||
manifest.Description = "Local-first Discord archive crawler."
|
||||
manifest.Branding = control.Branding{SymbolName: "bubble.left.and.bubble.right.fill", AccentColor: "#5865f2", BundleIdentifier: "com.hnc.Discord"}
|
||||
manifest.Paths = control.Paths{
|
||||
DefaultConfig: config.ResolvePath(""),
|
||||
ConfigEnv: config.DefaultConfigEnv,
|
||||
DefaultDatabase: cfg.DBPath,
|
||||
DefaultCache: cfg.CacheDir,
|
||||
DefaultLogs: cfg.LogDir,
|
||||
DefaultShare: cfg.Share.RepoPath,
|
||||
}
|
||||
manifest.Capabilities = []string{"metadata", "status", "doctor", "sync", "tap", "tui", "git-share", "sql", "embeddings"}
|
||||
manifest.Privacy = control.Privacy{ContainsPrivateMessages: true, ExportsSecrets: false, LocalOnlyScopes: []string{"discord", "desktop-cache", "sqlite", "git-share"}}
|
||||
manifest.Commands = map[string]control.Command{
|
||||
"status": {Title: "Status", Argv: []string{"discrawl", "status", "--json"}, JSON: true},
|
||||
"doctor": {Title: "Doctor", Argv: []string{"discrawl", "doctor", "--json"}, JSON: true},
|
||||
"sync": {Title: "Sync", Argv: []string{"discrawl", "--json", "sync"}, JSON: true, Mutates: true},
|
||||
"tap": {Title: "Import desktop cache", Argv: []string{"discrawl", "--json", "tap"}, JSON: true, Mutates: true},
|
||||
"cache-import": {Title: "Import desktop cache", Argv: []string{"discrawl", "--json", "cache-import"}, JSON: true, Mutates: true},
|
||||
"wiretap": {Title: "Legacy desktop cache import", Argv: []string{"discrawl", "--json", "wiretap"}, JSON: true, Mutates: true, Legacy: true, Deprecated: true},
|
||||
"tui": {Title: "Terminal browser", Argv: []string{"discrawl", "tui"}},
|
||||
"tui-json": {Title: "Terminal browser rows", Argv: []string{"discrawl", "tui", "--json"}, JSON: true},
|
||||
"publish": {Title: "Publish share", Argv: []string{"discrawl", "--json", "publish"}, JSON: true, Mutates: true},
|
||||
"subscribe": {Title: "Subscribe share", Argv: []string{"discrawl", "--json", "subscribe"}, JSON: true, Mutates: true},
|
||||
"update": {Title: "Update share", Argv: []string{"discrawl", "--json", "update"}, JSON: true, Mutates: true},
|
||||
}
|
||||
return r.print(manifest)
|
||||
}
|
||||
|
||||
func controlStatus(configPath string, cfg config.Config, status store.Status, shareNeedsUpdate bool) control.Status {
|
||||
counts := []control.Count{
|
||||
control.NewCount("guilds", "Guilds", int64(status.GuildCount)),
|
||||
control.NewCount("channels", "Channels", int64(status.ChannelCount)),
|
||||
control.NewCount("threads", "Threads", int64(status.ThreadCount)),
|
||||
control.NewCount("messages", "Messages", int64(status.MessageCount)),
|
||||
control.NewCount("members", "Members", int64(status.MemberCount)),
|
||||
control.NewCount("embedding_backlog", "Embedding backlog", int64(status.EmbeddingBacklog)),
|
||||
}
|
||||
out := control.NewStatus("discrawl", fmt.Sprintf("%d messages across %d channels", status.MessageCount, status.ChannelCount))
|
||||
out.State = "current"
|
||||
out.ConfigPath = configPath
|
||||
out.DatabasePath = status.DBPath
|
||||
out.Counts = counts
|
||||
if !status.LastSyncAt.IsZero() {
|
||||
out.LastSyncAt = status.LastSyncAt.UTC().Format(time.RFC3339)
|
||||
}
|
||||
db := control.SQLiteDatabase("primary", "Discord archive", "archive", status.DBPath, true, counts)
|
||||
out.DatabaseBytes = db.Bytes
|
||||
out.WALBytes = fileSize(status.DBPath + "-wal")
|
||||
out.Databases = []control.Database{db}
|
||||
out.Share = &control.Share{
|
||||
Enabled: cfg.ShareEnabled(),
|
||||
RepoPath: cfg.Share.RepoPath,
|
||||
Remote: cfg.Share.Remote,
|
||||
Branch: cfg.Share.Branch,
|
||||
NeedsUpdate: shareNeedsUpdate,
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// fileSize reports the size in bytes of the file at path, or 0 when the file
// cannot be stat'ed (for example because it does not exist).
func fileSize(path string) int64 {
	if info, err := os.Stat(path); err == nil {
		return info.Size()
	}
	return 0
}
|
||||
@ -9,7 +9,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/report"
|
||||
"github.com/steipete/discrawl/internal/report"
|
||||
)
|
||||
|
||||
func (r *runtime) runDigest(args []string) error {
|
||||
|
||||
@ -10,8 +10,8 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestParseLookback(t *testing.T) {
|
||||
|
||||
@ -8,7 +8,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
const defaultDMLast = 50
|
||||
|
||||
@ -6,7 +6,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func (r *runtime) resolveSyncGuilds(guild, guilds string) []string {
|
||||
|
||||
@ -8,7 +8,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func (r *runtime) runMentions(args []string) error {
|
||||
|
||||
@ -8,7 +8,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
const defaultMessageLimit = 200
|
||||
|
||||
@ -11,10 +11,10 @@ import (
|
||||
"text/tabwriter"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/discorddesktop"
|
||||
"github.com/openclaw/discrawl/internal/report"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/openclaw/discrawl/internal/syncer"
|
||||
"github.com/steipete/discrawl/internal/discorddesktop"
|
||||
"github.com/steipete/discrawl/internal/report"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/syncer"
|
||||
)
|
||||
|
||||
func (r *runtime) print(value any) error {
|
||||
@ -100,16 +100,11 @@ Usage:
|
||||
discrawl [global flags] <command> [args]
|
||||
|
||||
Commands:
|
||||
metadata
|
||||
version
|
||||
init
|
||||
sync
|
||||
tail
|
||||
tap
|
||||
cache-import
|
||||
wiretap
|
||||
search
|
||||
tui
|
||||
messages
|
||||
digest
|
||||
analytics
|
||||
@ -147,8 +142,8 @@ func printHuman(w io.Writer, value any) error {
|
||||
}
|
||||
}
|
||||
if v.Wiretap != nil {
|
||||
if _, err := fmt.Fprintf(w, "wiretap_visited=%d\nwiretap_files=%d\nwiretap_unchanged=%d\nwiretap_fast_skipped=%d\nwiretap_messages=%d\nwiretap_dm_messages=%d\nwiretap_dm_channels=%d\nwiretap_guild_messages=%d\nwiretap_skipped_messages=%d\nwiretap_skipped_channels=%d\nwiretap_checkpoints=%d\n",
|
||||
v.Wiretap.FilesVisited, v.Wiretap.FilesScanned, v.Wiretap.FilesUnchanged, v.Wiretap.CacheFilesFastSkipped, v.Wiretap.Messages, v.Wiretap.DMMessages, v.Wiretap.DMChannels, v.Wiretap.GuildMessages, v.Wiretap.SkippedMessages, v.Wiretap.SkippedChannels, v.Wiretap.Checkpoints); err != nil {
|
||||
if _, err := fmt.Fprintf(w, "wiretap_files=%d\nwiretap_unchanged=%d\nwiretap_messages=%d\nwiretap_dm_messages=%d\nwiretap_dm_channels=%d\nwiretap_guild_messages=%d\nwiretap_skipped_messages=%d\nwiretap_skipped_channels=%d\n",
|
||||
v.Wiretap.FilesScanned, v.Wiretap.FilesUnchanged, v.Wiretap.Messages, v.Wiretap.DMMessages, v.Wiretap.DMChannels, v.Wiretap.GuildMessages, v.Wiretap.SkippedMessages, v.Wiretap.SkippedChannels); err != nil {
|
||||
return err
|
||||
}
|
||||
}
|
||||
@ -157,8 +152,8 @@ func printHuman(w io.Writer, value any) error {
|
||||
_, err := fmt.Fprintf(w, "guilds=%d channels=%d threads=%d members=%d messages=%d\n", v.Guilds, v.Channels, v.Threads, v.Members, v.Messages)
|
||||
return err
|
||||
case discorddesktop.Stats:
|
||||
_, err := fmt.Fprintf(w, "path=%s\nvisited=%d\nfiles=%d\nskipped=%d\nunchanged=%d\nfast_skipped=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ncheckpoints=%d\nfull_cache=%t\ndry_run=%t\n",
|
||||
v.Path, v.FilesVisited, v.FilesScanned, v.FilesSkipped, v.FilesUnchanged, v.CacheFilesFastSkipped, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.Checkpoints, v.FullCache, v.DryRun)
|
||||
_, err := fmt.Fprintf(w, "path=%s\nfiles=%d\nskipped=%d\nunchanged=%d\nobjects=%d\nguilds=%d\nchannels=%d\nmessages=%d\ndm_messages=%d\ndm_channels=%d\nguild_messages=%d\nskipped_messages=%d\nskipped_channels=%d\ndry_run=%t\n",
|
||||
v.Path, v.FilesScanned, v.FilesSkipped, v.FilesUnchanged, v.JSONObjects, v.Guilds, v.Channels, v.Messages, v.DMMessages, v.DMChannels, v.GuildMessages, v.SkippedMessages, v.SkippedChannels, v.DryRun)
|
||||
return err
|
||||
case store.Status:
|
||||
_, err := fmt.Fprintf(w, "db=%s\nguilds=%d\nchannels=%d\nthreads=%d\nmessages=%d\nmembers=%d\nembedding_backlog=%d\nlast_sync=%s\nlast_tail_event=%s\n",
|
||||
|
||||
@ -7,8 +7,8 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/openclaw/discrawl/internal/syncer"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/syncer"
|
||||
)
|
||||
|
||||
func TestPrintRows(t *testing.T) {
|
||||
|
||||
@ -9,9 +9,9 @@ import (
|
||||
"os"
|
||||
"strings"
|
||||
|
||||
"github.com/openclaw/crawlkit/embed"
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/embed"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func (r *runtime) runSearch(args []string) error {
|
||||
@ -25,7 +25,7 @@ func (r *runtime) runSearch(args []string) error {
|
||||
dm := fs.Bool("dm", false, "")
|
||||
guildsFlag := fs.String("guilds", "", "")
|
||||
guildFlag := fs.String("guild", "", "")
|
||||
if err := fs.Parse(permuteSearchFlags(args)); err != nil {
|
||||
if err := fs.Parse(args); err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
if fs.NArg() != 1 {
|
||||
@ -67,51 +67,6 @@ func (r *runtime) runSearch(args []string) error {
|
||||
}
|
||||
}
|
||||
|
||||
// permuteSearchFlags reorders search arguments so that every recognized flag
// (together with its value) comes before the positional arguments, which lets
// the flag package accept flags written after the query.
//
// Recognized value flags may be written as "--name value" or "--name=value";
// recognized boolean flags as "--name" or "--name=value". Anything else is
// treated as positional and keeps its relative order.
//
// A "--" terminator ends flag recognition. When arguments follow it, the
// terminator is kept in front of them in the result so that the flag package
// still treats them as positional even if they start with "-" (for example a
// query such as "-foo"). A trailing "--" with nothing after it is dropped.
func permuteSearchFlags(args []string) []string {
	valueFlags := map[string]struct{}{
		"--mode":    {},
		"--channel": {},
		"--author":  {},
		"--limit":   {},
		"--guilds":  {},
		"--guild":   {},
	}
	boolFlags := map[string]struct{}{
		"--include-empty": {},
		"--dm":            {},
	}
	flags := make([]string, 0, len(args))
	positionals := make([]string, 0, len(args))
	for i := 0; i < len(args); i++ {
		arg := args[i]
		if arg == "--" {
			// Preserve the terminator: without it, the arguments that follow
			// would be parsed as flags again once they are moved behind the
			// recognized flags.
			if rest := args[i+1:]; len(rest) > 0 {
				positionals = append(positionals, arg)
				positionals = append(positionals, rest...)
			}
			break
		}
		if name, _, ok := strings.Cut(arg, "="); ok {
			if _, known := valueFlags[name]; known {
				flags = append(flags, arg)
				continue
			}
			if _, known := boolFlags[name]; known {
				flags = append(flags, arg)
				continue
			}
		}
		if _, known := boolFlags[arg]; known {
			flags = append(flags, arg)
			continue
		}
		// A value flag without a following argument falls through and is left
		// among the positionals unchanged.
		if _, known := valueFlags[arg]; known && i+1 < len(args) {
			flags = append(flags, arg, args[i+1])
			i++
			continue
		}
		positionals = append(positionals, arg)
	}
	return append(flags, positionals...)
}
|
||||
|
||||
func (r *runtime) searchMessagesSemantic(opts store.SearchOptions) ([]store.SearchResult, error) {
|
||||
semanticOpts, err := r.semanticSearchOptions(opts)
|
||||
if err != nil {
|
||||
@ -157,7 +112,7 @@ func (r *runtime) semanticSearchOptions(opts store.SearchOptions) (store.Semanti
|
||||
providerFactory := r.newEmbed
|
||||
if providerFactory == nil {
|
||||
providerFactory = func(cfg config.EmbeddingsConfig) (embed.Provider, error) {
|
||||
return embed.NewProvider(crawlkitEmbeddingConfig(cfg))
|
||||
return embed.NewProvider(cfg)
|
||||
}
|
||||
}
|
||||
provider, err := providerFactory(r.cfg.Search.Embeddings)
|
||||
|
||||
@ -6,7 +6,7 @@ import (
|
||||
"slices"
|
||||
"strings"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/syncer"
|
||||
"github.com/steipete/discrawl/internal/syncer"
|
||||
)
|
||||
|
||||
func (r *runtime) syncMessagesQuery(channel, guild, guilds string) error {
|
||||
@ -96,27 +96,3 @@ func hasBoolFlag(args []string, name string) bool {
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
// boolFlagEnabled reports whether args switches the boolean flag name on.
//
// A bare occurrence of name counts as enabled. A "name=value" occurrence
// counts only when value, trimmed and lower-cased, is one of
// 1, t, true, y, yes or on. The first enabling occurrence wins; other
// occurrences are ignored.
func boolFlagEnabled(args []string, name string) bool {
	assignment := name + "="
	for idx := range args {
		candidate := args[idx]
		if candidate == name {
			return true
		}
		value, assigned := strings.CutPrefix(candidate, assignment)
		if !assigned {
			continue
		}
		switch strings.ToLower(strings.TrimSpace(value)) {
		case "1", "t", "true", "y", "yes", "on":
			return true
		}
	}
	return false
}
|
||||
|
||||
// hasHelpArg reports whether any argument is exactly "help", "--help" or "-h".
func hasHelpArg(args []string) bool {
	for idx := range args {
		switch args[idx] {
		case "help", "--help", "-h":
			return true
		}
	}
	return false
}
|
||||
|
||||
@ -8,8 +8,8 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestMessageSyncOptionsNumericChannelID(t *testing.T) {
|
||||
|
||||
@ -5,7 +5,7 @@ import (
|
||||
"flag"
|
||||
"io"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/report"
|
||||
"github.com/steipete/discrawl/internal/report"
|
||||
)
|
||||
|
||||
func (r *runtime) runReport(args []string) error {
|
||||
|
||||
@ -6,10 +6,10 @@ import (
|
||||
"io"
|
||||
"os"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/openclaw/discrawl/internal/report"
|
||||
"github.com/openclaw/discrawl/internal/share"
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/report"
|
||||
"github.com/steipete/discrawl/internal/share"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func (r *runtime) runPublish(args []string) error {
|
||||
@ -136,15 +136,13 @@ func (r *runtime) runSubscribe(args []string) error {
|
||||
if err != nil {
|
||||
return configErr(err)
|
||||
}
|
||||
opts := share.Options{RepoPath: expandedRepo, Remote: cfg.Share.Remote, Branch: cfg.Share.Branch, Progress: r.shareProgress}
|
||||
opts := share.Options{RepoPath: expandedRepo, Remote: cfg.Share.Remote, Branch: cfg.Share.Branch}
|
||||
if *withEmbeddings {
|
||||
applyEmbeddingShareOptions(&opts, cfg)
|
||||
}
|
||||
r.setSyncLockPhase("share pull")
|
||||
if err := share.Pull(r.ctx, opts); err != nil {
|
||||
return err
|
||||
}
|
||||
r.setSyncLockPhase("share import")
|
||||
manifest, imported, err := share.ImportIfChanged(r.ctx, s, opts)
|
||||
if err != nil {
|
||||
return err
|
||||
@ -178,15 +176,12 @@ func (r *runtime) runUpdate(args []string) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
opts.Progress = r.shareProgress
|
||||
if *withEmbeddings {
|
||||
applyEmbeddingShareOptions(&opts, r.cfg)
|
||||
}
|
||||
r.setSyncLockPhase("share pull")
|
||||
if err := share.Pull(r.ctx, opts); err != nil {
|
||||
return err
|
||||
}
|
||||
r.setSyncLockPhase("share import")
|
||||
manifest, imported, err := share.ImportIfChanged(r.ctx, r.store, opts)
|
||||
if err != nil {
|
||||
return err
|
||||
|
||||
@ -1,110 +0,0 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/share"
|
||||
)
|
||||
|
||||
// shareUpdateMode names a share-update policy selected on the command line or
// derived from a boolean setting.
type shareUpdateMode string

const (
	// shareUpdateConfigured is what boolShareUpdateMode returns for true; it
	// is not accepted by parseShareUpdateMode.
	shareUpdateConfigured shareUpdateMode = "configured"
	shareUpdateAuto       shareUpdateMode = "auto"
	shareUpdateNever      shareUpdateMode = "never"
	shareUpdateForce      shareUpdateMode = "force"
)

// boolShareUpdateMode maps a boolean switch onto a mode: configured when
// enabled, never otherwise.
func boolShareUpdateMode(enabled bool) shareUpdateMode {
	if enabled {
		return shareUpdateConfigured
	}
	return shareUpdateNever
}

// parseShareUpdateMode parses the value of --update. Matching ignores case and
// surrounding whitespace; an empty value means auto. Any value other than
// auto, never or force yields an error.
func parseShareUpdateMode(raw string) (shareUpdateMode, error) {
	switch shareUpdateMode(strings.ToLower(strings.TrimSpace(raw))) {
	case "", shareUpdateAuto:
		return shareUpdateAuto, nil
	case shareUpdateNever:
		return shareUpdateNever, nil
	case shareUpdateForce:
		return shareUpdateForce, nil
	default:
		return "", fmt.Errorf("invalid --update %q; use auto, never, or force", raw)
	}
}

// syncShareUpdateMode scans args for --no-update, "--update VALUE" and
// "--update=VALUE" and returns the resulting mode. Without any of these flags
// the mode is never. When --update is repeated, the last value wins.
//
// Combining --no-update with an --update value other than never is rejected
// regardless of the order in which the two flags appear; previously the
// conflict went unnoticed when --no-update came last.
func syncShareUpdateMode(args []string) (shareUpdateMode, error) {
	var (
		sawNoUpdate bool
		sawUpdate   bool
		updateMode  shareUpdateMode // last value given to --update
	)
	for i := 0; i < len(args); i++ {
		arg := args[i]
		switch {
		case arg == "--no-update":
			sawNoUpdate = true
		case arg == "--update":
			if i+1 >= len(args) {
				return "", errors.New("--update requires auto, never, or force")
			}
			parsed, err := parseShareUpdateMode(args[i+1])
			if err != nil {
				return "", err
			}
			sawUpdate = true
			updateMode = parsed
			i++ // the value has been consumed
		case strings.HasPrefix(arg, "--update="):
			parsed, err := parseShareUpdateMode(strings.TrimPrefix(arg, "--update="))
			if err != nil {
				return "", err
			}
			sawUpdate = true
			updateMode = parsed
		}
	}
	// Judge the conflict on the --update value itself, not on whichever flag
	// happened to be written last.
	if sawNoUpdate && sawUpdate && updateMode != shareUpdateNever {
		return "", errors.New("use either --no-update or --update, not both")
	}
	if sawUpdate && !sawNoUpdate {
		return updateMode, nil
	}
	return shareUpdateNever, nil
}
|
||||
|
||||
func (r *runtime) shareProgress(progress share.ImportProgress) {
|
||||
if progress.Phase == "" {
|
||||
return
|
||||
}
|
||||
phase := "share " + progress.Phase
|
||||
if progress.Table != "" {
|
||||
phase += " " + progress.Table
|
||||
}
|
||||
if progress.File != "" {
|
||||
phase += " " + progress.File
|
||||
}
|
||||
r.setSyncLockPhase(phase)
|
||||
attrs := []any{"phase", progress.Phase}
|
||||
if progress.Table != "" {
|
||||
attrs = append(attrs, "table", progress.Table)
|
||||
}
|
||||
if progress.Rows != 0 {
|
||||
attrs = append(attrs, "rows", progress.Rows)
|
||||
}
|
||||
if progress.TotalRows != 0 {
|
||||
attrs = append(attrs, "total_rows", progress.TotalRows)
|
||||
}
|
||||
if progress.File != "" {
|
||||
attrs = append(attrs, "file", progress.File, "file_index", progress.FileIndex, "file_count", progress.FileCount)
|
||||
}
|
||||
r.logger.Info("share import progress", attrs...)
|
||||
}
|
||||
|
||||
func (r *runtime) nowUTC() time.Time {
|
||||
if r.now != nil {
|
||||
return r.now().UTC()
|
||||
}
|
||||
return time.Now().UTC()
|
||||
}
|
||||
@ -3,12 +3,9 @@ package cli
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/config"
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
)
|
||||
|
||||
func (r *runtime) withSyncLock(fn func() error) error {
|
||||
@ -24,60 +21,13 @@ func (r *runtime) withSyncLock(fn func() error) error {
|
||||
return err
|
||||
}
|
||||
r.dbLockHeld = true
|
||||
r.lockStarted = r.nowUTC()
|
||||
r.setSyncLockPhase("locked")
|
||||
defer func() {
|
||||
r.dbLockHeld = false
|
||||
r.lockStarted = time.Time{}
|
||||
_ = release()
|
||||
}()
|
||||
return fn()
|
||||
}
|
||||
|
||||
func (r *runtime) tryWithSyncLock(fn func() error) (bool, error) {
|
||||
if r.dbLockHeld {
|
||||
return true, fn()
|
||||
}
|
||||
lockPath, err := r.syncLockPath()
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
release, locked, err := tryAcquireSyncLock(lockPath)
|
||||
if err != nil || !locked {
|
||||
return locked, err
|
||||
}
|
||||
r.dbLockHeld = true
|
||||
r.lockStarted = r.nowUTC()
|
||||
r.setSyncLockPhase("locked")
|
||||
defer func() {
|
||||
r.dbLockHeld = false
|
||||
r.lockStarted = time.Time{}
|
||||
_ = release()
|
||||
}()
|
||||
return true, fn()
|
||||
}
|
||||
|
||||
func (r *runtime) setSyncLockPhase(phase string) {
|
||||
if !r.dbLockHeld {
|
||||
return
|
||||
}
|
||||
path, err := r.syncLockPath()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
started := r.lockStarted
|
||||
if started.IsZero() {
|
||||
started = r.nowUTC()
|
||||
}
|
||||
body := fmt.Sprintf("pid=%d\nstarted_at=%s\nupdated_at=%s\nphase=%s\n",
|
||||
os.Getpid(),
|
||||
started.Format(time.RFC3339Nano),
|
||||
r.nowUTC().Format(time.RFC3339Nano),
|
||||
phase,
|
||||
)
|
||||
_ = os.WriteFile(path, []byte(body), 0o600)
|
||||
}
|
||||
|
||||
func (r *runtime) syncLockPath() (string, error) {
|
||||
dbPath, err := config.ExpandPath(r.cfg.DBPath)
|
||||
if err != nil {
|
||||
@ -88,12 +38,6 @@ func (r *runtime) syncLockPath() (string, error) {
|
||||
|
||||
func syncLockErr(ctx context.Context, path string) error {
|
||||
if ctx.Err() != nil {
|
||||
if body, err := os.ReadFile(path); err == nil {
|
||||
details := strings.TrimSpace(string(body))
|
||||
if details != "" {
|
||||
return fmt.Errorf("wait for sync lock %s (%s): %w", path, strings.ReplaceAll(details, "\n", ", "), ctx.Err())
|
||||
}
|
||||
}
|
||||
return fmt.Errorf("wait for sync lock %s: %w", path, ctx.Err())
|
||||
}
|
||||
return nil
|
||||
|
||||
@ -7,7 +7,3 @@ import "context"
|
||||
// acquireSyncLock ignores both arguments and returns a release function that
// does nothing, together with a nil error.
func acquireSyncLock(context.Context, string) (func() error, error) {
	release := func() error {
		return nil
	}
	return release, nil
}
|
||||
|
||||
// tryAcquireSyncLock ignores its argument and always reports success: it
// returns a release function that does nothing, true, and a nil error.
func tryAcquireSyncLock(string) (func() error, bool, error) {
	release := func() error {
		return nil
	}
	return release, true, nil
}
|
||||
|
||||
@ -51,29 +51,3 @@ func acquireSyncLock(ctx context.Context, path string) (func() error, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func tryAcquireSyncLock(path string) (func() error, bool, error) {
|
||||
file, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0o600)
|
||||
if err != nil {
|
||||
return nil, false, fmt.Errorf("open sync lock: %w", err)
|
||||
}
|
||||
err = unix.Flock(int(file.Fd()), unix.LOCK_EX|unix.LOCK_NB)
|
||||
if err != nil {
|
||||
_ = file.Close()
|
||||
if errors.Is(err, unix.EWOULDBLOCK) || errors.Is(err, unix.EAGAIN) {
|
||||
return nil, false, nil
|
||||
}
|
||||
return nil, false, fmt.Errorf("acquire sync lock: %w", err)
|
||||
}
|
||||
_, _ = file.Seek(0, 0)
|
||||
_ = file.Truncate(0)
|
||||
_, _ = fmt.Fprintf(file, "pid=%d\n", os.Getpid())
|
||||
return func() error {
|
||||
unlockErr := unix.Flock(int(file.Fd()), unix.LOCK_UN)
|
||||
closeErr := file.Close()
|
||||
if unlockErr != nil {
|
||||
return unlockErr
|
||||
}
|
||||
return closeErr
|
||||
}, true, nil
|
||||
}
|
||||
|
||||
@ -49,28 +49,3 @@ func acquireSyncLock(ctx context.Context, path string) (func() error, error) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func tryAcquireSyncLock(path string) (func() error, bool, error) {
|
||||
file, err := os.OpenFile(path, os.O_CREATE|os.O_RDWR, 0o600)
|
||||
if err != nil {
|
||||
return nil, false, fmt.Errorf("open sync lock: %w", err)
|
||||
}
|
||||
handle := windows.Handle(file.Fd())
|
||||
overlapped := &windows.Overlapped{}
|
||||
err = windows.LockFileEx(handle, windows.LOCKFILE_EXCLUSIVE_LOCK|windows.LOCKFILE_FAIL_IMMEDIATELY, 0, 1, 0, overlapped)
|
||||
if err != nil {
|
||||
_ = file.Close()
|
||||
return nil, false, nil
|
||||
}
|
||||
_, _ = file.Seek(0, 0)
|
||||
_ = file.Truncate(0)
|
||||
_, _ = fmt.Fprintf(file, "pid=%d\n", os.Getpid())
|
||||
return func() error {
|
||||
unlockErr := windows.UnlockFileEx(handle, 0, 1, 0, overlapped)
|
||||
closeErr := file.Close()
|
||||
if unlockErr != nil {
|
||||
return unlockErr
|
||||
}
|
||||
return closeErr
|
||||
}, true, nil
|
||||
}
|
||||
|
||||
@ -1,239 +0,0 @@
|
||||
package cli
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"flag"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/openclaw/crawlkit/tui"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func (r *runtime) runTUI(args []string) error {
|
||||
fs := flag.NewFlagSet("tui", flag.ContinueOnError)
|
||||
fs.SetOutput(r.stderr)
|
||||
fs.Usage = func() {
|
||||
_, _ = fmt.Fprintln(fs.Output(), "Usage of tui:")
|
||||
fs.PrintDefaults()
|
||||
_, _ = fmt.Fprintln(fs.Output())
|
||||
_, _ = fmt.Fprintln(fs.Output(), tui.ControlsHelp())
|
||||
}
|
||||
if hasHelpArg(args) {
|
||||
fs.SetOutput(r.stdout)
|
||||
}
|
||||
channel := fs.String("channel", "", "channel id")
|
||||
author := fs.String("author", "", "author/user id")
|
||||
limit := fs.Int("limit", 200, "row limit")
|
||||
includeEmpty := fs.Bool("include-empty", false, "include empty messages")
|
||||
dm := fs.Bool("dm", false, "browse direct messages")
|
||||
guildsFlag := fs.String("guilds", "", "comma-separated guild ids")
|
||||
guildFlag := fs.String("guild", "", "guild id")
|
||||
jsonOut := fs.Bool("json", false, "write browser rows as JSON")
|
||||
if len(args) == 1 && args[0] == "help" {
|
||||
fs.Usage()
|
||||
return nil
|
||||
}
|
||||
if err := fs.Parse(args); err != nil {
|
||||
if errors.Is(err, flag.ErrHelp) {
|
||||
return nil
|
||||
}
|
||||
return usageErr(err)
|
||||
}
|
||||
if *jsonOut {
|
||||
r.json = true
|
||||
}
|
||||
if fs.NArg() != 0 {
|
||||
return usageErr(errors.New("tui takes flags only"))
|
||||
}
|
||||
if *limit <= 0 {
|
||||
return usageErr(errors.New("tui --limit must be positive"))
|
||||
}
|
||||
guildIDs, err := r.resolveTUIGuilds(*dm, *guildFlag, *guildsFlag)
|
||||
if err != nil {
|
||||
return usageErr(err)
|
||||
}
|
||||
if r.store == nil {
|
||||
return tui.Browse(r.ctx, tui.BrowseOptions{
|
||||
AppName: "discrawl",
|
||||
Title: "discrawl archive",
|
||||
EmptyMessage: "discrawl has no local messages yet",
|
||||
JSON: r.json,
|
||||
Layout: tui.LayoutChat,
|
||||
SourceKind: r.archiveSourceKind(),
|
||||
SourceLocation: r.archiveSourceLocation(),
|
||||
Stdout: r.stdout,
|
||||
})
|
||||
}
|
||||
loadRows := func() ([]tui.Row, error) {
|
||||
rows, err := r.store.ListMessagesWithThreadContext(r.ctx, store.MessageListOptions{
|
||||
GuildIDs: guildIDs,
|
||||
Channel: *channel,
|
||||
Author: *author,
|
||||
Last: *limit,
|
||||
IncludeEmpty: *includeEmpty,
|
||||
})
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return discordTUIRows(rows), nil
|
||||
}
|
||||
archiveRows, err := loadRows()
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return tui.Browse(r.ctx, tui.BrowseOptions{
|
||||
AppName: "discrawl",
|
||||
Title: "discrawl archive",
|
||||
EmptyMessage: "discrawl has no local messages yet",
|
||||
Rows: archiveRows,
|
||||
Refresh: func(context.Context) ([]tui.Row, error) { return loadRows() },
|
||||
JSON: r.json,
|
||||
Layout: tui.LayoutChat,
|
||||
SourceKind: r.archiveSourceKind(),
|
||||
SourceLocation: r.archiveSourceLocation(),
|
||||
Stdout: r.stdout,
|
||||
})
|
||||
}
|
||||
|
||||
func (r *runtime) resolveTUIGuilds(dm bool, guild, guilds string) ([]string, error) {
|
||||
guildIDs, err := directMessageGuildScope(dm, guild, guilds)
|
||||
if err != nil || dm || len(guildIDs) > 0 {
|
||||
return guildIDs, err
|
||||
}
|
||||
if defaultGuild := r.cfg.EffectiveDefaultGuildID(); defaultGuild != "" {
|
||||
return []string{defaultGuild}, nil
|
||||
}
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
func (r *runtime) archiveSourceKind() string {
|
||||
if strings.TrimSpace(r.cfg.Share.Remote) != "" {
|
||||
return tui.SourceRemote
|
||||
}
|
||||
return tui.SourceLocal
|
||||
}
|
||||
|
||||
func (r *runtime) archiveSourceLocation() string {
|
||||
if strings.TrimSpace(r.cfg.Share.Remote) != "" {
|
||||
return r.cfg.Share.Remote
|
||||
}
|
||||
return r.cfg.DBPath
|
||||
}
|
||||
|
||||
func discordTUIRows(rows []store.MessageRow) []tui.Row {
|
||||
items := make([]tui.Row, 0, len(rows))
|
||||
for _, row := range rows {
|
||||
content := discordDisplayContent(row)
|
||||
title := strings.TrimSpace(content)
|
||||
detail := discordDetailContent(row, content)
|
||||
if title == "" {
|
||||
title = firstNonEmpty(strings.TrimSpace(row.AttachmentText), row.MessageID)
|
||||
}
|
||||
tags := []string{row.GuildID, row.ChannelID}
|
||||
if row.GuildID == "@me" {
|
||||
tags = append(tags, "dm")
|
||||
}
|
||||
if row.Source != "" {
|
||||
tags = append(tags, row.Source)
|
||||
}
|
||||
items = append(items, tui.Row{
|
||||
Source: "discord",
|
||||
Kind: "message",
|
||||
ID: row.MessageID,
|
||||
ParentID: row.ReplyToMessage,
|
||||
Scope: discordScopeLabel(row),
|
||||
Container: discordContainerLabel(row),
|
||||
Author: discordAuthorLabel(row),
|
||||
Title: title,
|
||||
Text: content,
|
||||
Detail: detail,
|
||||
URL: discordMessageURL(row),
|
||||
CreatedAt: formatTime(row.CreatedAt),
|
||||
Tags: tags,
|
||||
Fields: map[string]string{
|
||||
"attachment_names": row.AttachmentNames,
|
||||
"attachments": boolString(row.HasAttachments),
|
||||
"author_id": row.AuthorID,
|
||||
"channel_id": row.ChannelID,
|
||||
"guild_id": row.GuildID,
|
||||
"pinned": boolString(row.Pinned),
|
||||
"reply_to": row.ReplyToMessage,
|
||||
"source": row.Source,
|
||||
},
|
||||
})
|
||||
}
|
||||
return items
|
||||
}
|
||||
|
||||
func discordDetailContent(row store.MessageRow, content string) string {
|
||||
var parts []string
|
||||
if strings.TrimSpace(content) != "" {
|
||||
parts = append(parts, strings.TrimSpace(content))
|
||||
}
|
||||
if strings.TrimSpace(row.AttachmentText) != "" {
|
||||
parts = append(parts, "Attachments\n"+strings.TrimSpace(row.AttachmentText))
|
||||
}
|
||||
if len(parts) == 0 {
|
||||
return ""
|
||||
}
|
||||
return strings.Join(parts, "\n\n")
|
||||
}
|
||||
|
||||
func discordDisplayContent(row store.MessageRow) string {
|
||||
if content := strings.TrimSpace(row.DisplayContent); content != "" {
|
||||
return content
|
||||
}
|
||||
return row.Content
|
||||
}
|
||||
|
||||
func discordMessageURL(row store.MessageRow) string {
|
||||
guildID := strings.TrimSpace(row.GuildID)
|
||||
channelID := strings.TrimSpace(row.ChannelID)
|
||||
messageID := strings.TrimSpace(row.MessageID)
|
||||
if guildID == "" || channelID == "" || messageID == "" {
|
||||
return ""
|
||||
}
|
||||
return "https://discord.com/channels/" + guildID + "/" + channelID + "/" + messageID
|
||||
}
|
||||
|
||||
func discordScopeLabel(row store.MessageRow) string {
|
||||
if row.GuildID == "@me" {
|
||||
return "Direct messages"
|
||||
}
|
||||
return firstNonEmpty(row.GuildName, row.GuildID)
|
||||
}
|
||||
|
||||
func discordContainerLabel(row store.MessageRow) string {
|
||||
if row.GuildID == "@me" {
|
||||
return firstNonEmpty(row.ChannelName, "DM "+compactDiscordID(row.ChannelID))
|
||||
}
|
||||
return firstNonEmpty(row.ChannelName, row.ChannelID)
|
||||
}
|
||||
|
||||
func discordAuthorLabel(row store.MessageRow) string {
|
||||
if name := strings.TrimSpace(row.AuthorName); name != "" {
|
||||
return name
|
||||
}
|
||||
if id := strings.TrimSpace(row.AuthorID); id != "" {
|
||||
return "user:" + compactDiscordID(id)
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
// compactDiscordID trims id and, when the result is longer than 10 bytes,
// shortens it to its first 6 and last 4 bytes joined by "...". Shorter ids are
// returned trimmed but otherwise unchanged.
func compactDiscordID(id string) string {
	const (
		headLen  = 6
		tailLen  = 4
		maxPlain = 10
	)
	trimmed := strings.TrimSpace(id)
	if n := len(trimmed); n > maxPlain {
		return trimmed[:headLen] + "..." + trimmed[n-tailLen:]
	}
	return trimmed
}
|
||||
|
||||
// boolString renders true as "true" and false as the empty string.
func boolString(value bool) string {
	if !value {
		return ""
	}
	return "true"
}
|
||||
@ -1,3 +1,3 @@
|
||||
package cli
|
||||
|
||||
var version = "0.7.0"
|
||||
var version = "0.6.3"
|
||||
|
||||
@ -9,7 +9,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
crawlconfig "github.com/openclaw/crawlkit/config"
|
||||
"github.com/pelletier/go-toml/v2"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -44,7 +44,6 @@ type DiscordConfig struct {
|
||||
type DesktopConfig struct {
|
||||
Path string `toml:"path"`
|
||||
MaxFileBytes int64 `toml:"max_file_bytes"`
|
||||
FullCache bool `toml:"full_cache"`
|
||||
}
|
||||
|
||||
type SyncConfig struct {
|
||||
@ -85,25 +84,14 @@ type TokenResolution struct {
|
||||
Path string
|
||||
}
|
||||
|
||||
var appConfig = crawlconfig.App{Name: "discrawl", ConfigEnv: DefaultConfigEnv, BaseDir: "~/.discrawl", LegacyBaseDir: "~/.discrawl"}
|
||||
|
||||
func Default() Config {
|
||||
home, _ := os.UserHomeDir()
|
||||
paths, err := appConfig.DefaultPaths()
|
||||
if err != nil {
|
||||
base := filepath.Join(home, ".discrawl")
|
||||
paths = crawlconfig.Paths{
|
||||
DBPath: filepath.Join(base, "discrawl.db"),
|
||||
CacheDir: filepath.Join(base, "cache"),
|
||||
LogDir: filepath.Join(base, "logs"),
|
||||
ShareDir: filepath.Join(base, "share"),
|
||||
}
|
||||
}
|
||||
base := filepath.Join(home, ".discrawl")
|
||||
return Config{
|
||||
Version: 1,
|
||||
DBPath: paths.DBPath,
|
||||
CacheDir: paths.CacheDir,
|
||||
LogDir: paths.LogDir,
|
||||
DBPath: filepath.Join(base, "discrawl.db"),
|
||||
CacheDir: filepath.Join(base, "cache"),
|
||||
LogDir: filepath.Join(base, "logs"),
|
||||
DefaultGuildID: "",
|
||||
Discord: DiscordConfig{
|
||||
TokenSource: "env",
|
||||
@ -135,7 +123,7 @@ func Default() Config {
|
||||
},
|
||||
},
|
||||
Share: ShareConfig{
|
||||
RepoPath: paths.ShareDir,
|
||||
RepoPath: filepath.Join(base, "share"),
|
||||
Branch: "main",
|
||||
AutoUpdate: true,
|
||||
StaleAfter: "15m",
|
||||
@ -156,12 +144,14 @@ func defaultSyncConcurrency() int {
|
||||
}
|
||||
|
||||
func ResolvePath(flagPath string) string {
|
||||
path, err := appConfig.ResolveConfigPath(flagPath)
|
||||
if err != nil {
|
||||
home, _ := os.UserHomeDir()
|
||||
return filepath.Join(home, ".discrawl", "config.toml")
|
||||
if strings.TrimSpace(flagPath) != "" {
|
||||
return flagPath
|
||||
}
|
||||
return path
|
||||
if envPath := strings.TrimSpace(os.Getenv(DefaultConfigEnv)); envPath != "" {
|
||||
return envPath
|
||||
}
|
||||
home, _ := os.UserHomeDir()
|
||||
return filepath.Join(home, ".discrawl", "config.toml")
|
||||
}
|
||||
|
||||
func Load(path string) (Config, error) {
|
||||
@ -170,9 +160,13 @@ func Load(path string) (Config, error) {
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
if err := crawlconfig.LoadTOML(expanded, &cfg); err != nil {
|
||||
data, err := os.ReadFile(expanded)
|
||||
if err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
if err := toml.Unmarshal(data, &cfg); err != nil {
|
||||
return Config{}, fmt.Errorf("parse config: %w", err)
|
||||
}
|
||||
if err := cfg.Normalize(); err != nil {
|
||||
return Config{}, err
|
||||
}
|
||||
@ -187,7 +181,14 @@ func Write(path string, cfg Config) error {
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return crawlconfig.WriteTOML(expanded, cfg, 0o600)
|
||||
if err := os.MkdirAll(filepath.Dir(expanded), 0o755); err != nil {
|
||||
return fmt.Errorf("mkdir config dir: %w", err)
|
||||
}
|
||||
data, err := toml.Marshal(cfg)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal config: %w", err)
|
||||
}
|
||||
return os.WriteFile(expanded, data, 0o600)
|
||||
}
|
||||
|
||||
func (c *Config) Normalize() error {
|
||||
@ -341,18 +342,35 @@ func (c Config) ShareEnabled() bool {
|
||||
}
|
||||
|
||||
func EnsureRuntimeDirs(cfg Config) error {
|
||||
return crawlconfig.EnsureRuntimeDirs(crawlconfig.RuntimeConfig{
|
||||
DBPath: cfg.DBPath,
|
||||
CacheDir: cfg.CacheDir,
|
||||
LogDir: cfg.LogDir,
|
||||
})
|
||||
paths := []string{cfg.CacheDir, cfg.LogDir, filepath.Dir(cfg.DBPath)}
|
||||
for _, path := range paths {
|
||||
expanded, err := ExpandPath(path)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
if err := os.MkdirAll(expanded, 0o755); err != nil {
|
||||
return fmt.Errorf("mkdir %s: %w", expanded, err)
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func ExpandPath(path string) (string, error) {
|
||||
if strings.TrimSpace(path) == "" {
|
||||
return "", errors.New("empty path")
|
||||
}
|
||||
return filepath.Clean(os.ExpandEnv(crawlconfig.ExpandHome(path))), nil
|
||||
if strings.HasPrefix(path, "~/") || path == "~" {
|
||||
home, err := os.UserHomeDir()
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("home dir: %w", err)
|
||||
}
|
||||
if path == "~" {
|
||||
path = home
|
||||
} else {
|
||||
path = filepath.Join(home, strings.TrimPrefix(path, "~/"))
|
||||
}
|
||||
}
|
||||
return filepath.Clean(os.ExpandEnv(path)), nil
|
||||
}
|
||||
|
||||
func uniqueStrings(in []string) []string {
|
||||
|
||||
@ -9,7 +9,6 @@ import (
|
||||
"fmt"
|
||||
"io"
|
||||
"io/fs"
|
||||
"maps"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"regexp"
|
||||
@ -20,7 +19,7 @@ import (
|
||||
"time"
|
||||
"unicode"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
const (
|
||||
@ -28,45 +27,35 @@ const (
|
||||
DirectMessageGuildName = "Discord Direct Messages"
|
||||
defaultMaxFileBytes = 64 << 20
|
||||
maxObjectBytes = 4 << 20
|
||||
cacheSniffBytes = 1 << 20
|
||||
checkpointEveryFiles = 256
|
||||
)
|
||||
|
||||
var (
|
||||
channelRouteRE = regexp.MustCompile(`/channels/(@me|[0-9]{12,24})/([0-9]{12,24})`)
|
||||
apiMessagesRouteRE = regexp.MustCompile(`/api/v[0-9]+/channels/[0-9]{12,24}/messages`)
|
||||
)
|
||||
var channelRouteRE = regexp.MustCompile(`/channels/(@me|[0-9]{12,24})/([0-9]{12,24})`)
|
||||
|
||||
type Options struct {
|
||||
Path string
|
||||
MaxFileBytes int64
|
||||
DryRun bool
|
||||
FullCache bool
|
||||
Now func() time.Time
|
||||
}
|
||||
|
||||
type Stats struct {
|
||||
Path string `json:"path"`
|
||||
FilesVisited int `json:"files_visited"`
|
||||
FilesScanned int `json:"files_scanned"`
|
||||
FilesSkipped int `json:"files_skipped"`
|
||||
FilesUnchanged int `json:"files_unchanged"`
|
||||
CacheFilesFastSkipped int `json:"cache_files_fast_skipped"`
|
||||
BytesScanned int64 `json:"bytes_scanned"`
|
||||
JSONObjects int `json:"json_objects"`
|
||||
Guilds int `json:"guilds"`
|
||||
Channels int `json:"channels"`
|
||||
Messages int `json:"messages"`
|
||||
DMMessages int `json:"dm_messages"`
|
||||
DMChannels int `json:"dm_channels"`
|
||||
GuildMessages int `json:"guild_messages"`
|
||||
SkippedMessages int `json:"skipped_messages"`
|
||||
SkippedChannels int `json:"skipped_channels"`
|
||||
Checkpoints int `json:"checkpoints"`
|
||||
DryRun bool `json:"dry_run,omitempty"`
|
||||
FullCache bool `json:"full_cache,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
FinishedAt time.Time `json:"finished_at"`
|
||||
Path string `json:"path"`
|
||||
FilesScanned int `json:"files_scanned"`
|
||||
FilesSkipped int `json:"files_skipped"`
|
||||
FilesUnchanged int `json:"files_unchanged"`
|
||||
BytesScanned int64 `json:"bytes_scanned"`
|
||||
JSONObjects int `json:"json_objects"`
|
||||
Guilds int `json:"guilds"`
|
||||
Channels int `json:"channels"`
|
||||
Messages int `json:"messages"`
|
||||
DMMessages int `json:"dm_messages"`
|
||||
DMChannels int `json:"dm_channels"`
|
||||
GuildMessages int `json:"guild_messages"`
|
||||
SkippedMessages int `json:"skipped_messages"`
|
||||
SkippedChannels int `json:"skipped_channels"`
|
||||
DryRun bool `json:"dry_run,omitempty"`
|
||||
StartedAt time.Time `json:"started_at"`
|
||||
FinishedAt time.Time `json:"finished_at"`
|
||||
}
|
||||
|
||||
type snapshot struct {
|
||||
@ -78,9 +67,8 @@ type snapshot struct {
|
||||
}
|
||||
|
||||
type fileFingerprint struct {
|
||||
Size int64 `json:"size"`
|
||||
ModUnixNS int64 `json:"mod_unix_ns"`
|
||||
Status string `json:"status,omitempty"`
|
||||
Size int64 `json:"size"`
|
||||
ModUnixNS int64 `json:"mod_unix_ns"`
|
||||
}
|
||||
|
||||
type scanState struct {
|
||||
@ -89,42 +77,8 @@ type scanState struct {
|
||||
channels map[string]store.ChannelRecord
|
||||
}
|
||||
|
||||
type fileSource int
|
||||
|
||||
const (
|
||||
fileSourceContext fileSource = iota
|
||||
fileSourceCacheData
|
||||
)
|
||||
|
||||
type fileCandidate struct {
|
||||
absPath string
|
||||
relPath string
|
||||
relKey string
|
||||
source fileSource
|
||||
info fs.FileInfo
|
||||
fingerprint fileFingerprint
|
||||
}
|
||||
|
||||
type scanTotals struct {
|
||||
guilds map[string]struct{}
|
||||
channels map[string]struct{}
|
||||
messages map[string]struct{}
|
||||
dmMessages map[string]struct{}
|
||||
guildMessages map[string]struct{}
|
||||
dmChannels map[string]struct{}
|
||||
skippedMessages map[string]struct{}
|
||||
skippedChannels map[string]struct{}
|
||||
}
|
||||
|
||||
type unresolvedMessages map[string]string
|
||||
|
||||
const wiretapFileIndexScope = "wiretap:file_index:v1"
|
||||
|
||||
const (
|
||||
fileStatusImported = "imported"
|
||||
fileStatusSkipped = "skipped"
|
||||
)
|
||||
|
||||
func DefaultPath() string {
|
||||
home, _ := os.UserHomeDir()
|
||||
switch runtime.GOOS {
|
||||
@ -151,29 +105,25 @@ func Import(ctx context.Context, st *store.Store, opts Options) (Stats, error) {
|
||||
if err != nil {
|
||||
return Stats{}, err
|
||||
}
|
||||
if opts.FullCache {
|
||||
stats, snap, err := scanFullCache(ctx, opts, state)
|
||||
if err != nil {
|
||||
return stats, err
|
||||
}
|
||||
stats.DryRun = opts.DryRun
|
||||
if opts.DryRun {
|
||||
return stats, nil
|
||||
}
|
||||
if err := writeSnapshot(ctx, st, snap, len(state.previous) == 0); err != nil {
|
||||
return stats, err
|
||||
}
|
||||
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
|
||||
return stats, err
|
||||
}
|
||||
stats.Checkpoints = 1
|
||||
return stats, nil
|
||||
}
|
||||
stats, err := scanAndImport(ctx, st, opts, state)
|
||||
stats, snap, err := scan(ctx, opts, state)
|
||||
if err != nil {
|
||||
return stats, err
|
||||
}
|
||||
stats.DryRun = opts.DryRun
|
||||
if opts.DryRun {
|
||||
return stats, nil
|
||||
}
|
||||
fullScan := len(state.previous) == 0
|
||||
if snapshotHasChanges(snap) || fullScan {
|
||||
if err := writeSnapshot(ctx, st, snap, fullScan); err != nil {
|
||||
return stats, err
|
||||
}
|
||||
} else if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
|
||||
return stats, err
|
||||
}
|
||||
if err := saveFileIndex(ctx, st, state.current); err != nil {
|
||||
return stats, err
|
||||
}
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
@ -186,7 +136,7 @@ func loadScanState(ctx context.Context, st *store.Store, opts Options) (scanStat
|
||||
if st == nil || opts.DryRun {
|
||||
return state, nil
|
||||
}
|
||||
raw, err := st.GetSyncState(ctx, fileIndexScope(opts))
|
||||
raw, err := st.GetSyncState(ctx, wiretapFileIndexScope)
|
||||
if err != nil {
|
||||
return state, err
|
||||
}
|
||||
@ -210,107 +160,19 @@ func loadScanState(ctx context.Context, st *store.Store, opts Options) (scanStat
|
||||
return state, nil
|
||||
}
|
||||
|
||||
func fileIndexScope(Options) string {
|
||||
return wiretapFileIndexScope
|
||||
}
|
||||
|
||||
func saveFileIndex(ctx context.Context, st *store.Store, opts Options, index map[string]fileFingerprint) error {
|
||||
func saveFileIndex(ctx context.Context, st *store.Store, index map[string]fileFingerprint) error {
|
||||
body, err := json.Marshal(index)
|
||||
if err != nil {
|
||||
return err
|
||||
}
|
||||
return st.SetSyncState(ctx, fileIndexScope(opts), string(body))
|
||||
}
|
||||
|
||||
func sameFileFingerprint(a, b fileFingerprint) bool {
|
||||
return a.Size == b.Size && a.ModUnixNS == b.ModUnixNS
|
||||
}
|
||||
|
||||
func isImportedFingerprint(fingerprint fileFingerprint) bool {
|
||||
return fingerprint.Status == "" || fingerprint.Status == fileStatusImported
|
||||
}
|
||||
|
||||
func importedFingerprint(fingerprint fileFingerprint) fileFingerprint {
|
||||
fingerprint.Status = fileStatusImported
|
||||
return fingerprint
|
||||
}
|
||||
|
||||
func skippedFingerprint(fingerprint fileFingerprint) fileFingerprint {
|
||||
fingerprint.Status = fileStatusSkipped
|
||||
return fingerprint
|
||||
return st.SetSyncState(ctx, wiretapFileIndexScope, string(body))
|
||||
}
|
||||
|
||||
func snapshotHasChanges(snap snapshot) bool {
|
||||
return len(snap.guilds) > 0 || len(snap.channels) > 0 || len(snap.messages) > 0
|
||||
}
|
||||
|
||||
func scanAndImport(ctx context.Context, st *store.Store, opts Options, state scanState) (Stats, error) {
|
||||
now := opts.Now
|
||||
if now == nil {
|
||||
now = time.Now
|
||||
}
|
||||
root := strings.TrimSpace(opts.Path)
|
||||
if root == "" {
|
||||
root = DefaultPath()
|
||||
}
|
||||
stats := Stats{Path: root, FullCache: opts.FullCache, StartedAt: now().UTC()}
|
||||
rootFS, err := os.OpenRoot(root)
|
||||
if err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, ignoreCacheFileError(err)
|
||||
}
|
||||
defer func() { _ = rootFS.Close() }()
|
||||
contextFiles, cacheFiles, err := discoverCandidates(ctx, root, rootFS, opts, state, &stats)
|
||||
if err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
fullScan := len(state.previous) == 0
|
||||
if fullScan && !opts.DryRun {
|
||||
if err := st.DeleteGuildData(ctx, "@unknown"); err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
}
|
||||
run := newImportRun(ctx, st, opts, state, rootFS, &stats)
|
||||
if err := run.scanContext(contextFiles); err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
if err := collectCacheRouteHints(ctx, rootFS, cacheFiles, run.base); err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
if err := run.scanCacheBatches(cacheFiles); err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
if err := run.retryPending(); err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
if !opts.DryRun {
|
||||
if len(contextFiles) == 0 && len(cacheFiles) == 0 {
|
||||
if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
stats.Checkpoints++
|
||||
}
|
||||
if err := st.DeleteOrphanChannels(ctx, DirectMessageGuildID); err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, err
|
||||
}
|
||||
}
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, nil
|
||||
}
|
||||
|
||||
func scanFullCache(ctx context.Context, opts Options, state scanState) (Stats, snapshot, error) {
|
||||
func scan(ctx context.Context, opts Options, state scanState) (Stats, snapshot, error) {
|
||||
now := opts.Now
|
||||
if now == nil {
|
||||
now = time.Now
|
||||
@ -323,8 +185,14 @@ func scanFullCache(ctx context.Context, opts Options, state scanState) (Stats, s
|
||||
if maxBytes <= 0 {
|
||||
maxBytes = defaultMaxFileBytes
|
||||
}
|
||||
stats := Stats{Path: root, FullCache: true, StartedAt: now().UTC()}
|
||||
snap := newSnapshot()
|
||||
stats := Stats{Path: root, StartedAt: now().UTC()}
|
||||
snap := snapshot{
|
||||
guilds: map[string]store.GuildRecord{},
|
||||
channels: map[string]store.ChannelRecord{},
|
||||
messages: map[string]store.MessageMutation{},
|
||||
routes: map[string]string{},
|
||||
userLabels: map[string]userLabel{},
|
||||
}
|
||||
rootFS, err := os.OpenRoot(root)
|
||||
if err != nil {
|
||||
stats.FinishedAt = now().UTC()
|
||||
@ -344,7 +212,6 @@ func scanFullCache(ctx context.Context, opts Options, state scanState) (Stats, s
|
||||
}
|
||||
return nil
|
||||
}
|
||||
stats.FilesVisited++
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
stats.FilesSkipped++
|
||||
@ -364,8 +231,8 @@ func scanFullCache(ctx context.Context, opts Options, state scanState) (Stats, s
|
||||
Size: info.Size(),
|
||||
ModUnixNS: info.ModTime().UnixNano(),
|
||||
}
|
||||
state.current[relKey] = importedFingerprint(fingerprint)
|
||||
if previous, ok := state.previous[relKey]; ok && sameFileFingerprint(previous, fingerprint) && isImportedFingerprint(previous) {
|
||||
state.current[relKey] = fingerprint
|
||||
if previous, ok := state.previous[relKey]; ok && previous == fingerprint {
|
||||
stats.FilesUnchanged++
|
||||
return nil
|
||||
}
|
||||
@ -379,17 +246,11 @@ func scanFullCache(ctx context.Context, opts Options, state scanState) (Stats, s
|
||||
collectChannelRoutes(snap, bytes.ToValidUTF8(data, nil))
|
||||
objects := extractJSONValues(bytes.ToValidUTF8(data, nil))
|
||||
for _, payload := range extractGzipPayloads(data, maxBytes) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
collectChannelRoutes(snap, bytes.ToValidUTF8(payload, nil))
|
||||
objects = append(objects, extractJSONValues(bytes.ToValidUTF8(payload, nil))...)
|
||||
}
|
||||
stats.JSONObjects += len(objects)
|
||||
for _, raw := range objects {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
var value any
|
||||
if err := json.Unmarshal(raw, &value); err != nil {
|
||||
continue
|
||||
@ -400,181 +261,15 @@ func scanFullCache(ctx context.Context, opts Options, state scanState) (Stats, s
|
||||
}); err != nil {
|
||||
return stats, snap, err
|
||||
}
|
||||
totals := newScanTotals()
|
||||
finalizeSnapshot(snap, state.channels, totals, &stats, true)
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, snap, nil
|
||||
}
|
||||
|
||||
func discoverCandidates(ctx context.Context, root string, rootFS *os.Root, opts Options, state scanState, stats *Stats) ([]fileCandidate, []fileCandidate, error) {
|
||||
var contextFiles []fileCandidate
|
||||
var cacheFiles []fileCandidate
|
||||
maxBytes := opts.MaxFileBytes
|
||||
if maxBytes <= 0 {
|
||||
maxBytes = defaultMaxFileBytes
|
||||
}
|
||||
err := filepath.WalkDir(root, func(path string, entry fs.DirEntry, err error) error {
|
||||
if err != nil {
|
||||
return ignoreCacheFileError(err)
|
||||
}
|
||||
if ctx.Err() != nil {
|
||||
return ctx.Err()
|
||||
}
|
||||
if entry.IsDir() {
|
||||
if shouldSkipDir(entry.Name()) && path != root {
|
||||
return filepath.SkipDir
|
||||
}
|
||||
return nil
|
||||
}
|
||||
stats.FilesVisited++
|
||||
info, err := entry.Info()
|
||||
if err != nil {
|
||||
stats.FilesSkipped++
|
||||
return ignoreCacheFileError(err)
|
||||
}
|
||||
if !isCandidateFile(path) || info.Size() <= 0 || info.Size() > maxBytes {
|
||||
stats.FilesSkipped++
|
||||
return nil
|
||||
}
|
||||
relPath, err := filepath.Rel(root, path)
|
||||
if err != nil {
|
||||
stats.FilesSkipped++
|
||||
return ignoreCacheFileError(err)
|
||||
}
|
||||
relKey := filepath.ToSlash(relPath)
|
||||
fingerprint := fileFingerprint{
|
||||
Size: info.Size(),
|
||||
ModUnixNS: info.ModTime().UnixNano(),
|
||||
}
|
||||
candidate := fileCandidate{
|
||||
absPath: path,
|
||||
relPath: relPath,
|
||||
relKey: relKey,
|
||||
source: sourceForPath(root, path, relPath),
|
||||
info: info,
|
||||
fingerprint: fingerprint,
|
||||
}
|
||||
if candidate.source == fileSourceCacheData {
|
||||
if previous, ok := state.previous[relKey]; ok && sameFileFingerprint(previous, fingerprint) {
|
||||
if !opts.FullCache || isImportedFingerprint(previous) {
|
||||
state.current[relKey] = previous
|
||||
stats.FilesUnchanged++
|
||||
return nil
|
||||
}
|
||||
}
|
||||
if !opts.FullCache {
|
||||
ok, err := cacheFileHasRouteHint(rootFS, relPath)
|
||||
if err != nil {
|
||||
stats.FilesSkipped++
|
||||
return ignoreCacheFileError(err)
|
||||
}
|
||||
if !ok {
|
||||
state.current[relKey] = skippedFingerprint(fingerprint)
|
||||
stats.FilesSkipped++
|
||||
stats.CacheFilesFastSkipped++
|
||||
return nil
|
||||
}
|
||||
}
|
||||
cacheFiles = append(cacheFiles, candidate)
|
||||
return nil
|
||||
}
|
||||
if previous, ok := state.previous[relKey]; ok && sameFileFingerprint(previous, fingerprint) {
|
||||
state.current[relKey] = previous
|
||||
stats.FilesUnchanged++
|
||||
return nil
|
||||
}
|
||||
contextFiles = append(contextFiles, candidate)
|
||||
return nil
|
||||
})
|
||||
return contextFiles, cacheFiles, err
|
||||
}
|
||||
|
||||
func scanCandidates(ctx context.Context, rootFS *os.Root, opts Options, candidates []fileCandidate, snap snapshot, channelLookup map[string]store.ChannelRecord, stats *Stats) error {
|
||||
maxBytes := opts.MaxFileBytes
|
||||
if maxBytes <= 0 {
|
||||
maxBytes = defaultMaxFileBytes
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := rootFS.ReadFile(candidate.relPath)
|
||||
if err != nil {
|
||||
stats.FilesSkipped++
|
||||
if err := ignoreCacheFileError(err); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
}
|
||||
stats.FilesScanned++
|
||||
stats.BytesScanned += int64(len(data))
|
||||
collectChannelRoutes(snap, bytes.ToValidUTF8(data, nil))
|
||||
objects := extractJSONValues(bytes.ToValidUTF8(data, nil))
|
||||
for _, payload := range extractGzipPayloads(data, maxBytes) {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
collectChannelRoutes(snap, bytes.ToValidUTF8(payload, nil))
|
||||
objects = append(objects, extractJSONValues(bytes.ToValidUTF8(payload, nil))...)
|
||||
}
|
||||
stats.JSONObjects += len(objects)
|
||||
for _, raw := range objects {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
var value any
|
||||
if err := json.Unmarshal(raw, &value); err != nil {
|
||||
continue
|
||||
}
|
||||
collectValue(snap, channelLookup, value, candidate.info.ModTime().UTC())
|
||||
}
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func collectCacheRouteHints(ctx context.Context, rootFS *os.Root, candidates []fileCandidate, snap snapshot) error {
|
||||
for _, candidate := range candidates {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
data, err := readFilePrefix(rootFS, candidate.relPath)
|
||||
if err != nil {
|
||||
if err := ignoreCacheFileError(err); err != nil {
|
||||
return err
|
||||
}
|
||||
continue
|
||||
}
|
||||
collectChannelRoutes(snap, bytes.ToValidUTF8(data, nil))
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func newScanTotals() scanTotals {
|
||||
return scanTotals{
|
||||
guilds: map[string]struct{}{},
|
||||
channels: map[string]struct{}{},
|
||||
messages: map[string]struct{}{},
|
||||
dmMessages: map[string]struct{}{},
|
||||
guildMessages: map[string]struct{}{},
|
||||
dmChannels: map[string]struct{}{},
|
||||
skippedMessages: map[string]struct{}{},
|
||||
skippedChannels: map[string]struct{}{},
|
||||
}
|
||||
}
|
||||
|
||||
func finalizeSnapshot(snap snapshot, channelLookup map[string]store.ChannelRecord, totals scanTotals, stats *Stats, recordSkipped bool) unresolvedMessages {
|
||||
reconcileMessages(snap, channelLookup)
|
||||
reconcileMessages(snap, state.channels)
|
||||
inferDirectMessageNames(snap)
|
||||
reconcileMessages(snap, channelLookup)
|
||||
unresolved := unresolvedMessages{}
|
||||
reconcileMessages(snap, state.channels)
|
||||
skippedChannels := map[string]struct{}{}
|
||||
for id, msg := range snap.messages {
|
||||
guildID := msg.Record.GuildID
|
||||
if guildID == "" {
|
||||
unresolved[id] = msg.Record.ChannelID
|
||||
if recordSkipped {
|
||||
totals.skippedMessages[id] = struct{}{}
|
||||
totals.skippedChannels[msg.Record.ChannelID] = struct{}{}
|
||||
}
|
||||
stats.SkippedMessages++
|
||||
skippedChannels[msg.Record.ChannelID] = struct{}{}
|
||||
delete(snap.messages, id)
|
||||
continue
|
||||
}
|
||||
@ -582,183 +277,34 @@ func finalizeSnapshot(snap snapshot, channelLookup map[string]store.ChannelRecor
|
||||
snap.guilds[guildID] = syntheticGuild(guildID, guildName(guildID))
|
||||
}
|
||||
if _, ok := snap.channels[msg.Record.ChannelID]; !ok {
|
||||
if channel, ok := channelLookup[msg.Record.ChannelID]; ok && channel.GuildID != "" {
|
||||
snap.channels[msg.Record.ChannelID] = channel
|
||||
} else {
|
||||
snap.channels[msg.Record.ChannelID] = syntheticChannel(msg.Record.ChannelID, guildID, msg.Record.ChannelName)
|
||||
}
|
||||
snap.channels[msg.Record.ChannelID] = syntheticChannel(msg.Record.ChannelID, guildID, msg.Record.ChannelName)
|
||||
}
|
||||
snap.messages[id] = msg
|
||||
}
|
||||
messageChannels := map[string]struct{}{}
|
||||
dmChannels := map[string]struct{}{}
|
||||
for _, msg := range snap.messages {
|
||||
totals.messages[msg.Record.ID] = struct{}{}
|
||||
messageChannels[msg.Record.ChannelID] = struct{}{}
|
||||
switch msg.Record.GuildID {
|
||||
case DirectMessageGuildID:
|
||||
totals.dmMessages[msg.Record.ID] = struct{}{}
|
||||
totals.dmChannels[msg.Record.ChannelID] = struct{}{}
|
||||
stats.DMMessages++
|
||||
dmChannels[msg.Record.ChannelID] = struct{}{}
|
||||
default:
|
||||
totals.guildMessages[msg.Record.ID] = struct{}{}
|
||||
stats.GuildMessages++
|
||||
}
|
||||
}
|
||||
for id, channel := range snap.channels {
|
||||
channelLookup[id] = channel
|
||||
totals.channels[id] = struct{}{}
|
||||
}
|
||||
for id := range snap.guilds {
|
||||
totals.guilds[id] = struct{}{}
|
||||
}
|
||||
stats.DMChannels = len(totals.dmChannels)
|
||||
stats.SkippedChannels = len(totals.skippedChannels)
|
||||
stats.Guilds = len(totals.guilds)
|
||||
stats.Channels = len(totals.channels)
|
||||
stats.Messages = len(totals.messages)
|
||||
stats.DMMessages = len(totals.dmMessages)
|
||||
stats.GuildMessages = len(totals.guildMessages)
|
||||
stats.SkippedMessages = len(totals.skippedMessages)
|
||||
return unresolved
|
||||
}
|
||||
|
||||
func mergeUnresolved(dst, src unresolvedMessages) {
|
||||
maps.Copy(dst, src)
|
||||
}
|
||||
|
||||
func recordUnresolved(unresolved unresolvedMessages, totals scanTotals, stats *Stats) {
|
||||
for messageID, channelID := range unresolved {
|
||||
totals.skippedMessages[messageID] = struct{}{}
|
||||
totals.skippedChannels[channelID] = struct{}{}
|
||||
}
|
||||
stats.SkippedChannels = len(totals.skippedChannels)
|
||||
stats.SkippedMessages = len(totals.skippedMessages)
|
||||
}
|
||||
|
||||
func commitSnapshot(ctx context.Context, st *store.Store, opts Options, state scanState, candidates []fileCandidate, snap snapshot, checkpoint bool, stats *Stats) error {
|
||||
if opts.DryRun {
|
||||
return nil
|
||||
}
|
||||
if !checkpoint {
|
||||
if snapshotHasChanges(snap) {
|
||||
return writeSnapshot(ctx, st, snapshotWithoutMessageEvents(snap), false)
|
||||
for id := range snap.channels {
|
||||
if _, ok := messageChannels[id]; !ok {
|
||||
delete(snap.channels, id)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
if snapshotHasChanges(snap) {
|
||||
if err := writeSnapshot(ctx, st, snap, false); err != nil {
|
||||
return err
|
||||
}
|
||||
} else if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
state.current[candidate.relKey] = importedFingerprint(candidate.fingerprint)
|
||||
}
|
||||
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
|
||||
return err
|
||||
}
|
||||
stats.Checkpoints++
|
||||
return nil
|
||||
}
|
||||
|
||||
func checkpointScannedCandidates(ctx context.Context, st *store.Store, opts Options, state scanState, candidates []fileCandidate, stats *Stats) error {
|
||||
if opts.DryRun {
|
||||
return nil
|
||||
}
|
||||
if err := st.SetSyncState(ctx, "wiretap:last_import", time.Now().UTC().Format(time.RFC3339Nano)); err != nil {
|
||||
return err
|
||||
}
|
||||
for _, candidate := range candidates {
|
||||
state.current[candidate.relKey] = importedFingerprint(candidate.fingerprint)
|
||||
}
|
||||
if err := saveFileIndex(ctx, st, opts, state.current); err != nil {
|
||||
return err
|
||||
}
|
||||
stats.Checkpoints++
|
||||
return nil
|
||||
}
|
||||
|
||||
func snapshotWithoutMessageEvents(snap snapshot) snapshot {
|
||||
out := snapshot{
|
||||
guilds: snap.guilds,
|
||||
channels: snap.channels,
|
||||
messages: make(map[string]store.MessageMutation, len(snap.messages)),
|
||||
routes: snap.routes,
|
||||
userLabels: snap.userLabels,
|
||||
}
|
||||
for id, message := range snap.messages {
|
||||
message.Options.AppendEvent = false
|
||||
out.messages[id] = message
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
func newSnapshot() snapshot {
|
||||
return snapshot{
|
||||
guilds: map[string]store.GuildRecord{},
|
||||
channels: map[string]store.ChannelRecord{},
|
||||
messages: map[string]store.MessageMutation{},
|
||||
routes: map[string]string{},
|
||||
userLabels: map[string]userLabel{},
|
||||
}
|
||||
}
|
||||
|
||||
func newSnapshotWithContext(base snapshot) snapshot {
|
||||
snap := newSnapshot()
|
||||
maps.Copy(snap.routes, base.routes)
|
||||
maps.Copy(snap.userLabels, base.userLabels)
|
||||
return snap
|
||||
}
|
||||
|
||||
func mergeSnapshotContext(base snapshot, next snapshot) {
|
||||
for channelID, guildID := range next.routes {
|
||||
collectChannelRoute(base, channelID, guildID)
|
||||
}
|
||||
maps.Copy(base.userLabels, next.userLabels)
|
||||
maps.Copy(base.channels, next.channels)
|
||||
}
|
||||
|
||||
func copyChannelLookup(in map[string]store.ChannelRecord) map[string]store.ChannelRecord {
|
||||
out := make(map[string]store.ChannelRecord, len(in))
|
||||
maps.Copy(out, in)
|
||||
return out
|
||||
}
|
||||
|
||||
func sourceForPath(root, path, relPath string) fileSource {
|
||||
if isRouteFilteredCachePath(root, path, relPath) {
|
||||
return fileSourceCacheData
|
||||
}
|
||||
return fileSourceContext
|
||||
}
|
||||
|
||||
func isRouteFilteredCachePath(root, path, relPath string) bool {
|
||||
cleanRoot := filepath.ToSlash(root)
|
||||
cleanPath := filepath.ToSlash(path)
|
||||
cleanRel := filepath.ToSlash(relPath)
|
||||
return filepath.Base(cleanRoot) == "Cache_Data" ||
|
||||
filepath.Base(cleanRoot) == "CacheStorage" ||
|
||||
strings.Contains(cleanPath, "/Cache/Cache_Data/") ||
|
||||
strings.Contains(cleanPath, "/Service Worker/CacheStorage/") ||
|
||||
strings.HasPrefix(cleanRel, "Cache_Data/") ||
|
||||
strings.HasPrefix(cleanRel, "Service Worker/CacheStorage/")
|
||||
}
|
||||
|
||||
func cacheFileHasRouteHint(rootFS *os.Root, relPath string) (bool, error) {
|
||||
data, err := readFilePrefix(rootFS, relPath)
|
||||
if err != nil {
|
||||
return false, err
|
||||
}
|
||||
return channelRouteRE.Match(data) || apiMessagesRouteRE.Match(data), nil
|
||||
}
|
||||
|
||||
func readFilePrefix(rootFS *os.Root, relPath string) ([]byte, error) {
|
||||
file, err := rootFS.Open(relPath)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
defer func() { _ = file.Close() }()
|
||||
data, err := io.ReadAll(io.LimitReader(file, cacheSniffBytes))
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return data, nil
|
||||
stats.DMChannels = len(dmChannels)
|
||||
stats.SkippedChannels = len(skippedChannels)
|
||||
stats.Guilds = len(snap.guilds)
|
||||
stats.Channels = len(snap.channels)
|
||||
stats.Messages = len(snap.messages)
|
||||
stats.FinishedAt = now().UTC()
|
||||
return stats, snap, nil
|
||||
}
|
||||
|
||||
func ignoreCacheFileError(error) error {
|
||||
@ -774,9 +320,6 @@ func writeSnapshot(ctx context.Context, st *store.Store, snap snapshot, prune bo
|
||||
guilds := mapValues(snap.guilds)
|
||||
sort.Slice(guilds, func(i, j int) bool { return guilds[i].ID < guilds[j].ID })
|
||||
for _, guild := range guilds {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := st.UpsertGuild(ctx, guild); err != nil {
|
||||
return err
|
||||
}
|
||||
@ -784,9 +327,6 @@ func writeSnapshot(ctx context.Context, st *store.Store, snap snapshot, prune bo
|
||||
channels := mapValues(snap.channels)
|
||||
sort.Slice(channels, func(i, j int) bool { return channels[i].ID < channels[j].ID })
|
||||
for _, channel := range channels {
|
||||
if err := ctx.Err(); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := st.UpsertChannel(ctx, channel); err != nil {
|
||||
return err
|
||||
}
|
||||
|
||||
@ -1,198 +0,0 @@
|
||||
package discorddesktop
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"runtime"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestFileFingerprintStatusHelpers(t *testing.T) {
|
||||
base := fileFingerprint{Size: 123, ModUnixNS: 456}
|
||||
require.True(t, sameFileFingerprint(base, fileFingerprint{Size: 123, ModUnixNS: 456, Status: fileStatusSkipped}))
|
||||
require.False(t, sameFileFingerprint(base, fileFingerprint{Size: 124, ModUnixNS: 456}))
|
||||
require.False(t, sameFileFingerprint(base, fileFingerprint{Size: 123, ModUnixNS: 457}))
|
||||
|
||||
require.True(t, isImportedFingerprint(base))
|
||||
require.True(t, isImportedFingerprint(importedFingerprint(base)))
|
||||
require.False(t, isImportedFingerprint(skippedFingerprint(base)))
|
||||
require.Equal(t, fileStatusImported, importedFingerprint(base).Status)
|
||||
require.Equal(t, fileStatusSkipped, skippedFingerprint(base).Status)
|
||||
require.Equal(t, wiretapFileIndexScope, fileIndexScope(Options{}))
|
||||
require.Equal(t, wiretapFileIndexScope, fileIndexScope(Options{FullCache: true}))
|
||||
}
|
||||
|
||||
func TestSnapshotCopyHelpers(t *testing.T) {
|
||||
base := newSnapshot()
|
||||
base.routes["111111111111111121"] = "999999999999999996"
|
||||
base.userLabels["222222222222222232"] = userLabel{Name: "Alice"}
|
||||
base.channels["111111111111111121"] = store.ChannelRecord{ID: "111111111111111121", GuildID: "999999999999999996", Name: "general"}
|
||||
|
||||
snap := newSnapshotWithContext(base)
|
||||
require.Equal(t, base.routes, snap.routes)
|
||||
require.Equal(t, base.userLabels, snap.userLabels)
|
||||
require.Empty(t, snap.channels)
|
||||
|
||||
next := newSnapshot()
|
||||
next.routes["111111111111111122"] = "999999999999999996"
|
||||
next.userLabels["222222222222222233"] = userLabel{Name: "Bob"}
|
||||
next.channels["111111111111111122"] = store.ChannelRecord{ID: "111111111111111122", GuildID: "999999999999999996", Name: "random"}
|
||||
mergeSnapshotContext(base, next)
|
||||
|
||||
require.Equal(t, "999999999999999996", base.routes["111111111111111122"])
|
||||
require.Equal(t, "Bob", base.userLabels["222222222222222233"].Name)
|
||||
require.Equal(t, "random", base.channels["111111111111111122"].Name)
|
||||
|
||||
lookup := copyChannelLookup(base.channels)
|
||||
lookup["111111111111111122"] = store.ChannelRecord{ID: "changed"}
|
||||
require.Equal(t, "random", base.channels["111111111111111122"].Name)
|
||||
}
|
||||
|
||||
func TestSnapshotWithoutMessageEvents(t *testing.T) {
|
||||
snap := newSnapshot()
|
||||
snap.messages["333333333333333346"] = store.MessageMutation{
|
||||
Record: store.MessageRecord{ID: "333333333333333346"},
|
||||
Options: store.WriteOptions{
|
||||
AppendEvent: true,
|
||||
EnqueueEmbedding: true,
|
||||
},
|
||||
}
|
||||
stripped := snapshotWithoutMessageEvents(snap)
|
||||
require.False(t, stripped.messages["333333333333333346"].Options.AppendEvent)
|
||||
require.True(t, stripped.messages["333333333333333346"].Options.EnqueueEmbedding)
|
||||
require.True(t, snap.messages["333333333333333346"].Options.AppendEvent)
|
||||
}
|
||||
|
||||
func TestRouteFilteredCacheHelpers(t *testing.T) {
|
||||
require.Equal(t, fileSourceCacheData, sourceForPath("/tmp/discord", "/tmp/discord/Cache/Cache_Data/entry", "Cache/Cache_Data/entry"))
|
||||
require.Equal(t, fileSourceCacheData, sourceForPath("/tmp/discord", "/tmp/discord/Service Worker/CacheStorage/cache/entry", "Service Worker/CacheStorage/cache/entry"))
|
||||
require.Equal(t, fileSourceContext, sourceForPath("/tmp/discord", "/tmp/discord/Local Storage/leveldb/000001.log", "Local Storage/leveldb/000001.log"))
|
||||
}
|
||||
|
||||
func TestCacheFileHasRouteHint(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, "route"), []byte("https://discord.com/api/v9/channels/111111111111111121/messages?limit=50"), 0o600))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, "plain"), []byte("no discord route here"), 0o600))
|
||||
|
||||
root, err := os.OpenRoot(dir)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = root.Close() }()
|
||||
|
||||
ok, err := cacheFileHasRouteHint(root, "route")
|
||||
require.NoError(t, err)
|
||||
require.True(t, ok)
|
||||
ok, err = cacheFileHasRouteHint(root, "plain")
|
||||
require.NoError(t, err)
|
||||
require.False(t, ok)
|
||||
_, err = cacheFileHasRouteHint(root, "missing")
|
||||
require.Error(t, err)
|
||||
}
|
||||
|
||||
func TestImportAndStateEdgeBranches(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
_, err := Import(ctx, nil, Options{})
|
||||
require.ErrorContains(t, err, "store is required")
|
||||
|
||||
configHome := t.TempDir()
|
||||
t.Setenv("XDG_CONFIG_HOME", configHome)
|
||||
if runtime.GOOS == "linux" {
|
||||
require.Equal(t, filepath.Join(configHome, "discord"), DefaultPath())
|
||||
}
|
||||
|
||||
dir := t.TempDir()
|
||||
s, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = s.Close() }()
|
||||
|
||||
stats, err := Import(ctx, s, Options{
|
||||
Path: dir,
|
||||
Now: func() time.Time { return time.Date(2026, 5, 8, 12, 0, 0, 0, time.UTC) },
|
||||
})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, dir, stats.Path)
|
||||
require.Equal(t, 1, stats.Checkpoints)
|
||||
|
||||
stats, err = Import(ctx, nil, Options{Path: filepath.Join(dir, "missing"), DryRun: true})
|
||||
require.NoError(t, err)
|
||||
require.True(t, stats.DryRun)
|
||||
|
||||
stats, err = Import(ctx, nil, Options{Path: dir, DryRun: true, FullCache: true})
|
||||
require.NoError(t, err)
|
||||
require.True(t, stats.FullCache)
|
||||
|
||||
require.NoError(t, s.SetSyncState(ctx, fileIndexScope(Options{}), "{not-json"))
|
||||
require.NoError(t, s.UpsertChannel(ctx, store.ChannelRecord{ID: "c1", GuildID: "g1", Kind: "text", Name: "general", RawJSON: `{}`}))
|
||||
state, err := loadScanState(ctx, s, Options{})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, state.previous)
|
||||
require.Equal(t, "general", state.channels["c1"].Name)
|
||||
}
|
||||
|
||||
func TestSnapshotFinalizeAndCommitBranches(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
s, err := store.Open(ctx, filepath.Join(t.TempDir(), "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = s.Close() }()
|
||||
|
||||
snap := newSnapshot()
|
||||
snap.messages["m-missing"] = store.MessageMutation{
|
||||
Record: store.MessageRecord{ID: "m-missing", ChannelID: "c-missing", RawJSON: `{}`},
|
||||
}
|
||||
snap.messages["m-known"] = store.MessageMutation{
|
||||
Record: store.MessageRecord{ID: "m-known", GuildID: "g1", ChannelID: "c1", ChannelName: "general", RawJSON: `{}`},
|
||||
}
|
||||
stats := &Stats{}
|
||||
totals := newScanTotals()
|
||||
unresolved := finalizeSnapshot(snap, map[string]store.ChannelRecord{
|
||||
"c1": {ID: "c1", GuildID: "g1", Kind: "text", Name: "general", RawJSON: `{}`},
|
||||
}, totals, stats, true)
|
||||
require.Equal(t, unresolvedMessages{"m-missing": "c-missing"}, unresolved)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
require.Equal(t, 1, stats.SkippedMessages)
|
||||
require.Equal(t, "general", snap.channels["c1"].Name)
|
||||
require.Equal(t, "g1", snap.guilds["g1"].ID)
|
||||
|
||||
more := unresolvedMessages{"m2": "c2"}
|
||||
mergeUnresolved(unresolved, more)
|
||||
recordUnresolved(unresolved, totals, stats)
|
||||
require.Equal(t, 2, stats.SkippedMessages)
|
||||
|
||||
state := scanState{current: map[string]fileFingerprint{}}
|
||||
candidates := []fileCandidate{{relKey: "Cache_Data/entry", fingerprint: fileFingerprint{Size: 10, ModUnixNS: 20}}}
|
||||
require.NoError(t, commitSnapshot(ctx, s, Options{DryRun: true}, state, candidates, newSnapshot(), true, stats))
|
||||
require.NoError(t, commitSnapshot(ctx, s, Options{}, state, candidates, newSnapshot(), false, stats))
|
||||
require.NoError(t, commitSnapshot(ctx, s, Options{}, state, candidates, newSnapshot(), true, stats))
|
||||
require.True(t, isImportedFingerprint(state.current["Cache_Data/entry"]))
|
||||
|
||||
require.NoError(t, checkpointScannedCandidates(ctx, s, Options{DryRun: true}, state, candidates, stats))
|
||||
require.NoError(t, checkpointScannedCandidates(ctx, s, Options{}, state, candidates, stats))
|
||||
}
|
||||
|
||||
func TestRouteHintCollectionBranches(t *testing.T) {
|
||||
dir := t.TempDir()
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, "route"), []byte("https://discord.com/channels/123456789012/111111111111111121"), 0o600))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(dir, "plain"), []byte("plain"), 0o600))
|
||||
|
||||
root, err := os.OpenRoot(dir)
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = root.Close() }()
|
||||
|
||||
snap := newSnapshot()
|
||||
err = collectCacheRouteHints(context.Background(), root, []fileCandidate{
|
||||
{relPath: "missing"},
|
||||
{relPath: "plain"},
|
||||
{relPath: "route"},
|
||||
}, snap)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "123456789012", snap.routes["111111111111111121"])
|
||||
|
||||
canceled, cancel := context.WithCancel(context.Background())
|
||||
cancel()
|
||||
require.ErrorIs(t, collectCacheRouteHints(canceled, root, []fileCandidate{{relPath: "route"}}, newSnapshot()), context.Canceled)
|
||||
}
|
||||
@ -1,387 +0,0 @@
|
||||
package discorddesktop
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strconv"
|
||||
"testing"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestImportFastCacheSkipsUnroutedCacheDataUnlessFullCache(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_0"), []byte(`
|
||||
{"id":"111111111111111121","guild_id":"999999999999999996","type":0,"name":"slow-cache"}
|
||||
{"id":"333333333333333346","channel_id":"111111111111111121","content":"unrouted historical cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`), 0o600))
|
||||
|
||||
fastStore, err := store.Open(ctx, filepath.Join(dir, "fast.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = fastStore.Close() }()
|
||||
|
||||
stats, err := Import(ctx, fastStore, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.CacheFilesFastSkipped)
|
||||
require.Equal(t, 0, stats.Messages)
|
||||
|
||||
results, err := fastStore.SearchMessages(ctx, store.SearchOptions{Query: "unrouted historical", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, results)
|
||||
|
||||
stats, err = Import(ctx, fastStore, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, 0, stats.CacheFilesFastSkipped)
|
||||
require.Equal(t, 1, stats.FilesUnchanged)
|
||||
|
||||
stats, err = Import(ctx, fastStore, Options{Path: dir, FullCache: true})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
|
||||
fullStore, err := store.Open(ctx, filepath.Join(dir, "full.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = fullStore.Close() }()
|
||||
|
||||
stats, err = Import(ctx, fullStore, Options{Path: dir, FullCache: true})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.FilesScanned)
|
||||
require.Equal(t, 0, stats.CacheFilesFastSkipped)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
|
||||
results, err = fullStore.SearchMessages(ctx, store.SearchOptions{Query: "unrouted historical", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, results, 1)
|
||||
require.Equal(t, "slow-cache", results[0].ChannelName)
|
||||
}
|
||||
|
||||
func TestImportCheckpointsCacheBatches(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
for i := range checkpointEveryFiles + 1 {
|
||||
channelID := "111111111111111121"
|
||||
messageID := 333333333333333346 + i
|
||||
body := bytesf(`https://discord.com/channels/999999999999999996/%s
|
||||
{"id":"%d","channel_id":"%s","content":"checkpoint cache %d","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`, channelID, messageID, channelID, i)
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), body, 0o600))
|
||||
}
|
||||
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, checkpointEveryFiles+1, stats.FilesScanned)
|
||||
require.Equal(t, checkpointEveryFiles+1, stats.Messages)
|
||||
require.GreaterOrEqual(t, stats.Checkpoints, 2)
|
||||
|
||||
stats, err = Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
|
||||
}
|
||||
|
||||
func TestImportUsesLaterCacheMetadataBeforeCheckpointingEarlierBatch(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
|
||||
channelID := "111111111111111121"
|
||||
guildID := "999999999999999996"
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||
{"id":"333333333333333346","channel_id":"%s","content":"needs later channel metadata","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`, channelID, channelID), 0o600))
|
||||
for i := 1; i < checkpointEveryFiles; i++ {
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), bytesf(
|
||||
"https://discord.com/api/v9/channels/%s/messages?limit=50\n",
|
||||
channelID,
|
||||
), 0o600))
|
||||
}
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", checkpointEveryFiles)), bytesf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||
{"id":"%s","guild_id":"%s","type":0,"name":"later-metadata"}
|
||||
`, channelID, channelID, guildID), 0o600))
|
||||
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, checkpointEveryFiles+1+checkpointEveryFiles, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
require.GreaterOrEqual(t, stats.Checkpoints, 2)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "needs later channel metadata", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, results, 1)
|
||||
require.Equal(t, guildID, results[0].GuildID)
|
||||
require.Equal(t, "later-metadata", results[0].ChannelName)
|
||||
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||
|
||||
stats, err = Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
|
||||
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||
}
|
||||
|
||||
func TestImportCheckpointsPartiallyResolvedRetryBatch(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
|
||||
resolvedChannelID := "111111111111111121"
|
||||
unresolvedChannelID := "111111111111111122"
|
||||
guildID := "999999999999999996"
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/api/v10/channels/%s/messages?limit=50
|
||||
https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||
{"id":"333333333333333346","channel_id":"%s","content":"partially resolved retry message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
{"id":"333333333333333347","channel_id":"%s","content":"still unresolved retry message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`, resolvedChannelID, unresolvedChannelID, resolvedChannelID, unresolvedChannelID), 0o600))
|
||||
for i := 1; i < checkpointEveryFiles; i++ {
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", i)), bytesf(
|
||||
"https://discord.com/api/v9/channels/%s/messages?limit=50\n",
|
||||
resolvedChannelID,
|
||||
), 0o600))
|
||||
}
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, fmt.Sprintf("entry_%03d", checkpointEveryFiles)), bytesf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||
{"id":"%s","guild_id":"%s","type":0,"name":"partially-resolved"}
|
||||
`, resolvedChannelID, resolvedChannelID, guildID), 0o600))
|
||||
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, checkpointEveryFiles+1+checkpointEveryFiles, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
require.Equal(t, 1, stats.SkippedMessages)
|
||||
require.GreaterOrEqual(t, stats.Checkpoints, 2)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "partially resolved retry", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, results, 1)
|
||||
require.Equal(t, "partially-resolved", results[0].ChannelName)
|
||||
results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "still unresolved retry", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, results)
|
||||
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||
|
||||
stats, err = Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, checkpointEveryFiles+1, stats.FilesUnchanged)
|
||||
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||
}
|
||||
|
||||
func TestImportCheckpointsUnresolvableRouteBearingCacheMisses(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
|
||||
channelID := "111111111111111121"
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||
{"id":"333333333333333346","channel_id":"%s","content":"permanent unresolved cache miss","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`, channelID, channelID), 0o600))
|
||||
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.SkippedMessages)
|
||||
require.Equal(t, 1, stats.Checkpoints)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "permanent unresolved", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, results)
|
||||
|
||||
stats, err = Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.FilesUnchanged)
|
||||
}
|
||||
|
||||
func TestImportDoesNotAppendEventsForSkippedMixedBatch(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
|
||||
guildID := "999999999999999996"
|
||||
resolvedChannelID := "111111111111111121"
|
||||
unresolvedChannelID := "111111111111111122"
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/channels/%s/%s
|
||||
https://discord.com/api/v9/channels/%s/messages?limit=50
|
||||
{"id":"333333333333333346","channel_id":"%s","content":"mixed resolved message","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
{"id":"333333333333333347","channel_id":"%s","content":"mixed unresolved message","timestamp":"2026-04-23T18:20:44Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`, guildID, resolvedChannelID, unresolvedChannelID, resolvedChannelID, unresolvedChannelID), 0o600))
|
||||
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.Checkpoints)
|
||||
requireMessageCount(t, ctx, st, "message_events", 0)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "mixed resolved", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Len(t, results, 1)
|
||||
results, err = st.SearchMessages(ctx, store.SearchOptions{Query: "mixed unresolved", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, results)
|
||||
|
||||
stats, err = Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.FilesUnchanged)
|
||||
requireMessageCount(t, ctx, st, "message_events", 0)
|
||||
}
|
||||
|
||||
func TestImportDoesNotDuplicateEventsWhenSwitchingFullCacheModes(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
|
||||
channelID := "111111111111111121"
|
||||
guildID := "999999999999999996"
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_000"), bytesf(`https://discord.com/channels/%s/%s
|
||||
{"id":"%s","guild_id":"%s","type":0,"name":"mode-switch"}
|
||||
{"id":"333333333333333346","channel_id":"%s","content":"mode switch event once","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`, guildID, channelID, channelID, guildID, channelID), 0o600))
|
||||
|
||||
t.Run("full then default", func(t *testing.T) {
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "full-first.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir, FullCache: true})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||
|
||||
stats, err = Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.FilesUnchanged)
|
||||
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||
})
|
||||
|
||||
t.Run("default then full", func(t *testing.T) {
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "default-first.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||
|
||||
stats, err = Import(ctx, st, Options{Path: dir, FullCache: true})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.FilesUnchanged)
|
||||
requireMessageCount(t, ctx, st, "message_events", 1)
|
||||
})
|
||||
}
|
||||
|
||||
func TestImportFastCachePreservesKnownChannelMetadataAcrossBatches(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
leveldbPath := filepath.Join(dir, "Local Storage", "leveldb")
|
||||
cachePath := filepath.Join(dir, "Cache", "Cache_Data")
|
||||
require.NoError(t, os.MkdirAll(leveldbPath, 0o755))
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
|
||||
channelID := "111111111111111121"
|
||||
guildID := "999999999999999996"
|
||||
require.NoError(t, os.WriteFile(filepath.Join(leveldbPath, "000001.log"), bytesf(
|
||||
`{"id":"%s","guild_id":"%s","type":11,"name":"known-thread","thread_metadata":{"archived":false}}`,
|
||||
channelID,
|
||||
guildID,
|
||||
), 0o600))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "entry_0"), bytesf(`https://discord.com/channels/%s/%s
|
||||
{"id":"333333333333333346","channel_id":"%s","content":"thread metadata cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`, guildID, channelID, channelID), 0o600))
|
||||
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 1, stats.Messages)
|
||||
|
||||
channels, err := st.Channels(ctx, guildID)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, channels, 1)
|
||||
require.Equal(t, "known-thread", channels[0].Name)
|
||||
require.Equal(t, "thread_public", channels[0].Kind)
|
||||
|
||||
_, rows, err := st.ReadOnlyQuery(ctx, "select raw_json from channels where id = '111111111111111121'")
|
||||
require.NoError(t, err)
|
||||
require.Len(t, rows, 1)
|
||||
require.Contains(t, rows[0][0], `"type":11`)
|
||||
}
|
||||
|
||||
func TestImportFastCacheRouteFiltersServiceWorkerCacheStorage(t *testing.T) {
|
||||
ctx := context.Background()
|
||||
dir := t.TempDir()
|
||||
cachePath := filepath.Join(dir, "Service Worker", "CacheStorage", "cache-id")
|
||||
require.NoError(t, os.MkdirAll(cachePath, 0o755))
|
||||
require.NoError(t, os.WriteFile(filepath.Join(cachePath, "unrouted"), []byte(`
|
||||
{"id":"111111111111111121","guild_id":"999999999999999996","type":0,"name":"service-worker-cache"}
|
||||
{"id":"333333333333333346","channel_id":"111111111111111121","content":"service worker historical cache","timestamp":"2026-04-23T18:20:43Z","author":{"id":"222222222222222232","username":"alice"}}
|
||||
`), 0o600))
|
||||
|
||||
st, err := store.Open(ctx, filepath.Join(dir, "discrawl.db"))
|
||||
require.NoError(t, err)
|
||||
defer func() { _ = st.Close() }()
|
||||
|
||||
stats, err := Import(ctx, st, Options{Path: dir})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, 0, stats.FilesScanned)
|
||||
require.Equal(t, 1, stats.CacheFilesFastSkipped)
|
||||
|
||||
results, err := st.SearchMessages(ctx, store.SearchOptions{Query: "service worker historical", Limit: 10})
|
||||
require.NoError(t, err)
|
||||
require.Empty(t, results)
|
||||
}
|
||||
|
||||
func requireMessageCount(t *testing.T, ctx context.Context, st *store.Store, table string, expected int) {
|
||||
t.Helper()
|
||||
_, rows, err := st.ReadOnlyQuery(ctx, "select count(*) from "+table)
|
||||
require.NoError(t, err)
|
||||
require.Len(t, rows, 1)
|
||||
require.Len(t, rows[0], 1)
|
||||
require.Equal(t, strconv.Itoa(expected), rows[0][0])
|
||||
}
|
||||
|
||||
// bytesf formats args according to format, like fmt.Sprintf, and returns the
// result as a byte slice.
func bytesf(format string, args ...any) []byte {
	var out []byte
	out = fmt.Appendf(out, format, args...)
	return out
}
|
||||
@ -1,110 +0,0 @@
|
||||
package discorddesktop
|
||||
|
||||
import (
|
||||
"context"
|
||||
"os"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
)
|
||||
|
||||
type importRun struct {
|
||||
ctx context.Context
|
||||
st *store.Store
|
||||
opts Options
|
||||
state scanState
|
||||
rootFS *os.Root
|
||||
channelLookup map[string]store.ChannelRecord
|
||||
totals scanTotals
|
||||
stats *Stats
|
||||
base snapshot
|
||||
pending []fileCandidate
|
||||
pendingUnresolved unresolvedMessages
|
||||
pendingLookupSize int
|
||||
pendingRouteSize int
|
||||
}
|
||||
|
||||
func newImportRun(ctx context.Context, st *store.Store, opts Options, state scanState, rootFS *os.Root, stats *Stats) *importRun {
|
||||
return &importRun{
|
||||
ctx: ctx,
|
||||
st: st,
|
||||
opts: opts,
|
||||
state: state,
|
||||
rootFS: rootFS,
|
||||
channelLookup: copyChannelLookup(state.channels),
|
||||
totals: newScanTotals(),
|
||||
stats: stats,
|
||||
base: newSnapshot(),
|
||||
pendingUnresolved: unresolvedMessages{},
|
||||
pendingLookupSize: -1,
|
||||
pendingRouteSize: -1,
|
||||
}
|
||||
}
|
||||
|
||||
func (r *importRun) scanContext(candidates []fileCandidate) error {
|
||||
if err := scanCandidates(r.ctx, r.rootFS, r.opts, candidates, r.base, r.channelLookup, r.stats); err != nil {
|
||||
return err
|
||||
}
|
||||
return r.finalizeAndCommit(candidates, r.base, false)
|
||||
}
|
||||
|
||||
func (r *importRun) scanCacheBatches(candidates []fileCandidate) error {
|
||||
for start := 0; start < len(candidates); start += checkpointEveryFiles {
|
||||
end := min(start+checkpointEveryFiles, len(candidates))
|
||||
batchCandidates := candidates[start:end]
|
||||
batch := newSnapshotWithContext(r.base)
|
||||
if err := scanCandidates(r.ctx, r.rootFS, r.opts, batchCandidates, batch, r.channelLookup, r.stats); err != nil {
|
||||
return err
|
||||
}
|
||||
if err := r.finalizeAndCommit(batchCandidates, batch, false); err != nil {
|
||||
return err
|
||||
}
|
||||
mergeSnapshotContext(r.base, batch)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *importRun) finalizeAndCommit(candidates []fileCandidate, snap snapshot, recordSkipped bool) error {
|
||||
unresolved := finalizeSnapshot(snap, r.channelLookup, r.totals, r.stats, recordSkipped)
|
||||
checkpoint := len(unresolved) == 0
|
||||
if !checkpoint {
|
||||
r.deferCandidates(candidates, unresolved)
|
||||
}
|
||||
if len(candidates) == 0 && !snapshotHasChanges(snap) {
|
||||
return nil
|
||||
}
|
||||
return commitSnapshot(r.ctx, r.st, r.opts, r.state, candidates, snap, checkpoint, r.stats)
|
||||
}
|
||||
|
||||
func (r *importRun) deferCandidates(candidates []fileCandidate, unresolved unresolvedMessages) {
|
||||
r.pending = append(r.pending, candidates...)
|
||||
mergeUnresolved(r.pendingUnresolved, unresolved)
|
||||
if r.pendingLookupSize >= 0 {
|
||||
return
|
||||
}
|
||||
r.pendingLookupSize = len(r.channelLookup)
|
||||
r.pendingRouteSize = len(r.base.routes)
|
||||
}
|
||||
|
||||
func (r *importRun) retryPending() error {
|
||||
if len(r.pending) == 0 {
|
||||
return nil
|
||||
}
|
||||
if !r.pendingCanResolve() {
|
||||
recordUnresolved(r.pendingUnresolved, r.totals, r.stats)
|
||||
return checkpointScannedCandidates(r.ctx, r.st, r.opts, r.state, r.pending, r.stats)
|
||||
}
|
||||
retry := newSnapshotWithContext(r.base)
|
||||
if err := scanCandidates(r.ctx, r.rootFS, r.opts, r.pending, retry, r.channelLookup, r.stats); err != nil {
|
||||
return err
|
||||
}
|
||||
finalizeSnapshot(retry, r.channelLookup, r.totals, r.stats, true)
|
||||
if err := commitSnapshot(r.ctx, r.st, r.opts, r.state, r.pending, retry, true, r.stats); err != nil {
|
||||
return err
|
||||
}
|
||||
mergeSnapshotContext(r.base, retry)
|
||||
return nil
|
||||
}
|
||||
|
||||
func (r *importRun) pendingCanResolve() bool {
|
||||
return len(r.channelLookup) > r.pendingLookupSize || len(r.base.routes) > r.pendingRouteSize
|
||||
}
|
||||
@ -13,7 +13,7 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestDesktopPathAndImportHelpers(t *testing.T) {
|
||||
|
||||
@ -1,165 +0,0 @@
|
||||
package discorddesktop
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestPrimitiveValueHelpers(t *testing.T) {
|
||||
raw := map[string]any{
|
||||
"string": "value",
|
||||
"blank": " ",
|
||||
"int": 3,
|
||||
"int64": int64(4),
|
||||
"float": float64(5),
|
||||
"json_number": json.Number("6"),
|
||||
"numeric": "7",
|
||||
"bad_numeric": "nope",
|
||||
"truthy": true,
|
||||
"array": []any{"one", "two"},
|
||||
}
|
||||
|
||||
require.Equal(t, "value", stringField(raw, "string"))
|
||||
require.Empty(t, stringField(raw, "blank"))
|
||||
require.Equal(t, "6", stringField(raw, "json_number"))
|
||||
require.Empty(t, stringField(raw, "int"))
|
||||
require.Empty(t, stringField(raw, "missing"))
|
||||
|
||||
for key, want := range map[string]int{
|
||||
"int": 3,
|
||||
"float": 5,
|
||||
"json_number": 6,
|
||||
} {
|
||||
got, ok := intField(raw, key)
|
||||
require.True(t, ok, key)
|
||||
require.Equal(t, want, got, key)
|
||||
}
|
||||
_, ok := intField(raw, "bad_numeric")
|
||||
require.False(t, ok)
|
||||
_, ok = intField(raw, "int64")
|
||||
require.False(t, ok)
|
||||
_, ok = intField(raw, "numeric")
|
||||
require.False(t, ok)
|
||||
_, ok = intField(raw, "missing")
|
||||
require.False(t, ok)
|
||||
|
||||
require.Equal(t, int64(3), int64Field(raw, "int"))
|
||||
require.Equal(t, int64(4), int64Field(raw, "int64"))
|
||||
require.Equal(t, int64(5), int64Field(raw, "float"))
|
||||
require.Equal(t, int64(6), int64Field(raw, "json_number"))
|
||||
require.Zero(t, int64Field(raw, "numeric"))
|
||||
require.Zero(t, int64Field(raw, "bad_numeric"))
|
||||
|
||||
require.True(t, boolField(raw, "truthy"))
|
||||
require.False(t, boolField(raw, "missing"))
|
||||
require.Equal(t, 2, lenArray(raw["array"]))
|
||||
require.Zero(t, lenArray(raw["string"]))
|
||||
require.Equal(t, "fallback", firstNonEmpty("", " ", "fallback", "later"))
|
||||
require.Empty(t, firstNonEmpty("", " "))
|
||||
}
|
||||
|
||||
func TestDiscordValueFormatHelpers(t *testing.T) {
|
||||
require.Equal(t, "456789", shortID("123456789"))
|
||||
require.Equal(t, "short", shortID("short"))
|
||||
require.Equal(t, "Discord Direct Messages", guildName(DirectMessageGuildID))
|
||||
require.Equal(t, "Discord Desktop Guild 123456", guildName("123456"))
|
||||
|
||||
require.Equal(t, "dm", kindForChannelType(1, true))
|
||||
require.Equal(t, "group_dm", kindForChannelType(3, true))
|
||||
require.Equal(t, "thread_public", kindForChannelType(11, false))
|
||||
require.Equal(t, "thread_private", kindForChannelType(12, false))
|
||||
require.Equal(t, "thread_announcement", kindForChannelType(10, false))
|
||||
require.Equal(t, "desktop", kindForChannelType(2, false))
|
||||
require.Equal(t, "desktop", kindForChannelType(4, false))
|
||||
require.Equal(t, "announcement", kindForChannelType(5, false))
|
||||
require.Equal(t, "forum", kindForChannelType(15, false))
|
||||
require.Equal(t, "desktop", kindForChannelType(16, false))
|
||||
require.Equal(t, "text", kindForChannelType(0, false))
|
||||
}
|
||||
|
||||
func TestDiscordMessagePayloadHelpers(t *testing.T) {
|
||||
raw := map[string]any{
|
||||
"id": "333333333333333333",
|
||||
"channel_id": "111111111111111111",
|
||||
"guild_id": "999999999999999999",
|
||||
"type": float64(0),
|
||||
"timestamp": "2026-05-08T12:00:00Z",
|
||||
"edited_timestamp": "2026-05-08T12:05:00Z",
|
||||
"content": "hello\u200b\nworld",
|
||||
"message_reference": map[string]any{"message_id": "222222222222222222"},
|
||||
"author": map[string]any{
|
||||
"id": "444444444444444444",
|
||||
"username": "peter",
|
||||
"global_name": "Peter",
|
||||
"display_name": "Peter S",
|
||||
"discriminator": "0",
|
||||
"bot": true,
|
||||
},
|
||||
"attachments": []any{
|
||||
map[string]any{"filename": "trace.txt", "content_type": "text/plain", "size": float64(12), "url": "https://cdn.example/trace.txt"},
|
||||
map[string]any{"id": "att2"},
|
||||
"ignored",
|
||||
},
|
||||
"mentions": []any{
|
||||
map[string]any{"id": "555555555555555555", "username": "alice", "global_name": "Alice"},
|
||||
map[string]any{"username": "missing"},
|
||||
},
|
||||
"embeds": []any{
|
||||
map[string]any{"title": "Deploy", "description": "Ready"},
|
||||
map[string]any{"title": " "},
|
||||
},
|
||||
}
|
||||
at := parseDiscordTime("2026-05-08T12:00:00Z")
|
||||
attachments := parseAttachments(raw, "333333333333333333", "999999999999999999", "111111111111111111", "444444444444444444")
|
||||
require.Len(t, attachments, 2)
|
||||
require.Equal(t, "333333333333333333:0", attachments[0].AttachmentID)
|
||||
require.Equal(t, "trace.txt", attachments[0].Filename)
|
||||
require.Equal(t, "att2", attachments[1].Filename)
|
||||
require.Equal(t, []string{"trace.txt", "att2"}, attachmentText(attachments))
|
||||
|
||||
mentions := parseMentions(raw, "333333333333333333", "999999999999999999", "111111111111111111", "444444444444444444", at)
|
||||
require.Equal(t, []store.MentionEventRecord{{
|
||||
MessageID: "333333333333333333",
|
||||
GuildID: "999999999999999999",
|
||||
ChannelID: "111111111111111111",
|
||||
AuthorID: "444444444444444444",
|
||||
TargetType: "user",
|
||||
TargetID: "555555555555555555",
|
||||
TargetName: "Alice",
|
||||
EventAt: at.Format(time.RFC3339Nano),
|
||||
}}, mentions)
|
||||
|
||||
require.Equal(t, []string{"Deploy", "Ready"}, embedText(raw))
|
||||
require.Equal(t, "helloworld\ntrace.txt\natt2\nDeploy\nReady", normalizeText(raw["content"], attachmentText(attachments), embedText(raw)))
|
||||
require.Equal(t, "hidden text", cleanText("\u200bhidden\x00 text\n"))
|
||||
require.Equal(t, "222222222222222222", messageReferenceID(raw))
|
||||
require.Empty(t, messageReferenceID(map[string]any{}))
|
||||
|
||||
require.Contains(t, syntheticGuild("g1", "Guild").RawJSON, "discord_desktop")
|
||||
require.Equal(t, "dm", syntheticChannel("c1", DirectMessageGuildID, "Alice").Kind)
|
||||
require.Equal(t, "group_dm", syntheticChannel("c2", DirectMessageGuildID, "Alice, Bob").Kind)
|
||||
require.Equal(t, "channel-123456", syntheticChannel("123456123456", "g1", "").Name)
|
||||
require.Contains(t, channelRawJSON(raw, "c1", "g1", "general", "text"), `"kind":"text"`)
|
||||
require.Contains(t, messageRawJSON(raw, "333333333333333333", "999999999999999999", "111111111111111111", "444444444444444444"), "desktop_cache_note")
|
||||
require.Equal(t, "Alice, Bob", recipientLabel([]any{
|
||||
map[string]any{"username": "Bob"},
|
||||
map[string]any{"global_name": "Alice"},
|
||||
map[string]any{},
|
||||
}))
|
||||
|
||||
require.True(t, parseDiscordTime("2026-05-08T12:00:00.123Z").Equal(time.Date(2026, 5, 8, 12, 0, 0, 123000000, time.UTC)))
|
||||
require.True(t, parseDiscordTime("bad").IsZero())
|
||||
require.True(t, parseDiscordTime("").IsZero())
|
||||
require.False(t, snowflakeTime("175928847299117063").IsZero())
|
||||
require.True(t, snowflakeTime("bad").IsZero())
|
||||
require.Empty(t, formatOptionalTime(time.Time{}))
|
||||
require.Equal(t, "2026-05-08T12:00:00Z", formatOptionalTime(at))
|
||||
require.True(t, looksSnowflake("123456789012"))
|
||||
require.False(t, looksSnowflake("123"))
|
||||
require.False(t, looksSnowflake("12345678901x"))
|
||||
}
|
||||
91
internal/embed/ollama.go
Normal file
91
internal/embed/ollama.go
Normal file
@ -0,0 +1,91 @@
|
||||
package embed
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"context"
|
||||
"encoding/json"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// ollamaProvider implements Provider against an Ollama server.
type ollamaProvider struct {
	// client performs the HTTP requests.
	client *http.Client
	// baseURL is the server root; Embed appends "/api/embed" to it.
	baseURL string
	// model is sent with every request and used as the fallback batch model.
	model string
	// maxInputChars is the per-input limit handed to trimInputs.
	maxInputChars int
}

// ollamaEmbedRequest is the JSON body posted to the Ollama embed endpoint.
type ollamaEmbedRequest struct {
	Model string   `json:"model"`
	Input []string `json:"input"`
}

// ollamaEmbedResponse is the subset of the Ollama embed response that is decoded.
type ollamaEmbedResponse struct {
	Model      string      `json:"model"`
	Embeddings [][]float32 `json:"embeddings"`
}
|
||||
|
||||
func newOllamaProvider(settings providerSettings) Provider {
|
||||
return &ollamaProvider{
|
||||
client: settings.HTTPClient,
|
||||
baseURL: settings.BaseURL,
|
||||
model: settings.Model,
|
||||
maxInputChars: settings.MaxInputChars,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *ollamaProvider) Embed(ctx context.Context, inputs []string) (EmbeddingBatch, error) {
|
||||
if len(inputs) == 0 {
|
||||
return EmbeddingBatch{Model: p.model}, nil
|
||||
}
|
||||
payload := ollamaEmbedRequest{
|
||||
Model: p.model,
|
||||
Input: trimInputs(inputs, p.maxInputChars),
|
||||
}
|
||||
var response ollamaEmbedResponse
|
||||
if err := postJSON(ctx, p.client, p.baseURL+"/api/embed", "", payload, &response); err != nil {
|
||||
return EmbeddingBatch{}, err
|
||||
}
|
||||
if len(response.Embeddings) != len(inputs) {
|
||||
return EmbeddingBatch{}, fmt.Errorf("ollama embedding response returned %d vectors for %d inputs", len(response.Embeddings), len(inputs))
|
||||
}
|
||||
dimensions, err := inferDimensions(response.Embeddings)
|
||||
if err != nil {
|
||||
return EmbeddingBatch{}, err
|
||||
}
|
||||
model := response.Model
|
||||
if model == "" {
|
||||
model = p.model
|
||||
}
|
||||
return EmbeddingBatch{Model: model, Dimensions: dimensions, Vectors: response.Embeddings}, nil
|
||||
}
|
||||
|
||||
func postJSON(ctx context.Context, client *http.Client, endpoint, apiKey string, payload any, target any) error {
|
||||
body, err := json.Marshal(payload)
|
||||
if err != nil {
|
||||
return fmt.Errorf("marshal embedding request: %w", err)
|
||||
}
|
||||
req, err := http.NewRequestWithContext(ctx, http.MethodPost, endpoint, bytes.NewReader(body))
|
||||
if err != nil {
|
||||
return fmt.Errorf("build embedding request: %w", err)
|
||||
}
|
||||
req.Header.Set("Content-Type", "application/json")
|
||||
req.Header.Set("Accept", "application/json")
|
||||
if apiKey != "" {
|
||||
req.Header.Set("Authorization", "Bearer "+apiKey)
|
||||
}
|
||||
resp, err := client.Do(req)
|
||||
if err != nil {
|
||||
return fmt.Errorf("embedding request failed: %w", err)
|
||||
}
|
||||
defer func() { _ = resp.Body.Close() }()
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
msg, _ := io.ReadAll(io.LimitReader(resp.Body, 4096))
|
||||
return &HTTPError{StatusCode: resp.StatusCode, Body: string(msg)}
|
||||
}
|
||||
if err := json.NewDecoder(resp.Body).Decode(target); err != nil {
|
||||
return fmt.Errorf("decode embedding response: %w", err)
|
||||
}
|
||||
return nil
|
||||
}
|
||||
82
internal/embed/openai_compatible.go
Normal file
82
internal/embed/openai_compatible.go
Normal file
@ -0,0 +1,82 @@
|
||||
package embed
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"net/http"
|
||||
)
|
||||
|
||||
// openAICompatibleProvider implements Provider against any server exposing an
// OpenAI-style embeddings endpoint.
type openAICompatibleProvider struct {
	// client performs the HTTP requests.
	client *http.Client
	// baseURL is the API root; Embed appends "/embeddings" to it.
	baseURL string
	// apiKey is passed to postJSON for every request; it may be empty.
	apiKey string
	// model is sent with every request and used as the fallback batch model.
	model string
	// maxInputChars is the per-input limit handed to trimInputs.
	maxInputChars int
}

// openAIEmbeddingRequest is the JSON body posted to the embeddings endpoint.
type openAIEmbeddingRequest struct {
	Model string   `json:"model"`
	Input []string `json:"input"`
}

// openAIEmbeddingResponse is the subset of the embeddings response that is decoded.
type openAIEmbeddingResponse struct {
	Model string                `json:"model"`
	Data  []openAIEmbeddingItem `json:"data"`
}

// openAIEmbeddingItem is one returned vector. Index is a pointer so that a
// missing "index" field can be told apart from an explicit 0.
type openAIEmbeddingItem struct {
	Index     *int      `json:"index"`
	Embedding []float32 `json:"embedding"`
}
|
||||
|
||||
func newOpenAICompatibleProvider(settings providerSettings) Provider {
|
||||
return &openAICompatibleProvider{
|
||||
client: settings.HTTPClient,
|
||||
baseURL: settings.BaseURL,
|
||||
apiKey: settings.APIKey,
|
||||
model: settings.Model,
|
||||
maxInputChars: settings.MaxInputChars,
|
||||
}
|
||||
}
|
||||
|
||||
func (p *openAICompatibleProvider) Embed(ctx context.Context, inputs []string) (EmbeddingBatch, error) {
|
||||
if len(inputs) == 0 {
|
||||
return EmbeddingBatch{Model: p.model}, nil
|
||||
}
|
||||
payload := openAIEmbeddingRequest{
|
||||
Model: p.model,
|
||||
Input: trimInputs(inputs, p.maxInputChars),
|
||||
}
|
||||
var response openAIEmbeddingResponse
|
||||
if err := postJSON(ctx, p.client, p.baseURL+"/embeddings", p.apiKey, payload, &response); err != nil {
|
||||
return EmbeddingBatch{}, err
|
||||
}
|
||||
if len(response.Data) != len(inputs) {
|
||||
return EmbeddingBatch{}, fmt.Errorf("openai-compatible embedding response returned %d vectors for %d inputs", len(response.Data), len(inputs))
|
||||
}
|
||||
vectors := make([][]float32, len(inputs))
|
||||
seen := make([]bool, len(inputs))
|
||||
for position, item := range response.Data {
|
||||
index := position
|
||||
if item.Index != nil {
|
||||
index = *item.Index
|
||||
}
|
||||
if index < 0 || index >= len(inputs) {
|
||||
return EmbeddingBatch{}, fmt.Errorf("openai-compatible embedding response index %d out of range", index)
|
||||
}
|
||||
if seen[index] {
|
||||
return EmbeddingBatch{}, fmt.Errorf("openai-compatible embedding response duplicated index %d", index)
|
||||
}
|
||||
seen[index] = true
|
||||
vectors[index] = item.Embedding
|
||||
}
|
||||
dimensions, err := inferDimensions(vectors)
|
||||
if err != nil {
|
||||
return EmbeddingBatch{}, err
|
||||
}
|
||||
model := response.Model
|
||||
if model == "" {
|
||||
model = p.model
|
||||
}
|
||||
return EmbeddingBatch{Model: model, Dimensions: dimensions, Vectors: vectors}, nil
|
||||
}
|
||||
310
internal/embed/provider.go
Normal file
310
internal/embed/provider.go
Normal file
@ -0,0 +1,310 @@
|
||||
package embed
|
||||
|
||||
import (
|
||||
"context"
|
||||
"errors"
|
||||
"fmt"
|
||||
"net"
|
||||
"net/http"
|
||||
"net/url"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
)
|
||||
|
||||
// Supported embedding provider names, as compared against the normalized
// (trimmed, lower-cased) provider setting.
const (
	ProviderOpenAI           = "openai"
	ProviderOllama           = "ollama"
	ProviderLlamaCpp         = "llamacpp"
	ProviderOpenAICompatible = "openai_compatible"
)

// Base URLs used when the configuration leaves base_url empty.
const (
	DefaultOpenAIBaseURL   = "https://api.openai.com/v1"
	DefaultOllamaBaseURL   = "http://127.0.0.1:11434"
	DefaultLlamaCppBaseURL = "http://127.0.0.1:8080/v1"
)

// Models used when the configuration leaves model empty.
const (
	DefaultOpenAIModel         = "text-embedding-3-small"
	DefaultLocalEmbeddingModel = "nomic-embed-text"
)

// Default limits and timeouts.
const (
	DefaultBatchSize      = 64
	DefaultMaxInputChars  = 12000
	DefaultRequestTimeout = 2 * time.Minute
	DefaultProbeTimeout   = 2 * time.Second
)
|
||||
|
||||
// Provider is the minimal embedding interface consumed by later queue and
// search work: it turns a slice of input texts into one EmbeddingBatch.
type Provider interface {
	Embed(ctx context.Context, inputs []string) (EmbeddingBatch, error)
}

// EmbeddingBatch is the result of a single Embed call.
type EmbeddingBatch struct {
	// Model is the model name reported for the batch.
	Model string
	// Dimensions is the common length of the vectors (0 for an empty batch).
	Dimensions int
	// Vectors holds the embeddings for the batch.
	Vectors [][]float32
}
|
||||
|
||||
// HTTPError describes an embedding request that was answered with a
// non-success HTTP status.
type HTTPError struct {
	// StatusCode is the HTTP status returned by the server.
	StatusCode int
	// Body is the response body text captured alongside the status.
	Body string
}

// Error implements the error interface.
func (e *HTTPError) Error() string {
	const format = "embedding request failed with HTTP %d: %s"
	return fmt.Sprintf(format, e.StatusCode, e.Body)
}

// IsRateLimitError reports whether err is, or wraps, an *HTTPError whose
// status is 429 Too Many Requests.
func IsRateLimitError(err error) bool {
	var target *HTTPError
	if !errors.As(err, &target) {
		return false
	}
	return target.StatusCode == http.StatusTooManyRequests
}
|
||||
|
||||
// CheckResult is the outcome reported by CheckProvider.
type CheckResult struct {
	// Provider, Model and BaseURL describe the checked configuration.
	Provider string
	Model    string
	BaseURL  string
	// Status is "ok" or "warning".
	Status string
	// Warning holds the error text when Status is "warning".
	Warning string
	// Probed is true only after a probe request succeeded.
	Probed bool
}
|
||||
|
||||
// Option customizes how a provider is resolved.
type Option func(*providerOptions)

// providerOptions collects the values set through Option functions.
type providerOptions struct {
	// httpClient, when non-nil, replaces the client built from the timeout.
	httpClient *http.Client
	// timeoutOverride, when positive, can lower the resolved request timeout.
	timeoutOverride time.Duration
}

// providerSettings is the fully resolved configuration handed to a provider
// constructor.
type providerSettings struct {
	Name          string
	Model         string
	BaseURL       string
	APIKey        string
	MaxInputChars int
	Timeout       time.Duration
	HTTPClient    *http.Client
}

// WithHTTPClient makes the provider use client for its requests.
func WithHTTPClient(client *http.Client) Option {
	return func(o *providerOptions) { o.httpClient = client }
}

// WithRequestTimeout records timeout as the request-timeout override.
func WithRequestTimeout(timeout time.Duration) Option {
	return func(o *providerOptions) { o.timeoutOverride = timeout }
}
|
||||
|
||||
func NewProvider(cfg config.EmbeddingsConfig, opts ...Option) (Provider, error) {
|
||||
settings, err := resolveProviderConfig(cfg, true, opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
return newProvider(settings)
|
||||
}
|
||||
|
||||
func CheckProvider(ctx context.Context, cfg config.EmbeddingsConfig) CheckResult {
|
||||
settings, err := resolveProviderConfig(cfg, true, WithRequestTimeout(DefaultProbeTimeout))
|
||||
if err != nil {
|
||||
return CheckResult{
|
||||
Provider: normalizedProviderName(cfg.Provider),
|
||||
Model: strings.TrimSpace(cfg.Model),
|
||||
BaseURL: strings.TrimSpace(cfg.BaseURL),
|
||||
Status: "warning",
|
||||
Warning: err.Error(),
|
||||
}
|
||||
}
|
||||
result := CheckResult{
|
||||
Provider: settings.Name,
|
||||
Model: settings.Model,
|
||||
BaseURL: settings.BaseURL,
|
||||
Status: "ok",
|
||||
}
|
||||
if !shouldProbe(settings) {
|
||||
return result
|
||||
}
|
||||
provider, err := newProvider(settings)
|
||||
if err != nil {
|
||||
result.Status = "warning"
|
||||
result.Warning = err.Error()
|
||||
return result
|
||||
}
|
||||
probeCtx, cancel := context.WithTimeout(ctx, DefaultProbeTimeout)
|
||||
defer cancel()
|
||||
if _, err := provider.Embed(probeCtx, []string{"discrawl probe"}); err != nil {
|
||||
result.Status = "warning"
|
||||
result.Warning = err.Error()
|
||||
return result
|
||||
}
|
||||
result.Probed = true
|
||||
return result
|
||||
}
|
||||
|
||||
func resolveProviderConfig(cfg config.EmbeddingsConfig, validateAPIKey bool, opts ...Option) (providerSettings, error) {
|
||||
options := providerOptions{}
|
||||
for _, opt := range opts {
|
||||
opt(&options)
|
||||
}
|
||||
name := normalizedProviderName(cfg.Provider)
|
||||
if name == "" {
|
||||
name = ProviderOpenAI
|
||||
}
|
||||
model := strings.TrimSpace(cfg.Model)
|
||||
if model == "" {
|
||||
model = defaultModel(name)
|
||||
}
|
||||
baseURL := strings.TrimRight(strings.TrimSpace(cfg.BaseURL), "/")
|
||||
if baseURL == "" {
|
||||
switch name {
|
||||
case ProviderOpenAI:
|
||||
baseURL = DefaultOpenAIBaseURL
|
||||
case ProviderOllama:
|
||||
baseURL = DefaultOllamaBaseURL
|
||||
case ProviderLlamaCpp:
|
||||
baseURL = DefaultLlamaCppBaseURL
|
||||
case ProviderOpenAICompatible:
|
||||
return providerSettings{}, fmt.Errorf("embedding provider %q requires base_url", name)
|
||||
}
|
||||
}
|
||||
timeout := DefaultRequestTimeout
|
||||
if strings.TrimSpace(cfg.RequestTimeout) != "" {
|
||||
parsed, err := time.ParseDuration(cfg.RequestTimeout)
|
||||
if err != nil {
|
||||
return providerSettings{}, fmt.Errorf("parse embeddings request_timeout: %w", err)
|
||||
}
|
||||
if parsed <= 0 {
|
||||
return providerSettings{}, errors.New("embeddings request_timeout must be positive")
|
||||
}
|
||||
timeout = parsed
|
||||
}
|
||||
if options.timeoutOverride > 0 && options.timeoutOverride < timeout {
|
||||
timeout = options.timeoutOverride
|
||||
}
|
||||
maxInputChars := cfg.MaxInputChars
|
||||
if maxInputChars <= 0 {
|
||||
maxInputChars = DefaultMaxInputChars
|
||||
}
|
||||
switch name {
|
||||
case ProviderOpenAI, ProviderOllama, ProviderLlamaCpp, ProviderOpenAICompatible:
|
||||
default:
|
||||
return providerSettings{}, fmt.Errorf("unsupported embedding provider %q", name)
|
||||
}
|
||||
apiKey, err := resolveAPIKey(name, cfg.APIKeyEnv, validateAPIKey)
|
||||
if err != nil {
|
||||
return providerSettings{}, err
|
||||
}
|
||||
client := options.httpClient
|
||||
if client == nil {
|
||||
client = &http.Client{Timeout: timeout}
|
||||
}
|
||||
if _, err := url.ParseRequestURI(baseURL); err != nil {
|
||||
return providerSettings{}, fmt.Errorf("invalid embeddings base_url %q: %w", baseURL, err)
|
||||
}
|
||||
return providerSettings{
|
||||
Name: name,
|
||||
Model: model,
|
||||
BaseURL: baseURL,
|
||||
APIKey: apiKey,
|
||||
MaxInputChars: maxInputChars,
|
||||
Timeout: timeout,
|
||||
HTTPClient: client,
|
||||
}, nil
|
||||
}
|
||||
|
||||
func newProvider(settings providerSettings) (Provider, error) {
|
||||
switch settings.Name {
|
||||
case ProviderOllama:
|
||||
return newOllamaProvider(settings), nil
|
||||
case ProviderOpenAI, ProviderLlamaCpp, ProviderOpenAICompatible:
|
||||
return newOpenAICompatibleProvider(settings), nil
|
||||
default:
|
||||
return nil, fmt.Errorf("unsupported embedding provider %q", settings.Name)
|
||||
}
|
||||
}
|
||||
|
||||
func resolveAPIKey(provider, apiKeyEnv string, validate bool) (string, error) {
|
||||
envName := strings.TrimSpace(apiKeyEnv)
|
||||
required := provider == ProviderOpenAI
|
||||
if envName == "" {
|
||||
if required {
|
||||
envName = "OPENAI_API_KEY"
|
||||
} else {
|
||||
return "", nil
|
||||
}
|
||||
}
|
||||
value := strings.TrimSpace(os.Getenv(envName))
|
||||
if value == "" {
|
||||
if required || validate {
|
||||
return "", fmt.Errorf("embedding provider %q requires API key env %s", provider, envName)
|
||||
}
|
||||
return "", nil
|
||||
}
|
||||
return value, nil
|
||||
}
|
||||
|
||||
// normalizedProviderName returns provider with surrounding whitespace removed
// and all letters lower-cased.
func normalizedProviderName(provider string) string {
	trimmed := strings.TrimSpace(provider)
	return strings.ToLower(trimmed)
}
|
||||
|
||||
func defaultModel(provider string) string {
|
||||
switch provider {
|
||||
case ProviderOllama, ProviderLlamaCpp:
|
||||
return DefaultLocalEmbeddingModel
|
||||
default:
|
||||
return DefaultOpenAIModel
|
||||
}
|
||||
}
|
||||
|
||||
func shouldProbe(settings providerSettings) bool {
|
||||
switch settings.Name {
|
||||
case ProviderOllama, ProviderLlamaCpp:
|
||||
return true
|
||||
case ProviderOpenAICompatible:
|
||||
return isLoopbackBaseURL(settings.BaseURL)
|
||||
default:
|
||||
return false
|
||||
}
|
||||
}
|
||||
|
||||
// isLoopbackBaseURL reports whether rawURL points at the local machine: its
// host is "localhost" (compared case-insensitively, since host names are not
// case-sensitive and url.Parse does not lower-case them) or an IP literal in
// a loopback range. Unparseable URLs and all other hosts yield false.
func isLoopbackBaseURL(rawURL string) bool {
	parsed, err := url.Parse(rawURL)
	if err != nil {
		return false
	}
	// Hostname strips the port and the brackets around IPv6 literals.
	host := parsed.Hostname()
	if strings.EqualFold(host, "localhost") {
		return true
	}
	ip := net.ParseIP(host)
	return ip != nil && ip.IsLoopback()
}
|
||||
|
||||
func trimInputs(inputs []string, maxChars int) []string {
|
||||
if maxChars <= 0 {
|
||||
maxChars = DefaultMaxInputChars
|
||||
}
|
||||
out := make([]string, len(inputs))
|
||||
for i, input := range inputs {
|
||||
runes := []rune(input)
|
||||
if len(runes) > maxChars {
|
||||
runes = runes[:maxChars]
|
||||
}
|
||||
out[i] = string(runes)
|
||||
}
|
||||
return out
|
||||
}
|
||||
|
||||
// inferDimensions returns the common length of vectors. It fails when any
// vector is empty or when two vectors differ in length. With no vectors at
// all it returns 0 and no error.
func inferDimensions(vectors [][]float32) (int, error) {
	want := 0
	for i := range vectors {
		got := len(vectors[i])
		switch {
		case got == 0:
			return 0, errors.New("embedding response contained an empty vector")
		case want == 0:
			// First vector fixes the expected dimensionality.
			want = got
		case got != want:
			return 0, fmt.Errorf("embedding response dimensions mismatch: got %d want %d", got, want)
		}
	}
	return want, nil
}
|
||||
319
internal/embed/provider_test.go
Normal file
319
internal/embed/provider_test.go
Normal file
@ -0,0 +1,319 @@
|
||||
package embed
|
||||
|
||||
import (
|
||||
"context"
|
||||
"encoding/json"
|
||||
"net/http"
|
||||
"net/http/httptest"
|
||||
"testing"
|
||||
"time"
|
||||
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/steipete/discrawl/internal/config"
|
||||
)
|
||||
|
||||
func TestOllamaProviderEmbeds(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
assert.Equal(t, "/api/embed", r.URL.Path)
|
||||
assert.Equal(t, http.MethodPost, r.Method)
|
||||
var req ollamaEmbedRequest
|
||||
assert.NoError(t, json.NewDecoder(r.Body).Decode(&req))
|
||||
assert.Equal(t, "nomic-embed-text", req.Model)
|
||||
assert.Equal(t, []string{"abcd", "xy"}, req.Input)
|
||||
_, _ = w.Write([]byte(`{"model":"nomic-embed-text","embeddings":[[1,2,3],[4,5,6]]}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
provider, err := NewProvider(config.EmbeddingsConfig{
|
||||
Provider: ProviderOllama,
|
||||
Model: "nomic-embed-text",
|
||||
BaseURL: server.URL,
|
||||
MaxInputChars: 4,
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
batch, err := provider.Embed(context.Background(), []string{"abcdef", "xy"})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "nomic-embed-text", batch.Model)
|
||||
require.Equal(t, 3, batch.Dimensions)
|
||||
require.Equal(t, [][]float32{{1, 2, 3}, {4, 5, 6}}, batch.Vectors)
|
||||
}
|
||||
|
||||
func TestOpenAICompatibleProviderEmbedsAndUsesAuth(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
assert.Equal(t, "/embeddings", r.URL.Path)
|
||||
assert.Equal(t, "Bearer secret", r.Header.Get("Authorization"))
|
||||
var req openAIEmbeddingRequest
|
||||
assert.NoError(t, json.NewDecoder(r.Body).Decode(&req))
|
||||
assert.Equal(t, "local-model", req.Model)
|
||||
assert.Equal(t, []string{"one", "two"}, req.Input)
|
||||
_, _ = w.Write([]byte(`{
|
||||
"model":"local-model",
|
||||
"data":[
|
||||
{"index":1,"embedding":[3,4]},
|
||||
{"index":0,"embedding":[1,2]}
|
||||
]
|
||||
}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
t.Setenv("DISCRAWL_EMBED_KEY", "secret")
|
||||
|
||||
provider, err := NewProvider(config.EmbeddingsConfig{
|
||||
Provider: ProviderOpenAICompatible,
|
||||
Model: "local-model",
|
||||
BaseURL: server.URL,
|
||||
APIKeyEnv: "DISCRAWL_EMBED_KEY",
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
batch, err := provider.Embed(context.Background(), []string{"one", "two"})
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "local-model", batch.Model)
|
||||
require.Equal(t, 2, batch.Dimensions)
|
||||
require.Equal(t, [][]float32{{1, 2}, {3, 4}}, batch.Vectors)
|
||||
}
|
||||
|
||||
func TestProviderFactoryDefaultsAndValidation(t *testing.T) {
|
||||
t.Setenv("OPENAI_API_KEY", "openai-secret")
|
||||
|
||||
openAI, err := resolveProviderConfig(config.EmbeddingsConfig{
|
||||
Provider: ProviderOpenAI,
|
||||
RequestTimeout: "5s",
|
||||
}, true)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, DefaultOpenAIBaseURL, openAI.BaseURL)
|
||||
require.Equal(t, DefaultOpenAIModel, openAI.Model)
|
||||
require.Equal(t, "openai-secret", openAI.APIKey)
|
||||
|
||||
ollama, err := resolveProviderConfig(config.EmbeddingsConfig{
|
||||
Provider: ProviderOllama,
|
||||
RequestTimeout: "5s",
|
||||
}, true)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, DefaultOllamaBaseURL, ollama.BaseURL)
|
||||
require.Equal(t, DefaultLocalEmbeddingModel, ollama.Model)
|
||||
|
||||
llamaCpp, err := resolveProviderConfig(config.EmbeddingsConfig{
|
||||
Provider: ProviderLlamaCpp,
|
||||
RequestTimeout: "5s",
|
||||
}, true)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, DefaultLlamaCppBaseURL, llamaCpp.BaseURL)
|
||||
|
||||
_, err = resolveProviderConfig(config.EmbeddingsConfig{
|
||||
Provider: ProviderOpenAICompatible,
|
||||
RequestTimeout: "5s",
|
||||
}, true)
|
||||
require.ErrorContains(t, err, "requires base_url")
|
||||
}
|
||||
|
||||
func TestProviderFactoryRequiresOpenAIAPIKey(t *testing.T) {
|
||||
t.Setenv("OPENAI_API_KEY", "")
|
||||
|
||||
_, err := NewProvider(config.EmbeddingsConfig{
|
||||
Provider: ProviderOpenAI,
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.ErrorContains(t, err, "requires API key env OPENAI_API_KEY")
|
||||
}
|
||||
|
||||
func TestProviderFactoryReportsUnsupportedProviderBeforeAPIKey(t *testing.T) {
|
||||
t.Setenv("MISSING_EMBED_KEY", "")
|
||||
|
||||
_, err := NewProvider(config.EmbeddingsConfig{
|
||||
Provider: "bogus",
|
||||
APIKeyEnv: "MISSING_EMBED_KEY",
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.ErrorContains(t, err, "unsupported embedding provider \"bogus\"")
|
||||
}
|
||||
|
||||
func TestCheckProviderProbesLocalProvider(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
assert.Equal(t, "/api/embed", r.URL.Path)
|
||||
_, _ = w.Write([]byte(`{"model":"nomic-embed-text","embeddings":[[1,2]]}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
result := CheckProvider(context.Background(), config.EmbeddingsConfig{
|
||||
Provider: ProviderOllama,
|
||||
Model: "nomic-embed-text",
|
||||
BaseURL: server.URL,
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.Equal(t, "ok", result.Status)
|
||||
require.True(t, result.Probed)
|
||||
require.Empty(t, result.Warning)
|
||||
require.Equal(t, server.URL, result.BaseURL)
|
||||
}
|
||||
|
||||
func TestCheckProviderWarnsOnLocalProbeFailure(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "not ready", http.StatusServiceUnavailable)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
result := CheckProvider(context.Background(), config.EmbeddingsConfig{
|
||||
Provider: ProviderOllama,
|
||||
Model: "nomic-embed-text",
|
||||
BaseURL: server.URL,
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.Equal(t, "warning", result.Status)
|
||||
require.Contains(t, result.Warning, "HTTP 503")
|
||||
require.False(t, result.Probed)
|
||||
}
|
||||
|
||||
func TestProviderExposesRateLimitErrors(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
http.Error(w, "rate limited", http.StatusTooManyRequests)
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
provider, err := NewProvider(config.EmbeddingsConfig{
|
||||
Provider: ProviderOpenAICompatible,
|
||||
Model: "local-model",
|
||||
BaseURL: server.URL,
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = provider.Embed(context.Background(), []string{"one"})
|
||||
require.ErrorContains(t, err, "HTTP 429")
|
||||
require.True(t, IsRateLimitError(err))
|
||||
}
|
||||
|
||||
func TestProviderRejectsInvalidResponses(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte(`{"data":[{"index":0,"embedding":[1]},{"index":1,"embedding":[2,3]}]}`))
|
||||
}))
|
||||
defer server.Close()
|
||||
|
||||
provider, err := NewProvider(config.EmbeddingsConfig{
|
||||
Provider: ProviderOpenAICompatible,
|
||||
Model: "local-model",
|
||||
BaseURL: server.URL,
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
|
||||
_, err = provider.Embed(context.Background(), []string{"one", "two"})
|
||||
require.ErrorContains(t, err, "dimensions mismatch")
|
||||
}
|
||||
|
||||
func TestEmbeddingProvidersHandleEmptyInputsAndIndexErrors(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
settings := providerSettings{
|
||||
Name: ProviderOllama,
|
||||
Model: "model",
|
||||
BaseURL: "http://127.0.0.1:1",
|
||||
MaxInputChars: 10,
|
||||
HTTPClient: http.DefaultClient,
|
||||
}
|
||||
ollama := newOllamaProvider(settings)
|
||||
batch, err := ollama.Embed(context.Background(), nil)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "model", batch.Model)
|
||||
|
||||
settings.Name = ProviderOpenAICompatible
|
||||
openai := newOpenAICompatibleProvider(settings)
|
||||
batch, err = openai.Embed(context.Background(), nil)
|
||||
require.NoError(t, err)
|
||||
require.Equal(t, "model", batch.Model)
|
||||
|
||||
tests := []struct {
|
||||
name string
|
||||
body string
|
||||
inputs []string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "count",
|
||||
body: `{"data":[]}`,
|
||||
inputs: []string{"one"},
|
||||
want: "returned 0 vectors for 1 inputs",
|
||||
},
|
||||
{
|
||||
name: "range",
|
||||
body: `{"data":[{"index":2,"embedding":[1]}]}`,
|
||||
inputs: []string{"one"},
|
||||
want: "index 2 out of range",
|
||||
},
|
||||
{
|
||||
name: "duplicate",
|
||||
body: `{"data":[{"index":0,"embedding":[1]},{"index":0,"embedding":[2]}]}`,
|
||||
inputs: []string{"one", "two"},
|
||||
want: "duplicated index 0",
|
||||
},
|
||||
}
|
||||
for _, tc := range tests {
|
||||
t.Run(tc.name, func(t *testing.T) {
|
||||
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
|
||||
_, _ = w.Write([]byte(tc.body))
|
||||
}))
|
||||
defer server.Close()
|
||||
provider, err := NewProvider(config.EmbeddingsConfig{
|
||||
Provider: ProviderOpenAICompatible,
|
||||
Model: "model",
|
||||
BaseURL: server.URL,
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.NoError(t, err)
|
||||
_, err = provider.Embed(context.Background(), tc.inputs)
|
||||
require.ErrorContains(t, err, tc.want)
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestProviderOptionsAndProbeDecisions(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
client := &http.Client{Timeout: time.Second}
|
||||
settings, err := resolveProviderConfig(config.EmbeddingsConfig{
|
||||
Provider: ProviderOllama,
|
||||
BaseURL: "http://127.0.0.1:11434/",
|
||||
RequestTimeout: "30s",
|
||||
}, true, WithHTTPClient(client), WithRequestTimeout(50*time.Millisecond))
|
||||
require.NoError(t, err)
|
||||
require.Same(t, client, settings.HTTPClient)
|
||||
require.Equal(t, 50*time.Millisecond, settings.Timeout)
|
||||
require.Equal(t, "http://127.0.0.1:11434", settings.BaseURL)
|
||||
require.True(t, shouldProbe(settings))
|
||||
|
||||
require.True(t, isLoopbackBaseURL("http://localhost:8080/v1"))
|
||||
require.True(t, isLoopbackBaseURL("http://[::1]:8080/v1"))
|
||||
require.False(t, isLoopbackBaseURL("https://api.example.com/v1"))
|
||||
require.False(t, isLoopbackBaseURL("://bad"))
|
||||
require.False(t, shouldProbe(providerSettings{Name: ProviderOpenAI}))
|
||||
require.True(t, shouldProbe(providerSettings{Name: ProviderOpenAICompatible, BaseURL: "http://localhost:8080/v1"}))
|
||||
require.False(t, shouldProbe(providerSettings{Name: ProviderOpenAICompatible, BaseURL: "https://api.example.com/v1"}))
|
||||
}
|
||||
|
||||
func TestCheckProviderSkipsRemoteCompatibleProbe(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
result := CheckProvider(context.Background(), config.EmbeddingsConfig{
|
||||
Provider: ProviderOpenAICompatible,
|
||||
Model: "remote-model",
|
||||
BaseURL: "https://api.example.com/v1",
|
||||
RequestTimeout: "5s",
|
||||
})
|
||||
require.Equal(t, "ok", result.Status)
|
||||
require.False(t, result.Probed)
|
||||
require.Empty(t, result.Warning)
|
||||
}
|
||||
@ -7,7 +7,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
// DigestOptions controls how a Digest is built.
|
||||
|
||||
@ -8,7 +8,7 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestBuildDigest(t *testing.T) {
|
||||
|
||||
@ -8,7 +8,7 @@ import (
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
// QuietOptions controls how a Quiet report is built.
|
||||
|
||||
@ -8,7 +8,7 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestBuildQuiet(t *testing.T) {
|
||||
|
||||
@ -14,7 +14,7 @@ import (
|
||||
"text/template"
|
||||
"time"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
const (
|
||||
|
||||
@ -10,7 +10,7 @@ import (
|
||||
|
||||
"github.com/stretchr/testify/require"
|
||||
|
||||
"github.com/openclaw/discrawl/internal/store"
|
||||
"github.com/steipete/discrawl/internal/store"
|
||||
)
|
||||
|
||||
func TestBuildRenderAndUpdateReadme(t *testing.T) {
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user