From 403b82445d6cc34d64c0e9b8992d8816503064cf Mon Sep 17 00:00:00 2001
From: Peter Steinberger
Date: Wed, 6 May 2026 03:50:42 +0100
Subject: [PATCH] feat: publish source search index

---
 .github/workflows/pages.yml        |  22 +++
 package.json                       |   2 +-
 scripts/docs-site/source-index.mjs | 407 +++++++++++++++++++++++++++++
 3 files changed, 430 insertions(+), 1 deletion(-)
 create mode 100644 scripts/docs-site/source-index.mjs

diff --git a/.github/workflows/pages.yml b/.github/workflows/pages.yml
index 16587cd29..654f97995 100644
--- a/.github/workflows/pages.yml
+++ b/.github/workflows/pages.yml
@@ -34,6 +34,25 @@ jobs:
       - name: Check out
         uses: actions/checkout@v6
 
+      - name: Read source metadata
+        id: source-meta
+        run: |
+          node - <<'NODE'
+          const fs = require("node:fs");
+          const data = JSON.parse(fs.readFileSync(".openclaw-sync/source.json", "utf8"));
+          if (!data.repository || !data.sha) throw new Error("invalid .openclaw-sync/source.json");
+          fs.appendFileSync(process.env.GITHUB_OUTPUT, `repository=${data.repository}\n`);
+          fs.appendFileSync(process.env.GITHUB_OUTPUT, `sha=${data.sha}\n`);
+          NODE
+
+      - name: Check out OpenClaw source
+        uses: actions/checkout@v6
+        with:
+          repository: ${{ steps.source-meta.outputs.repository }}
+          ref: ${{ steps.source-meta.outputs.sha }}
+          path: source
+          fetch-depth: 1
+
       - name: Set up Node
         uses: actions/setup-node@v6
         with:
@@ -46,6 +65,9 @@ jobs:
       - name: Build
         env:
           DOCS_SITE_CNAME: documentation.openclaw.ai
+          DOCS_SOURCE_REPO_DIR: source
+          DOCS_SOURCE_REPO_URL: https://github.com/${{ steps.source-meta.outputs.repository }}
+          DOCS_SOURCE_SHA: ${{ steps.source-meta.outputs.sha }}
         run: npm run docs:build
 
       - name: Smoke generated site
diff --git a/package.json b/package.json
index f84702a64..59df9351f 100644
--- a/package.json
+++ b/package.json
@@ -3,7 +3,7 @@
   "private": true,
   "type": "module",
   "scripts": {
-    "docs:build": "node scripts/docs-site/build.mjs && pagefind --site dist/docs-site --output-path dist/docs-site/pagefind",
+    "docs:build": "node scripts/docs-site/build.mjs && node scripts/docs-site/source-index.mjs && pagefind --site dist/docs-site --output-path dist/docs-site/pagefind",
     "docs:smoke": "node scripts/docs-site/smoke.mjs",
     "docs:check": "npm run docs:build && npm run docs:smoke"
   },
diff --git a/scripts/docs-site/source-index.mjs b/scripts/docs-site/source-index.mjs
new file mode 100644
index 000000000..ece1a1e80
--- /dev/null
+++ b/scripts/docs-site/source-index.mjs
@@ -0,0 +1,407 @@
+#!/usr/bin/env node
+// Builds dist/docs-site/source-index.jsonl, a JSON-lines search index of the
+// tracked files in an OpenClaw source checkout, plus source-index-meta.json
+// describing the run. The docs:build npm script invokes it between build.mjs
+// and pagefind.
+//
+// Environment overrides (an empty value is treated as unset):
+//   DOCS_SOURCE_REPO_DIR  checkout to index
+//   DOCS_SOURCE_REPO_URL  repository URL used to build links
+//   DOCS_SOURCE_SHA       commit used to build links
+import { execFileSync } from "node:child_process";
+import fs from "node:fs";
+import path from "node:path";
+
+const root = process.cwd();
+const outDir = path.join(root, "dist", "docs-site");
+const sourceMetaPath = path.join(root, ".openclaw-sync", "source.json");
+const defaultRepoUrl = "https://github.com/openclaw/openclaw";
+// Files larger than this are skipped and counted in skippedLarge.
+const maxFileBytes = 180_000;
+// Upper bound on the length of each record's search excerpt.
+const maxSearchChars = 600;
+// Size budget for the whole index; records that do not fit count as skippedBudget.
+const maxIndexBytes = 18 * 1024 * 1024;
+// git ls-files on a large checkout can exceed execFileSync's 1 MiB default.
+const gitMaxBuffer = 64 * 1024 * 1024;
+
+const includeExts = new Set([
+  ".cjs",
+  ".css",
+  ".go",
+  ".gql",
+  ".graphql",
+  ".html",
+  ".java",
+  ".js",
+  ".json",
+  ".jsonc",
+  ".jsx",
+  ".kt",
+  ".mjs",
+  ".py",
+  ".rb",
+  ".rs",
+  ".scss",
+  ".sh",
+  ".sql",
+  ".swift",
+  ".toml",
+  ".ts",
+  ".tsx",
+  ".vue",
+  ".yaml",
+  ".yml",
+]);
+
+const excludedPrefixes = [
+  ".git/",
+  ".github/codeql/",
+  "docs/",
+  "node_modules/",
+  "vendor/",
+];
+
+const excludedParts = new Set([
+  "__fixtures__",
+  "__snapshots__",
+  ".next",
+  ".turbo",
+  "coverage",
+  "dist",
+  "generated",
+  "node_modules",
+  "snapshots",
+]);
+
+const excludedFiles = new Set([
+  "package-lock.json",
+  "pnpm-lock.yaml",
+  "yarn.lock",
+]);
+
+fs.mkdirSync(outDir, { recursive: true });
+
+const sourceMeta = readJson(sourceMetaPath) ?? {};
+const sourceDir = resolveSourceDir();
+const outPath = path.join(outDir, "source-index.jsonl");
+const metaPath = path.join(outDir, "source-index-meta.json");
+
+if (!sourceDir) {
+  if (process.env.DOCS_SOURCE_REPO_DIR) {
+    throw new Error(`DOCS_SOURCE_REPO_DIR not found: ${process.env.DOCS_SOURCE_REPO_DIR}`);
+  }
+  writeEmptyIndex("source checkout not found");
+  process.exit(0);
+}
+
+// Use || rather than ??: an environment variable that is set but empty must
+// not win over the fallbacks and produce links without a repository or commit.
+const repoUrl = normalizeRepoUrl(
+  process.env.DOCS_SOURCE_REPO_URL || repoUrlFromGit(sourceDir) || defaultRepoUrl,
+);
+const sourceSha = process.env.DOCS_SOURCE_SHA || sourceMeta.sha || git(sourceDir, ["rev-parse", "HEAD"]);
+// -z gives NUL-separated, unquoted paths; without it git quotes and escapes
+// names that contain unusual characters, so they would not match files on disk.
+const files = git(sourceDir, ["ls-files", "-z"])
+  .split("\0")
+  .filter(Boolean)
+  .filter(shouldIndexFile)
+  .sort(compareFilePriority);
+
+let bytes = 0;
+let recordCount = 0;
+let skippedLarge = 0;
+let skippedBudget = 0;
+const output = fs.createWriteStream(outPath, { encoding: "utf8" });
+
+for (const rel of files) {
+  const full = path.join(sourceDir, rel);
+  let stat;
+  try {
+    stat = fs.statSync(full);
+  } catch {
+    // Not readable on disk (for example tracked but deleted locally); skip it.
+    continue;
+  }
+  if (!stat.isFile()) continue;
+  if (stat.size > maxFileBytes) {
+    skippedLarge += 1;
+    continue;
+  }
+  const text = fs.readFileSync(full, "utf8");
+  // Skip files containing NUL (treated as binary) and files that are blank.
+  if (text.includes("\0") || !text.trim()) continue;
+  const search = searchTextForFile(rel, text);
+  if (!search) continue;
+  const record = {
+    path: rel,
+    url: `${repoUrl}/blob/${sourceSha}/${encodeRepoPath(rel)}`,
+    rawUrl: rawUrlFor(repoUrl, sourceSha, rel),
+    commit: `${repoUrl}/commit/${sourceSha}`,
+    search,
+  };
+  const line = `${JSON.stringify(record)}\n`;
+  const lineBytes = Buffer.byteLength(line);
+  if (bytes + lineBytes > maxIndexBytes) {
+    // Keep scanning: a later, smaller record may still fit in the budget.
+    skippedBudget += 1;
+    continue;
+  }
+  output.write(line);
+  bytes += lineBytes;
+  recordCount += 1;
+}
+
+await new Promise((resolve) => output.end(resolve));
+
+const meta = {
+  repository: sourceMeta.repository ?? "openclaw/openclaw",
+  repoUrl,
+  sha: sourceSha,
+  sourceDir: path.relative(root, sourceDir),
+  records: recordCount,
+  bytes,
+  filesConsidered: files.length,
+  skippedLarge,
+  skippedBudget,
+  generatedAt: new Date().toISOString(),
+};
+fs.writeFileSync(metaPath, `${JSON.stringify(meta, null, 2)}\n`, "utf8");
+console.log(`indexed ${recordCount} source files from ${files.length} files (${Math.round(bytes / 1024)} KiB)`);
+if (skippedLarge || skippedBudget) {
+  console.log(`source index skips: large=${skippedLarge} budget=${skippedBudget}`);
+}
+
+/**
+ * Finds the source checkout to index.
+ *
+ * Tries DOCS_SOURCE_REPO_DIR first, then ./source and a few sibling
+ * directories. A candidate qualifies when it has a .git entry and git lists
+ * more than 100 tracked files under its src/ directory.
+ *
+ * @returns {string | undefined} Absolute checkout path, or undefined when no
+ *   candidate qualifies.
+ */
+function resolveSourceDir() {
+  const candidates = [
+    process.env.DOCS_SOURCE_REPO_DIR,
+    path.join(root, "source"),
+    path.join(root, "..", "openclaw-source"),
+    path.join(root, "..", "openclaw"),
+    path.join(root, "..", "clawdbot5"),
+    path.join(root, "..", "clawdbot"),
+  ].filter(Boolean);
+  for (const candidate of candidates) {
+    const full = path.resolve(candidate);
+    if (!fs.existsSync(path.join(full, ".git"))) continue;
+    try {
+      const files = git(full, ["ls-files", "src"]).split("\n").filter(Boolean);
+      if (files.length > 100) return full;
+    } catch {
+      // Try the next candidate.
+    }
+  }
+  return undefined;
+}
+
+/**
+ * Decides whether a tracked file belongs in the index.
+ *
+ * @param {string} rel Repository-relative path with "/" separators.
+ * @returns {boolean} True when the path is not excluded by name, prefix, or
+ *   path segment and either has an included extension or is one of the
+ *   listed top-level project files.
+ */
+function shouldIndexFile(rel) {
+  const base = path.basename(rel);
+  if (excludedFiles.has(base)) return false;
+  if (["AGENTS.md", "CLAUDE.md"].includes(base)) return false;
+  if (excludedPrefixes.some((prefix) => rel.startsWith(prefix))) return false;
+  if (rel.split("/").some((part) => excludedParts.has(part))) return false;
+  if (includeExts.has(path.extname(rel))) return true;
+  return !rel.includes("/") && ["Dockerfile", "Makefile", "README.md", "CHANGELOG.md", "CONTRIBUTING.md", "SECURITY.md", "VISION.md"].includes(base);
+}
+
+/**
+ * Sort comparator: lower filePriority first, ties broken by localeCompare.
+ *
+ * @param {string} a Repository-relative path.
+ * @param {string} b Repository-relative path.
+ * @returns {number} Negative, zero, or positive, as Array.prototype.sort expects.
+ */
+function compareFilePriority(a, b) {
+  return filePriority(a) - filePriority(b) || a.localeCompare(b);
+}
+
+/**
+ * Ranks a path so that earlier-ranked files claim the index size budget first.
+ *
+ * @param {string} rel Repository-relative path.
+ * @returns {number} 0 (indexed first) through 5 (indexed last); top-level
+ *   files rank 1.
+ */
+function filePriority(rel) {
+  if (/^(src|extensions|packages)\//.test(rel)) return 0;
+  if (/^(apps|ui|scripts|skills|config)\//.test(rel)) return 1;
+  if (/^\.github\//.test(rel)) return 2;
+  if (/^(qa|security)\//.test(rel)) return 3;
+  if (/^(test|patches)\//.test(rel)) return 4;
+  return rel.includes("/") ? 5 : 1;
+}
+
+/**
+ * Builds the search excerpt for a file: the non-blank lines among the first
+ * 40, plus any later line that isSearchSignal accepts, each prefixed with its
+ * 1-based line number.
+ *
+ * @param {string} rel Repository-relative path (currently unused).
+ * @param {string} text File contents.
+ * @returns {string} Excerpt of at most maxSearchChars characters; empty when
+ *   no line was selected.
+ */
+function searchTextForFile(rel, text) {
+  const lines = text.replace(/\r\n/g, "\n").split("\n");
+  const chosen = [];
+  // Length of chosen.join("\n"), tracked incrementally instead of re-joining per line.
+  let joinedLength = 0;
+  for (let i = 0; i < lines.length; i += 1) {
+    const line = lines[i].trim();
+    if (!line) continue;
+    if (i < 40 || isSearchSignal(line)) {
+      const entry = `${i + 1}: ${line}`;
+      joinedLength += entry.length + (chosen.length > 0 ? 1 : 0);
+      chosen.push(entry);
+    }
+    if (joinedLength >= maxSearchChars) break;
+  }
+  return chosen.join("\n").slice(0, maxSearchChars).trim();
+}
+
+/**
+ * Tests whether a trimmed line starts with a recognized prefix: 1-4 "#", an
+ * import/export, a declaration keyword, a test call, or a listed YAML key.
+ *
+ * @param {string} line Trimmed source line.
+ * @returns {boolean} True when the line should be kept in the search excerpt.
+ */
+function isSearchSignal(line) {
+  return /^(#{1,4}\s|import\s|export\s|module\.exports|async\s+function\s|function\s|class\s|interface\s|type\s|enum\s|const\s|let\s|var\s|def\s|func\s|struct\s|protocol\s|extension\s|describe\s*\(|it\s*\(|test\s*\(|name:\s|command:\s|on:(?:\s|$)|jobs:(?:\s|$))/.test(line);
+}
+
+/**
+ * Percent-encodes a repository path for use in a URL, keeping "/" as the
+ * separator. Unlike encodeURI, this also escapes "#" and "?", which would
+ * otherwise be read as the start of a fragment or query.
+ *
+ * @param {string} rel Repository-relative path.
+ * @returns {string} Encoded path.
+ */
+function encodeRepoPath(rel) {
+  return rel.split("/").map(encodeURIComponent).join("/");
+}
+
+/**
+ * Builds the raw.githubusercontent.com URL for a file.
+ *
+ * @param {string} repoUrl Normalized repository URL.
+ * @param {string} sha Commit the URL should point at.
+ * @param {string} rel Repository-relative path.
+ * @returns {string} Raw file URL, or "" when repoUrl is not of the form
+ *   https://github.com/<owner>/<repo>.
+ */
+function rawUrlFor(repoUrl, sha, rel) {
+  const match = repoUrl.match(/^https:\/\/github\.com\/([^/]+)\/([^/]+)$/);
+  if (!match) return "";
+  return `https://raw.githubusercontent.com/${match[1]}/${match[2]}/${sha}/${encodeRepoPath(rel)}`;
+}
+
+/**
+ * Derives a language label from a path. Not called anywhere in this script.
+ *
+ * @param {string} rel Repository-relative path.
+ * @returns {string} The extension without its dot, "dockerfile" for a file
+ *   named Dockerfile, otherwise "text".
+ */
+function languageForPath(rel) {
+  const ext = path.extname(rel).replace(/^\./, "");
+  if (ext) return ext;
+  if (path.basename(rel) === "Dockerfile") return "dockerfile";
+  return "text";
+}
+
+/**
+ * Reads and parses a JSON file.
+ *
+ * @param {string} file Path of the file to read.
+ * @returns {any} Parsed value, or null when the file cannot be read or parsed.
+ */
+function readJson(file) {
+  try {
+    return JSON.parse(fs.readFileSync(file, "utf8"));
+  } catch {
+    return null;
+  }
+}
+
+/**
+ * Writes an empty index and a meta file that records why nothing was indexed.
+ *
+ * @param {string} reason Explanation stored in the meta file and logged.
+ */
+function writeEmptyIndex(reason) {
+  fs.writeFileSync(outPath, "", "utf8");
+  fs.writeFileSync(metaPath, `${JSON.stringify({ records: 0, reason, generatedAt: new Date().toISOString() }, null, 2)}\n`, "utf8");
+  console.warn(`source index skipped: ${reason}`);
+}
+
+/**
+ * Reads a remote URL from the checkout, preferring the fetch URL of origin
+ * over that of any other remote.
+ *
+ * @param {string} dir Checkout directory.
+ * @returns {string | undefined} Remote URL, or undefined when there is no
+ *   remote or git fails, so callers can fall back to a default.
+ */
+function repoUrlFromGit(dir) {
+  try {
+    const remotes = git(dir, ["remote", "-v"]).split("\n");
+    const origin =
+      remotes.find((line) => line.startsWith("origin\t") && line.includes("(fetch)")) ??
+      remotes.find((line) => line.includes("(fetch)"));
+    return origin?.split(/\s+/)[1];
+  } catch {
+    return undefined;
+  }
+}
+
+/**
+ * Normalizes a git remote into a browsable URL: drops trailing slashes and a
+ * ".git" suffix, and rewrites GitHub SSH remotes to https://github.com/.
+ *
+ * @param {string} value Remote URL in HTTPS, scp-like, or ssh:// form.
+ * @returns {string} Normalized URL without a trailing slash.
+ */
+function normalizeRepoUrl(value) {
+  return String(value)
+    .trim()
+    .replace(/\/+$/, "")
+    .replace(/\.git$/, "")
+    .replace(/^git@github\.com:/, "https://github.com/")
+    .replace(/^ssh:\/\/git@github\.com\//, "https://github.com/");
+}
+
+/**
+ * Runs git in a directory and returns its trimmed standard output.
+ *
+ * @param {string} dir Directory passed to git -C.
+ * @param {string[]} args Git arguments.
+ * @returns {string} Trimmed stdout.
+ * @throws {Error} When git cannot be started or exits with a non-zero status.
+ */
+function git(dir, args) {
+  return execFileSync("git", ["-C", dir, ...args], {
+    encoding: "utf8",
+    maxBuffer: gitMaxBuffer,
+  }).trim();
+}