docs/scripts/docs-site/source-index.mjs
2026-05-06 03:51:08 +01:00

257 lines
7.3 KiB
JavaScript

#!/usr/bin/env node
import { execFileSync } from "node:child_process";
import fs from "node:fs";
import path from "node:path";
// All paths are resolved against the directory the script is launched from.
const root = process.cwd();
const outDir = path.join(root, "dist", "docs-site");
const sourceMetaPath = path.join(root, ".openclaw-sync", "source.json");
const defaultRepoUrl = "https://github.com/openclaw/openclaw";

// Limits: largest file that is read, longest excerpt stored per record,
// and the total size the JSONL index may reach.
const maxFileBytes = 180_000;
const maxSearchChars = 600;
const maxIndexBytes = 18 * 1024 * 1024;

// File extensions eligible for indexing.
const includeExts = new Set([
  ".cjs", ".css", ".go", ".gql", ".graphql", ".html", ".java",
  ".js", ".json", ".jsonc", ".jsx", ".kt", ".mjs", ".py",
  ".rb", ".rs", ".scss", ".sh", ".sql", ".swift", ".toml",
  ".ts", ".tsx", ".vue", ".yaml", ".yml",
]);

// Repository-relative path prefixes that are never indexed.
const excludedPrefixes = [".git/", ".github/codeql/", "docs/", "node_modules/", "vendor/"];

// A path is skipped when any of its segments equals one of these names.
const excludedParts = new Set([
  "__fixtures__", "__snapshots__", ".next", ".turbo", "coverage",
  "dist", "generated", "node_modules", "snapshots",
]);

// Lockfiles are skipped by basename, wherever they live.
const excludedFiles = new Set(["package-lock.json", "pnpm-lock.yaml", "yarn.lock"]);
// Main script: build dist/docs-site/source-index.jsonl (one JSON record per
// indexed source file) plus a source-index-meta.json summary.
fs.mkdirSync(outDir, { recursive: true });
const sourceMeta = readJson(sourceMetaPath) ?? {};
const sourceDir = resolveSourceDir();
const outPath = path.join(outDir, "source-index.jsonl");
const metaPath = path.join(outDir, "source-index-meta.json");
if (!sourceDir) {
  // An explicitly configured checkout that cannot be used is an error; when no
  // checkout was configured, an empty index is written and the script exits 0.
  if (process.env.DOCS_SOURCE_REPO_DIR) {
    throw new Error(`DOCS_SOURCE_REPO_DIR not found: ${process.env.DOCS_SOURCE_REPO_DIR}`);
  }
  writeEmptyIndex("source checkout not found");
  process.exit(0);
}
// `||` instead of `??`: an empty string (for example an environment variable
// that is set but blank, or a failed remote lookup) must fall through to the
// next source rather than produce an empty repository URL or commit id.
const repoUrl = normalizeRepoUrl(process.env.DOCS_SOURCE_REPO_URL || repoUrlFromGit(sourceDir) || defaultRepoUrl);
const sourceSha = process.env.DOCS_SOURCE_SHA || sourceMeta.sha || git(sourceDir, ["rev-parse", "HEAD"]);
// `-z` makes git emit NUL-terminated, unquoted paths. Without it, paths with
// unusual or non-ASCII characters are C-quoted and no longer match real files.
const files = git(sourceDir, ["ls-files", "-z"])
  .split("\0")
  .filter(Boolean)
  .filter(shouldIndexFile)
  .sort(compareFilePriority);
let bytes = 0;
let recordCount = 0;
let skippedLarge = 0;
let skippedBudget = 0;
const output = fs.createWriteStream(outPath, { encoding: "utf8" });
for (const rel of files) {
  const full = path.join(sourceDir, rel);
  let stat;
  try {
    stat = fs.statSync(full);
  } catch {
    // Listed by git but cannot be stat'ed in the working tree: skip it.
    continue;
  }
  if (!stat.isFile()) continue;
  if (stat.size > maxFileBytes) {
    skippedLarge += 1;
    continue;
  }
  const text = fs.readFileSync(full, "utf8");
  // Skip files containing NUL characters and files that are only whitespace.
  if (text.includes("\0") || !text.trim()) continue;
  const search = searchTextForFile(rel, text);
  if (!search) continue;
  // encodeURI leaves "?" and "#" untouched, which would turn the rest of the
  // path into a query string or fragment; escape those two explicitly.
  const encodedPath = encodeURI(rel).replace(/[?#]/g, (ch) => encodeURIComponent(ch));
  const record = {
    path: rel,
    url: `${repoUrl}/blob/${sourceSha}/${encodedPath}`,
    rawUrl: rawUrlFor(repoUrl, sourceSha, rel),
    commit: `${repoUrl}/commit/${sourceSha}`,
    search,
  };
  const line = `${JSON.stringify(record)}\n`;
  const lineBytes = Buffer.byteLength(line);
  if (bytes + lineBytes > maxIndexBytes) {
    // Keep iterating: a later, smaller record may still fit in the budget.
    skippedBudget += 1;
    continue;
  }
  output.write(line);
  bytes += lineBytes;
  recordCount += 1;
}
// Wait for the stream to finish before writing the summary.
await new Promise((resolve) => output.end(resolve));
const meta = {
  repository: sourceMeta.repository ?? "openclaw/openclaw",
  repoUrl,
  sha: sourceSha,
  sourceDir: path.relative(root, sourceDir),
  records: recordCount,
  bytes,
  filesConsidered: files.length,
  skippedLarge,
  skippedBudget,
  generatedAt: new Date().toISOString(),
};
fs.writeFileSync(metaPath, `${JSON.stringify(meta, null, 2)}\n`, "utf8");
console.log(`indexed ${recordCount} source files from ${files.length} files (${Math.round(bytes / 1024)} KiB)`);
if (skippedLarge || skippedBudget) {
  console.log(`source index skips: large=${skippedLarge} budget=${skippedBudget}`);
}
/**
 * Find a git checkout of the source repository to index.
 *
 * Tries DOCS_SOURCE_REPO_DIR first (when set), then `source` under the
 * working directory, then a fixed list of sibling directories. A candidate is
 * accepted when it contains `.git` and `git ls-files src` lists more than 100
 * entries.
 *
 * @returns {string | undefined} Absolute path of the first accepted checkout,
 *   or undefined when none qualifies.
 */
function resolveSourceDir() {
  const searchPaths = [process.env.DOCS_SOURCE_REPO_DIR, path.join(root, "source")];
  for (const sibling of ["openclaw-source", "openclaw", "clawdbot5", "clawdbot"]) {
    searchPaths.push(path.join(root, "..", sibling));
  }

  for (const entry of searchPaths) {
    if (!entry) continue;
    const checkout = path.resolve(entry);
    const hasGitDir = fs.existsSync(path.join(checkout, ".git"));
    if (!hasGitDir) continue;

    let listing;
    try {
      listing = git(checkout, ["ls-files", "src"]);
    } catch {
      // Try the next candidate.
      continue;
    }
    const trackedCount = listing.split("\n").filter(Boolean).length;
    if (trackedCount > 100) return checkout;
  }
  return undefined;
}
/**
 * Decide whether a repository-relative path belongs in the index.
 *
 * @param {string} rel Path relative to the repository root, "/"-separated.
 * @returns {boolean} True when the path passes every exclusion and either has
 *   an included extension or is one of the allowed root-level files.
 */
function shouldIndexFile(rel) {
  const base = path.basename(rel);
  const isRejected =
    excludedFiles.has(base) ||
    ["AGENTS.md", "CLAUDE.md"].includes(base) ||
    excludedPrefixes.some((prefix) => rel.startsWith(prefix)) ||
    rel.split("/").some((part) => excludedParts.has(part));
  if (isRejected) return false;

  if (includeExts.has(path.extname(rel))) return true;

  // Files without an included extension are only accepted at the repo root.
  if (rel.includes("/")) return false;
  const rootLevelAllowed = ["Dockerfile", "Makefile", "README.md", "CHANGELOG.md", "CONTRIBUTING.md", "SECURITY.md", "VISION.md"];
  return rootLevelAllowed.includes(base);
}
/**
 * Sort comparator for repository-relative paths: lower priority rank first,
 * ties broken by locale string comparison of the paths.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Negative, zero, or positive, as Array.prototype.sort expects.
 */
function compareFilePriority(a, b) {
  const rankDelta = filePriority(a) - filePriority(b);
  if (rankDelta !== 0) return rankDelta;
  return a.localeCompare(b);
}
/**
 * Rank a repository-relative path by its top-level directory; lower ranks
 * sort first.
 *
 * @param {string} rel Path relative to the repository root, "/"-separated.
 * @returns {number} 0-4 for the recognised directory groups, 5 for any other
 *   nested path, and 1 for files at the repository root.
 */
function filePriority(rel) {
  // Kept inside the function: it is called through hoisting before any
  // module-level statement placed here would have run.
  const tiers = [
    [/^(src|extensions|packages)\//, 0],
    [/^(apps|ui|scripts|skills|config)\//, 1],
    [/^\.github\//, 2],
    [/^(qa|security)\//, 3],
    [/^(test|patches)\//, 4],
  ];
  const hit = tiers.find(([pattern]) => pattern.test(rel));
  if (hit) return hit[1];
  if (rel.includes("/")) return 5;
  return 1;
}
/**
 * Build the short search excerpt stored for a file.
 *
 * Every non-blank line among the first 40 lines is kept, and after that only
 * lines accepted by isSearchSignal. Each kept line is trimmed and prefixed
 * with its 1-based line number. Collection stops once the excerpt reaches
 * maxSearchChars, and the result is cut to that length.
 *
 * @param {string} rel Repository-relative path (currently unused; kept for callers).
 * @param {string} text Full file contents.
 * @returns {string} Newline-joined excerpt, possibly empty.
 */
function searchTextForFile(rel, text) {
  const lines = text.replace(/\r\n/g, "\n").split("\n");
  const chosen = [];
  // Length of chosen.join("\n"), tracked incrementally instead of re-joining
  // the whole list after every line.
  let joinedLength = 0;
  for (let i = 0; i < lines.length; i += 1) {
    const line = lines[i].trim();
    if (!line) continue;
    if (i < 40 || isSearchSignal(line)) {
      const entry = `${i + 1}: ${line}`;
      joinedLength += entry.length + (chosen.length > 0 ? 1 : 0);
      chosen.push(entry);
      if (joinedLength >= maxSearchChars) break;
    }
  }
  let excerpt = chosen.join("\n").slice(0, maxSearchChars);
  // slice() counts UTF-16 code units, so the cut can land between the two
  // halves of a surrogate pair; drop a dangling high surrogate so the excerpt
  // stays well-formed text.
  const lastUnit = excerpt.charCodeAt(excerpt.length - 1);
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) {
    excerpt = excerpt.slice(0, -1);
  }
  return excerpt.trim();
}
/**
 * Report whether a trimmed source line starts with a construct worth keeping
 * in the search excerpt: markdown-style headings, import/export and
 * declaration keywords, test-framework calls, and a few YAML keys.
 *
 * `on:` and `jobs:` also match when nothing follows the colon. Callers pass
 * trimmed lines, so a bare `jobs:` key never has trailing whitespace and
 * would not match if whitespace were required.
 *
 * @param {string} line A single line with surrounding whitespace removed.
 * @returns {boolean}
 */
function isSearchSignal(line) {
  return /^(#{1,4}\s|import\s|export\s|module\.exports|async\s+function\s|function\s|class\s|interface\s|type\s|enum\s|const\s|let\s|var\s|def\s|func\s|struct\s|protocol\s|extension\s|describe\s*\(|it\s*\(|test\s*\(|name:\s|command:\s|(?:on|jobs):(?:\s|$))/.test(line);
}
/**
 * Build the raw.githubusercontent.com URL for a file at a given commit.
 *
 * @param {string} repoUrl Repository URL; only `https://github.com/<owner>/<repo>` is supported.
 * @param {string} sha Commit id placed in the URL.
 * @param {string} rel Repository-relative file path.
 * @returns {string} The raw URL, or "" when repoUrl is not in the supported form.
 */
function rawUrlFor(repoUrl, sha, rel) {
  const match = repoUrl.match(/^https:\/\/github\.com\/([^/]+)\/([^/]+)$/);
  if (!match) return "";
  const [, owner, repo] = match;
  // encodeURI leaves "?" and "#" untouched, which would turn the rest of the
  // path into a query string or fragment; escape those two explicitly.
  const encodedPath = encodeURI(rel).replace(/[?#]/g, (ch) => encodeURIComponent(ch));
  return `https://raw.githubusercontent.com/${owner}/${repo}/${sha}/${encodedPath}`;
}
/**
 * Derive a language label from a file path.
 *
 * @param {string} rel File path.
 * @returns {string} The extension without its leading dot, "dockerfile" for a
 *   file named Dockerfile, otherwise "text".
 */
function languageForPath(rel) {
  const extension = path.extname(rel);
  // path.extname yields "" or a string starting with "."; a lone "." carries no label.
  if (extension.length > 1) return extension.slice(1);
  return path.basename(rel) === "Dockerfile" ? "dockerfile" : "text";
}
/**
 * Read and parse a JSON file, treating every failure as "no data".
 *
 * @param {string} file Path of the file to read.
 * @returns {*} The parsed value, or null when the file cannot be read or parsed.
 */
function readJson(file) {
  let parsed = null;
  try {
    const raw = fs.readFileSync(file, { encoding: "utf8" });
    parsed = JSON.parse(raw);
  } catch {
    // Best effort: a missing or malformed file is reported as null.
    parsed = null;
  }
  return parsed;
}
/**
 * Write an empty index file and a minimal metadata file, then warn on stderr.
 *
 * @param {string} reason Explanation recorded in the metadata and in the warning.
 */
function writeEmptyIndex(reason) {
  // Overwrite any existing index with an empty file.
  fs.writeFileSync(outPath, "", "utf8");

  const placeholder = { records: 0, reason, generatedAt: new Date().toISOString() };
  const serialized = JSON.stringify(placeholder, null, 2);
  fs.writeFileSync(metaPath, `${serialized}\n`, "utf8");

  console.warn(`source index skipped: ${reason}`);
}
/**
 * Read the fetch URL of a checkout's remote from `git remote -v`.
 *
 * Prefers the `origin` remote and otherwise takes the first fetch remote.
 *
 * @param {string} dir Path of the git checkout.
 * @returns {string | undefined} The remote URL as git reports it, or undefined
 *   when there is no fetch remote or git fails. Undefined (not "") is used so
 *   that a caller's `??` fallback to a default URL actually takes effect.
 */
function repoUrlFromGit(dir) {
  let remotes;
  try {
    remotes = git(dir, ["remote", "-v"]).split("\n");
  } catch {
    // Best effort: an unreadable remote list simply means "unknown".
    return undefined;
  }
  const isFetch = (line) => line.includes("(fetch)");
  const origin = remotes.find((line) => line.startsWith("origin\t") && isFetch(line)) ?? remotes.find(isFetch);
  // Lines look like "<name>\t<url> (fetch)"; the URL is the second field.
  const url = origin?.split(/\s+/)[1];
  return url || undefined;
}
/**
 * Normalise a git remote URL into a plain https repository URL.
 *
 * Trims whitespace, rewrites GitHub SSH forms to https, removes embedded
 * credentials from http(s) URLs, and strips trailing slashes and a trailing
 * `.git`.
 *
 * @param {*} value Remote URL; converted with String().
 * @returns {string} The normalised URL.
 */
function normalizeRepoUrl(value) {
  return String(value)
    .trim()
    .replace(/^git@github\.com:/, "https://github.com/")
    .replace(/^ssh:\/\/git@github\.com\//, "https://github.com/")
    // The result is written into the generated index, so never carry "user:token@" along.
    .replace(/^(https?:\/\/)[^/@]+@/, "$1")
    // Trailing slashes go first; otherwise ".git/" would hide the suffix from the next step.
    .replace(/\/+$/, "")
    .replace(/\.git$/, "")
    .replace(/\/+$/, "");
}
/**
 * Run a git command inside a directory and return its trimmed stdout.
 *
 * @param {string} dir Directory passed to `git -C`.
 * @param {string[]} args Arguments after `git -C <dir>`.
 * @returns {string} Standard output with surrounding whitespace removed.
 * @throws {Error} When git cannot be started or exits with a non-zero status.
 */
function git(dir, args) {
  const stdout = execFileSync("git", ["-C", dir, ...args], {
    encoding: "utf8",
    // The default cap is 1 MiB; `ls-files` on a large repository can exceed it
    // and fail with ENOBUFS, so allow considerably more output.
    maxBuffer: 64 * 1024 * 1024,
  });
  return stdout.trim();
}