fix(docs): remove full LLM corpus endpoint

This commit is contained in:
Vincent Koc 2026-05-07 20:06:20 -07:00
parent 590b289f95
commit 691ce7cbe3
No known key found for this signature in database
5 changed files with 123 additions and 14 deletions

View File

@ -99,6 +99,34 @@ jobs:
if (!/^---\n/m.test(text)) throw new Error(`${path}: markdown frontmatter missing`);
console.log(`${path}: ok (markdown)`);
};
// Fetches a text resource relative to baseUrl and verifies status, content type and body.
// - path: request path, resolved against baseUrl.
// - pattern: RegExp the response body must match.
// Throws an Error prefixed with the path when any of the checks fails.
const assertText = async (path, pattern) => {
  const target = new URL(path, baseUrl);
  // Tag the request with the commit SHA so each deploy uses a distinct URL.
  target.searchParams.set("_openclaw_smoke", process.env.GITHUB_SHA);
  const noCacheHeaders = {
    "cache-control": "no-cache",
    pragma: "no-cache",
  };
  const res = await fetch(target, { headers: noCacheHeaders });
  if (!res.ok) {
    throw new Error(`${path}: text HTTP ${res.status}`);
  }
  const body = await res.text();
  // A missing content-type header is treated as an empty string and rejected below.
  const contentType = res.headers.get("content-type") ?? "";
  const isTextual = /text\/plain|text\/markdown|application\/xml/.test(contentType);
  if (!isTextual) {
    throw new Error(`${path}: unexpected content-type ${contentType}`);
  }
  if (!pattern.test(body)) {
    throw new Error(`${path}: expected marker missing`);
  }
  console.log(`${path}: ok (${contentType})`);
};
// Requests a path relative to baseUrl and verifies the exact HTTP status code.
// - path: request path, resolved against baseUrl.
// - status: the status code the response must carry.
// Throws an Error naming both the expected and the received status on mismatch.
const assertStatus = async (path, status) => {
  const target = new URL(path, baseUrl);
  const noCacheHeaders = {
    "cache-control": "no-cache",
    pragma: "no-cache",
  };
  const { status: received } = await fetch(target, { headers: noCacheHeaders });
  if (received !== status) {
    throw new Error(`${path}: expected HTTP ${status}, got ${received}`);
  }
  console.log(`${path}: ok (HTTP ${status})`);
};
const deadline = Date.now() + 20 * 60 * 1000;
let lastError;
for (let attempt = 1; Date.now() < deadline; attempt += 1) {
@ -107,6 +135,12 @@ jobs:
await assertPage(page, attempt);
}
await assertMarkdown("/concepts/models");
await assertText("/robots.txt", /Sitemap: .*\/sitemap\.xml/);
await assertText("/llms.txt", /## Documentation Index/);
await assertText("/.well-known/llms.txt", /## Documentation Index/);
await assertText("/sitemap.xml", /<urlset/);
await assertStatus("/llms-full.txt", 410);
await assertStatus("/.well-known/llms-full.txt", 410);
process.exit(0);
} catch (error) {
lastError = error;

View File

@ -191,6 +191,10 @@ curl -I https://documentation.openclaw.ai/start/getting-started
curl -I https://documentation.openclaw.ai/concepts/models
curl -I https://documentation.openclaw.ai/concepts/models.md
curl -I https://documentation.openclaw.ai/docs/platforms/digitalocean
curl -I https://documentation.openclaw.ai/llms.txt
curl -I https://documentation.openclaw.ai/.well-known/llms.txt
curl -I https://documentation.openclaw.ai/robots.txt
curl -I https://documentation.openclaw.ai/sitemap.xml
curl -I https://documentation.openclaw.ai/llms-full.txt
curl -I https://documentation.openclaw.ai/assets/docs-site.css
curl -i https://documentation.openclaw.ai/ask-molty/api/session
@ -200,6 +204,10 @@ Expected after R2 cutover:
- slashless HTML paths return `200`.
- `.md` paths return `text/markdown`.
- `/llms.txt` and `/.well-known/llms.txt` return the lightweight docs index, not a full-site corpus.
- `/robots.txt` returns `200 text/plain`.
- `/sitemap.xml` returns `200 application/xml`.
- `/llms-full.txt` returns `410`; OpenClaw intentionally does not publish a full-site LLM corpus.
- docs responses include `X-OpenClaw-Docs-Origin: cloudflare-r2`.
- repeated router requests return `X-OpenClaw-Docs-Cache: HIT`.
- `/ask-molty/api/session` returns `401` when logged out.

View File

@ -63,7 +63,7 @@ copyPublicFiles();
await renderPageOgCards();
for (const page of pages) writePage(page);
writeLlmsIndex();
writeLlmsFull();
writeRobotsTxt();
writeSitemap();
writeRedirects();
writeStaticAssets();
@ -330,16 +330,6 @@ function searchModal() {
return `<div class="search-modal"><div class="search-panel"><div class="search-head"><input data-search-input placeholder="Search docs"><button data-search-close>Close</button></div><div class="search-results" data-search-results></div></div></div>`;
}
/**
 * Writes `llms-full.txt` into `outDir`: one section per page returned by
 * `englishDocsPages()`, each with a title heading, a `Source:` line and the
 * page body passed through `stripMdxForLlms`, separated by `---` rules.
 *
 * The source origin comes from `DOCS_SITE_CANONICAL_ORIGIN`, falling back to
 * `https://<DOCS_SITE_CNAME>`; when neither is set the bare page route is used.
 */
function writeLlmsFull() {
  const { DOCS_SITE_CANONICAL_ORIGIN: canonicalOrigin, DOCS_SITE_CNAME: cname } = process.env;
  const cnameOrigin = cname ? `https://${cname}` : "";
  // Drop a single trailing slash so origin + route never produces "//".
  const llmsOrigin = (canonicalOrigin ?? cnameOrigin).replace(/\/$/, "");

  const sections = [];
  for (const page of englishDocsPages()) {
    const route = pageRoute(page);
    const source = llmsOrigin ? `${llmsOrigin}${route}` : route;
    const body = stripMdxForLlms(page.body).trim();
    sections.push(`# ${page.title}\nSource: ${source}\n\n${body}\n`);
  }

  const bundle = sections.join("\n\n---\n\n");
  fs.writeFileSync(path.join(outDir, "llms-full.txt"), `${bundle}\n`, "utf8");
}
function writeLlmsIndex() {
const origin = docsOrigin();
const lines = [
@ -347,9 +337,13 @@ function writeLlmsIndex() {
"",
config.description ?? "OpenClaw documentation.",
"",
"## Full Documentation",
"> Use this file as a lightweight map of the OpenClaw documentation. Fetch individual pages as Markdown with `.md` URLs or `Accept: text/markdown`; OpenClaw does not publish a full-site LLM corpus.",
"",
`- [llms-full.txt](${origin}/llms-full.txt): Full plain-text documentation bundle for LLM context.`,
"## Agent Resources",
"",
`- [Markdown page export](${origin}/start/getting-started.md): Append \`.md\` to any docs page URL for clean Markdown.`,
`- [Sitemap](${origin}/sitemap.xml): Search crawler URL index.`,
`- [Robots policy](${origin}/robots.txt): Bot and crawler policy.`,
"",
"## Documentation Index",
"",
@ -361,6 +355,47 @@ function writeLlmsIndex() {
const content = `${lines.join("\n")}\n`;
fs.writeFileSync(path.join(outDir, "llms.txt"), content, "utf8");
fs.writeFileSync(path.join(outDir, "llm.txt"), content, "utf8");
const wellKnownDir = path.join(outDir, ".well-known");
fs.mkdirSync(wellKnownDir, { recursive: true });
fs.writeFileSync(path.join(wellKnownDir, "llms.txt"), content, "utf8");
}
/**
 * Writes `robots.txt` into `outDir`.
 *
 * The file holds a comment header, one rule group for `*` and one per named
 * bot agent (all groups carry the same Allow/Disallow rules), followed by the
 * `Sitemap:` and `LLMS:` pointers built from `docsOrigin()`.
 */
function writeRobotsTxt() {
  const origin = docsOrigin();

  // Rules repeated verbatim in every user-agent group.
  const sharedRules = [
    "Allow: /",
    "Disallow: /ask-molty/api/",
    "Disallow: /llms-full.txt",
    "Disallow: /.well-known/llms-full.txt",
  ];
  // The wildcard group comes first, then the explicitly named crawlers.
  const userAgents = [
    "*",
    "GPTBot",
    "OAI-SearchBot",
    "ChatGPT-User",
    "ClaudeBot",
    "Claude-User",
    "PerplexityBot",
    "Perplexity-User",
    "Google-Extended",
  ];
  // Each group ends with an empty entry so groups are separated by a blank line.
  const groups = userAgents.flatMap((agent) => [`User-agent: ${agent}`, ...sharedRules, ""]);

  const content = [
    "# OpenClaw documentation crawler policy",
    "# Human docs are HTML. Agent-optimized docs are available as Markdown via .md URLs or Accept: text/markdown.",
    "# No full-site LLM corpus is published; use /llms.txt as the index and fetch only the pages you need.",
    "",
    ...groups,
    `Sitemap: ${origin}/sitemap.xml`,
    `LLMS: ${origin}/llms.txt`,
    // Trailing empty entry yields the final newline after join.
    "",
  ].join("\n");

  fs.writeFileSync(path.join(outDir, "robots.txt"), content, "utf8");
}
function writeSitemap() {

View File

@ -12,7 +12,8 @@ const required = [
"concepts/models.md",
"llm.txt",
"llms.txt",
"llms-full.txt",
".well-known/llms.txt",
"robots.txt",
"sitemap.xml",
"de/tools/reactions/index.html",
"de/gateway/heartbeat/index.html",
@ -40,6 +41,21 @@ for (const rel of required) {
if (pattern.test(html)) throw new Error(`${rel}: poison matched ${pattern}`);
}
}
// The full-site corpus must not exist in the build output, at the root or under .well-known.
for (const forbidden of ["llms-full.txt", ".well-known/llms-full.txt"]) {
  const emitted = fs.existsSync(path.join(site, forbidden));
  if (emitted) {
    throw new Error(`${forbidden}: full-site LLM corpus should not be emitted`);
  }
}

// llms.txt must not mention the corpus and must point at page-level Markdown.
const llms = fs.readFileSync(path.join(site, "llms.txt"), "utf8");
if (/llms-full\.txt/.test(llms)) {
  throw new Error("llms.txt: should not advertise llms-full.txt");
}
if (!/Accept: text\/markdown|\.md/.test(llms)) {
  throw new Error("llms.txt: should advertise page-level Markdown");
}

// The .well-known copy must be byte-identical to the root index.
const wellKnownLlms = fs.readFileSync(path.join(site, ".well-known/llms.txt"), "utf8");
if (wellKnownLlms !== llms) {
  throw new Error(".well-known/llms.txt: does not match root llms.txt");
}

// robots.txt must carry the sitemap pointer, the corpus Disallow and the LLMS pointer.
const robots = fs.readFileSync(path.join(site, "robots.txt"), "utf8");
if (!/Sitemap: https:\/\/documentation\.openclaw\.ai\/sitemap\.xml/.test(robots)) {
  throw new Error("robots.txt: sitemap directive missing");
}
if (!(/Disallow: \/llms-full\.txt/.test(robots) && /LLMS: https:\/\/documentation\.openclaw\.ai\/llms\.txt/.test(robots))) {
  throw new Error("robots.txt: LLM directives missing");
}
const zhReactions = fs.readFileSync(path.join(site, "zh-CN/tools/reactions/index.html"), "utf8");
if (!/href="(?:\/docs)?\/zh-CN\/tools\/reactions"/.test(zhReactions)) {
throw new Error("zh-CN reactions: language picker does not preserve current page");

View File

@ -20,6 +20,17 @@ export default {
});
}
if (isFullLlmsPath(url.pathname)) {
return new Response(request.method === "HEAD" ? null : "OpenClaw does not publish a full-site LLM corpus. Use /llms.txt and page-level Markdown instead.\n", {
status: 410,
headers: {
"Cache-Control": "public, max-age=300",
"Content-Type": "text/plain; charset=utf-8",
"X-OpenClaw-Docs-Origin": "worker",
},
});
}
if (url.pathname.endsWith(".md")) {
return markdownResponse(env, ctx, request, url.pathname);
}
@ -177,6 +188,11 @@ function isHtmlPath(pathname: string): boolean {
return pathname.endsWith(".html") || !/\.[^/]+$/.test(pathname);
}
/**
 * Reports whether a request path targets the full-site LLM corpus.
 *
 * Trailing slashes are ignored; the comparison is otherwise exact and
 * case-sensitive.
 *
 * @param pathname - URL pathname to classify.
 * @returns `true` for `/llms-full.txt` and `/.well-known/llms-full.txt`.
 */
function isFullLlmsPath(pathname: string): boolean {
  // Walk back over any run of trailing slashes.
  let end = pathname.length;
  while (end > 0 && pathname[end - 1] === "/") {
    end -= 1;
  }
  const trimmed = pathname.slice(0, end);
  return ["/llms-full.txt", "/.well-known/llms-full.txt"].includes(trimmed);
}
function appendVary(current: string | null, value: string): string {
const parts = new Set((current ?? "").split(",").map((part) => part.trim()).filter(Boolean));
parts.add(value);