fix(docs): remove full LLM corpus endpoint
This commit is contained in:
parent
590b289f95
commit
691ce7cbe3
34
.github/workflows/docs-live-smoke.yml
vendored
34
.github/workflows/docs-live-smoke.yml
vendored
@ -99,6 +99,34 @@ jobs:
|
||||
if (!/^---\n/m.test(text)) throw new Error(`${path}: markdown frontmatter missing`);
|
||||
console.log(`${path}: ok (markdown)`);
|
||||
};
|
||||
/**
 * Smoke-checks a text endpoint of the deployed docs site.
 *
 * The request goes to `path` resolved against `baseUrl`, carries no-cache
 * request headers, and has a `_openclaw_smoke` query parameter set from the
 * GITHUB_SHA environment variable (presumably a per-commit cache buster).
 *
 * Checks, in order: the response is OK, the content-type mentions
 * text/plain, text/markdown or application/xml, and the body matches
 * `pattern`. The first failed check throws an Error prefixed with `path`;
 * success is logged to stdout.
 *
 * @param path    Site-relative path to fetch.
 * @param pattern RegExp that the response body must match.
 */
const assertText = async (path, pattern) => {
  const target = new URL(path, baseUrl);
  target.searchParams.set("_openclaw_smoke", process.env.GITHUB_SHA);

  const noCacheHeaders = { "cache-control": "no-cache", pragma: "no-cache" };
  const res = await fetch(target, { headers: noCacheHeaders });
  if (!res.ok) {
    throw new Error(`${path}: text HTTP ${res.status}`);
  }

  // The body is read before the content-type is inspected, so a failed read
  // surfaces ahead of any content-type complaint.
  const payload = await res.text();
  const mime = res.headers.get("content-type") ?? "";

  const looksTextual = /text\/plain|text\/markdown|application\/xml/.test(mime);
  if (!looksTextual) {
    throw new Error(`${path}: unexpected content-type ${mime}`);
  }

  const hasMarker = pattern.test(payload);
  if (!hasMarker) {
    throw new Error(`${path}: expected marker missing`);
  }

  console.log(`${path}: ok (${mime})`);
};
|
||||
const assertStatus = async (path, status) => {
|
||||
const response = await fetch(new URL(path, baseUrl), {
|
||||
headers: {
|
||||
"cache-control": "no-cache",
|
||||
pragma: "no-cache",
|
||||
},
|
||||
});
|
||||
if (response.status !== status) throw new Error(`${path}: expected HTTP ${status}, got ${response.status}`);
|
||||
console.log(`${path}: ok (HTTP ${status})`);
|
||||
};
|
||||
const deadline = Date.now() + 20 * 60 * 1000;
|
||||
let lastError;
|
||||
for (let attempt = 1; Date.now() < deadline; attempt += 1) {
|
||||
@ -107,6 +135,12 @@ jobs:
|
||||
await assertPage(page, attempt);
|
||||
}
|
||||
await assertMarkdown("/concepts/models");
|
||||
await assertText("/robots.txt", /Sitemap: .*\/sitemap\.xml/);
|
||||
await assertText("/llms.txt", /## Documentation Index/);
|
||||
await assertText("/.well-known/llms.txt", /## Documentation Index/);
|
||||
await assertText("/sitemap.xml", /<urlset/);
|
||||
await assertStatus("/llms-full.txt", 410);
|
||||
await assertStatus("/.well-known/llms-full.txt", 410);
|
||||
process.exit(0);
|
||||
} catch (error) {
|
||||
lastError = error;
|
||||
|
||||
@ -191,6 +191,10 @@ curl -I https://documentation.openclaw.ai/start/getting-started
|
||||
curl -I https://documentation.openclaw.ai/concepts/models
|
||||
curl -I https://documentation.openclaw.ai/concepts/models.md
|
||||
curl -I https://documentation.openclaw.ai/docs/platforms/digitalocean
|
||||
curl -I https://documentation.openclaw.ai/llms.txt
|
||||
curl -I https://documentation.openclaw.ai/.well-known/llms.txt
|
||||
curl -I https://documentation.openclaw.ai/robots.txt
|
||||
curl -I https://documentation.openclaw.ai/sitemap.xml
|
||||
curl -I https://documentation.openclaw.ai/llms-full.txt
|
||||
curl -I https://documentation.openclaw.ai/assets/docs-site.css
|
||||
curl -i https://documentation.openclaw.ai/ask-molty/api/session
|
||||
@ -200,6 +204,10 @@ Expected after R2 cutover:
|
||||
|
||||
- slashless HTML paths return `200`.
|
||||
- `.md` paths return `text/markdown`.
|
||||
- `/llms.txt` and `/.well-known/llms.txt` return the lightweight docs index, not a full-site corpus.
|
||||
- `/robots.txt` returns `200 text/plain`.
|
||||
- `/sitemap.xml` returns `200 application/xml`.
|
||||
- `/llms-full.txt` returns `410`; OpenClaw intentionally does not publish a full-site LLM corpus.
|
||||
- docs responses include `X-OpenClaw-Docs-Origin: cloudflare-r2`.
|
||||
- repeated router requests become `X-OpenClaw-Docs-Cache: HIT`.
|
||||
- `/ask-molty/api/session` returns `401` when logged out.
|
||||
|
||||
@ -63,7 +63,7 @@ copyPublicFiles();
|
||||
await renderPageOgCards();
|
||||
for (const page of pages) writePage(page);
|
||||
writeLlmsIndex();
|
||||
writeLlmsFull();
|
||||
writeRobotsTxt();
|
||||
writeSitemap();
|
||||
writeRedirects();
|
||||
writeStaticAssets();
|
||||
@ -330,16 +330,6 @@ function searchModal() {
|
||||
return `<div class="search-modal"><div class="search-panel"><div class="search-head"><input data-search-input placeholder="Search docs"><button data-search-close>Close</button></div><div class="search-results" data-search-results></div></div></div>`;
|
||||
}
|
||||
|
||||
/**
 * Writes `llms-full.txt` into `outDir`: every page returned by
 * `englishDocsPages()` rendered as a titled section, with sections separated
 * by a `---` rule.
 *
 * Each section's `Source:` line is the page route, prefixed with an origin
 * when one is configured. The origin comes from DOCS_SITE_CANONICAL_ORIGIN,
 * else `https://` + DOCS_SITE_CNAME, else empty; one trailing slash is
 * dropped.
 */
function writeLlmsFull() {
  const canonicalOrigin = process.env.DOCS_SITE_CANONICAL_ORIGIN;
  const cnameOrigin = process.env.DOCS_SITE_CNAME ? `https://${process.env.DOCS_SITE_CNAME}` : "";
  const llmsOrigin = (canonicalOrigin ?? cnameOrigin).replace(/\/$/, "");

  // Renders one page as "# title / Source: url / stripped body".
  const renderSection = (page) => {
    const route = llmsOrigin ? `${llmsOrigin}${pageRoute(page)}` : pageRoute(page);
    const body = stripMdxForLlms(page.body).trim();
    return `# ${page.title}\nSource: ${route}\n\n${body}\n`;
  };

  const sections = englishDocsPages().map((page) => renderSection(page));
  const bundle = sections.join("\n\n---\n\n");
  fs.writeFileSync(path.join(outDir, "llms-full.txt"), `${bundle}\n`, "utf8");
}
|
||||
|
||||
function writeLlmsIndex() {
|
||||
const origin = docsOrigin();
|
||||
const lines = [
|
||||
@ -347,9 +337,13 @@ function writeLlmsIndex() {
|
||||
"",
|
||||
config.description ?? "OpenClaw documentation.",
|
||||
"",
|
||||
"## Full Documentation",
|
||||
"> Use this file as a lightweight map of the OpenClaw documentation. Fetch individual pages as Markdown with `.md` URLs or `Accept: text/markdown`; OpenClaw does not publish a full-site LLM corpus.",
|
||||
"",
|
||||
`- [llms-full.txt](${origin}/llms-full.txt): Full plain-text documentation bundle for LLM context.`,
|
||||
"## Agent Resources",
|
||||
"",
|
||||
`- [Markdown page export](${origin}/start/getting-started.md): Append \`.md\` to any docs page URL for clean Markdown.`,
|
||||
`- [Sitemap](${origin}/sitemap.xml): Search crawler URL index.`,
|
||||
`- [Robots policy](${origin}/robots.txt): Bot and crawler policy.`,
|
||||
"",
|
||||
"## Documentation Index",
|
||||
"",
|
||||
@ -361,6 +355,47 @@ function writeLlmsIndex() {
|
||||
const content = `${lines.join("\n")}\n`;
|
||||
fs.writeFileSync(path.join(outDir, "llms.txt"), content, "utf8");
|
||||
fs.writeFileSync(path.join(outDir, "llm.txt"), content, "utf8");
|
||||
const wellKnownDir = path.join(outDir, ".well-known");
|
||||
fs.mkdirSync(wellKnownDir, { recursive: true });
|
||||
fs.writeFileSync(path.join(wellKnownDir, "llms.txt"), content, "utf8");
|
||||
}
|
||||
|
||||
/**
 * Writes `robots.txt` into `outDir`.
 *
 * Layout: three comment lines, one rule group for `*`, one identical rule
 * group for each named AI crawler, then `Sitemap:` and `LLMS:` lines that
 * point at `docsOrigin()`. Every group allows `/` and disallows the Ask Molty
 * API plus both `llms-full.txt` locations.
 */
function writeRobotsTxt() {
  const origin = docsOrigin();

  // Rules repeated verbatim under every User-agent group.
  const groupRules = [
    "Allow: /",
    "Disallow: /ask-molty/api/",
    "Disallow: /llms-full.txt",
    "Disallow: /.well-known/llms-full.txt",
  ];

  // The wildcard group comes first, followed by the explicitly named crawlers.
  const userAgents = [
    "*",
    "GPTBot",
    "OAI-SearchBot",
    "ChatGPT-User",
    "ClaudeBot",
    "Claude-User",
    "PerplexityBot",
    "Perplexity-User",
    "Google-Extended",
  ];

  const preamble = [
    "# OpenClaw documentation crawler policy",
    "# Human docs are HTML. Agent-optimized docs are available as Markdown via .md URLs or Accept: text/markdown.",
    "# No full-site LLM corpus is published; use /llms.txt as the index and fetch only the pages you need.",
    "",
  ];

  // Each group ends with an empty entry so groups are separated by a blank line.
  const groups = userAgents.flatMap((agent) => [`User-agent: ${agent}`, ...groupRules, ""]);

  // The trailing empty entry makes the file end with a newline after join.
  const trailer = [`Sitemap: ${origin}/sitemap.xml`, `LLMS: ${origin}/llms.txt`, ""];

  const robots = [...preamble, ...groups, ...trailer].join("\n");
  fs.writeFileSync(path.join(outDir, "robots.txt"), robots, "utf8");
}
|
||||
|
||||
function writeSitemap() {
|
||||
|
||||
@ -12,7 +12,8 @@ const required = [
|
||||
"concepts/models.md",
|
||||
"llm.txt",
|
||||
"llms.txt",
|
||||
"llms-full.txt",
|
||||
".well-known/llms.txt",
|
||||
"robots.txt",
|
||||
"sitemap.xml",
|
||||
"de/tools/reactions/index.html",
|
||||
"de/gateway/heartbeat/index.html",
|
||||
@ -40,6 +41,21 @@ for (const rel of required) {
|
||||
if (pattern.test(html)) throw new Error(`${rel}: poison matched ${pattern}`);
|
||||
}
|
||||
}
|
||||
for (const rel of ["llms-full.txt", ".well-known/llms-full.txt"]) {
|
||||
if (fs.existsSync(path.join(site, rel))) throw new Error(`${rel}: full-site LLM corpus should not be emitted`);
|
||||
}
|
||||
const llms = fs.readFileSync(path.join(site, "llms.txt"), "utf8");
|
||||
if (/llms-full\.txt/.test(llms)) throw new Error("llms.txt: should not advertise llms-full.txt");
|
||||
if (!/Accept: text\/markdown|\.md/.test(llms)) throw new Error("llms.txt: should advertise page-level Markdown");
|
||||
const wellKnownLlms = fs.readFileSync(path.join(site, ".well-known/llms.txt"), "utf8");
|
||||
if (wellKnownLlms !== llms) throw new Error(".well-known/llms.txt: does not match root llms.txt");
|
||||
const robots = fs.readFileSync(path.join(site, "robots.txt"), "utf8");
|
||||
if (!/Sitemap: https:\/\/documentation\.openclaw\.ai\/sitemap\.xml/.test(robots)) {
|
||||
throw new Error("robots.txt: sitemap directive missing");
|
||||
}
|
||||
if (!/Disallow: \/llms-full\.txt/.test(robots) || !/LLMS: https:\/\/documentation\.openclaw\.ai\/llms\.txt/.test(robots)) {
|
||||
throw new Error("robots.txt: LLM directives missing");
|
||||
}
|
||||
const zhReactions = fs.readFileSync(path.join(site, "zh-CN/tools/reactions/index.html"), "utf8");
|
||||
if (!/href="(?:\/docs)?\/zh-CN\/tools\/reactions"/.test(zhReactions)) {
|
||||
throw new Error("zh-CN reactions: language picker does not preserve current page");
|
||||
|
||||
@ -20,6 +20,17 @@ export default {
|
||||
});
|
||||
}
|
||||
|
||||
if (isFullLlmsPath(url.pathname)) {
|
||||
return new Response(request.method === "HEAD" ? null : "OpenClaw does not publish a full-site LLM corpus. Use /llms.txt and page-level Markdown instead.\n", {
|
||||
status: 410,
|
||||
headers: {
|
||||
"Cache-Control": "public, max-age=300",
|
||||
"Content-Type": "text/plain; charset=utf-8",
|
||||
"X-OpenClaw-Docs-Origin": "worker",
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
if (url.pathname.endsWith(".md")) {
|
||||
return markdownResponse(env, ctx, request, url.pathname);
|
||||
}
|
||||
@ -177,6 +188,11 @@ function isHtmlPath(pathname: string): boolean {
|
||||
return pathname.endsWith(".html") || !/\.[^/]+$/.test(pathname);
|
||||
}
|
||||
|
||||
/**
 * Reports whether a request path targets the retired full-site LLM corpus.
 *
 * Trailing slashes are ignored, so `/llms-full.txt/` matches as well. The
 * comparison is otherwise exact and case-sensitive.
 *
 * @param pathname URL pathname taken from the incoming request.
 * @returns `true` for `/llms-full.txt` and `/.well-known/llms-full.txt`.
 */
function isFullLlmsPath(pathname: string): boolean {
  // Trim trailing "/" with a linear scan. The pathname is request-controlled,
  // and a pattern like /\/+$/ backtracks quadratically when a long run of
  // slashes is followed by any other character.
  let end = pathname.length;
  while (end > 0 && pathname.charCodeAt(end - 1) === 47 /* "/" */) {
    end -= 1;
  }
  const clean = pathname.slice(0, end);
  return clean === "/llms-full.txt" || clean === "/.well-known/llms-full.txt";
}
|
||||
|
||||
function appendVary(current: string | null, value: string): string {
|
||||
const parts = new Set((current ?? "").split(",").map((part) => part.trim()).filter(Boolean));
|
||||
parts.add(value);
|
||||
|
||||
Loading…
Reference in New Issue
Block a user