diff --git a/.github/workflows/docs-live-smoke.yml b/.github/workflows/docs-live-smoke.yml index 7a4bdd32c..47817761e 100644 --- a/.github/workflows/docs-live-smoke.yml +++ b/.github/workflows/docs-live-smoke.yml @@ -99,6 +99,34 @@ jobs: if (!/^---\n/m.test(text)) throw new Error(`${path}: markdown frontmatter missing`); console.log(`${path}: ok (markdown)`); }; + const assertText = async (path, pattern) => { + const url = new URL(path, baseUrl); + url.searchParams.set("_openclaw_smoke", process.env.GITHUB_SHA); + const response = await fetch(url, { + headers: { + "cache-control": "no-cache", + pragma: "no-cache", + }, + }); + if (!response.ok) throw new Error(`${path}: text HTTP ${response.status}`); + const text = await response.text(); + const contentType = response.headers.get("content-type") ?? ""; + if (!/text\/plain|text\/markdown|application\/xml/.test(contentType)) { + throw new Error(`${path}: unexpected content-type ${contentType}`); + } + if (!pattern.test(text)) throw new Error(`${path}: expected marker missing`); + console.log(`${path}: ok (${contentType})`); + }; + const assertStatus = async (path, status) => { + const response = await fetch(new URL(path, baseUrl), { + headers: { + "cache-control": "no-cache", + pragma: "no-cache", + }, + }); + if (response.status !== status) throw new Error(`${path}: expected HTTP ${status}, got ${response.status}`); + console.log(`${path}: ok (HTTP ${status})`); + }; const deadline = Date.now() + 20 * 60 * 1000; let lastError; for (let attempt = 1; Date.now() < deadline; attempt += 1) { @@ -107,6 +135,12 @@ jobs: await assertPage(page, attempt); } await assertMarkdown("/concepts/models"); + await assertText("/robots.txt", /Sitemap: .*\/sitemap\.xml/); + await assertText("/llms.txt", /## Documentation Index/); + await assertText("/.well-known/llms.txt", /## Documentation Index/); + await assertText("/sitemap.xml", /
`; } -function writeLlmsFull() { - const llmsOrigin = (process.env.DOCS_SITE_CANONICAL_ORIGIN ?? (process.env.DOCS_SITE_CNAME ? `https://${process.env.DOCS_SITE_CNAME}` : "")).replace(/\/$/, ""); - const englishPages = englishDocsPages(); - const content = englishPages.map((page) => { - const source = llmsOrigin ? `${llmsOrigin}${pageRoute(page)}` : pageRoute(page); - return `# ${page.title}\nSource: ${source}\n\n${stripMdxForLlms(page.body).trim()}\n`; - }).join("\n\n---\n\n"); - fs.writeFileSync(path.join(outDir, "llms-full.txt"), `${content}\n`, "utf8"); -} - function writeLlmsIndex() { const origin = docsOrigin(); const lines = [ @@ -347,9 +337,13 @@ function writeLlmsIndex() { "", config.description ?? "OpenClaw documentation.", "", - "## Full Documentation", + "> Use this file as a lightweight map of the OpenClaw documentation. Fetch individual pages as Markdown with `.md` URLs or `Accept: text/markdown`; OpenClaw does not publish a full-site LLM corpus.", "", - `- [llms-full.txt](${origin}/llms-full.txt): Full plain-text documentation bundle for LLM context.`, + "## Agent Resources", + "", + `- [Markdown page export](${origin}/start/getting-started.md): Append \`.md\` to any docs page URL for clean Markdown.`, + `- [Sitemap](${origin}/sitemap.xml): Search crawler URL index.`, + `- [Robots policy](${origin}/robots.txt): Bot and crawler policy.`, "", "## Documentation Index", "", @@ -361,6 +355,47 @@ function writeLlmsIndex() { const content = `${lines.join("\n")}\n`; fs.writeFileSync(path.join(outDir, "llms.txt"), content, "utf8"); fs.writeFileSync(path.join(outDir, "llm.txt"), content, "utf8"); + const wellKnownDir = path.join(outDir, ".well-known"); + fs.mkdirSync(wellKnownDir, { recursive: true }); + fs.writeFileSync(path.join(wellKnownDir, "llms.txt"), content, "utf8"); +} + +function writeRobotsTxt() { + const origin = docsOrigin(); + const botAgents = [ + "GPTBot", + "OAI-SearchBot", + "ChatGPT-User", + "ClaudeBot", + "Claude-User", + "PerplexityBot", + "Perplexity-User", + "Google-Extended", + ]; + const lines = [ + "# OpenClaw documentation crawler policy", + "# Human docs are HTML. Agent-optimized docs are available as Markdown via .md URLs or Accept: text/markdown.", + "# No full-site LLM corpus is published; use /llms.txt as the index and fetch only the pages you need.", + "", + "User-agent: *", + "Allow: /", + "Disallow: /ask-molty/api/", + "Disallow: /llms-full.txt", + "Disallow: /.well-known/llms-full.txt", + "", + ]; + for (const agent of botAgents) { + lines.push(`User-agent: ${agent}`); + lines.push("Allow: /"); + lines.push("Disallow: /ask-molty/api/"); + lines.push("Disallow: /llms-full.txt"); + lines.push("Disallow: /.well-known/llms-full.txt"); + lines.push(""); + } + lines.push(`Sitemap: ${origin}/sitemap.xml`); + lines.push(`LLMS: ${origin}/llms.txt`); + lines.push(""); + fs.writeFileSync(path.join(outDir, "robots.txt"), lines.join("\n"), "utf8"); } function writeSitemap() { diff --git a/scripts/docs-site/smoke.mjs b/scripts/docs-site/smoke.mjs index 66c76083b..202131086 100644 --- a/scripts/docs-site/smoke.mjs +++ b/scripts/docs-site/smoke.mjs @@ -12,7 +12,8 @@ const required = [ "concepts/models.md", "llm.txt", "llms.txt", - "llms-full.txt", + ".well-known/llms.txt", + "robots.txt", "sitemap.xml", "de/tools/reactions/index.html", "de/gateway/heartbeat/index.html", @@ -40,6 +41,21 @@ for (const rel of required) { if (pattern.test(html)) throw new Error(`${rel}: poison matched ${pattern}`); } } +for (const rel of ["llms-full.txt", ".well-known/llms-full.txt"]) { + if (fs.existsSync(path.join(site, rel))) throw new Error(`${rel}: full-site LLM corpus should not be emitted`); +} +const llms = fs.readFileSync(path.join(site, "llms.txt"), "utf8"); +if (/llms-full\.txt/.test(llms)) throw new Error("llms.txt: should not advertise llms-full.txt"); +if (!/Accept: text\/markdown|\.md/.test(llms)) throw new Error("llms.txt: should advertise page-level Markdown"); +const wellKnownLlms = fs.readFileSync(path.join(site, ".well-known/llms.txt"), "utf8"); +if (wellKnownLlms !== llms) throw new Error(".well-known/llms.txt: does not match root llms.txt"); +const robots = fs.readFileSync(path.join(site, "robots.txt"), "utf8"); +if (!/Sitemap: https:\/\/documentation\.openclaw\.ai\/sitemap\.xml/.test(robots)) { + throw new Error("robots.txt: sitemap directive missing"); +} +if (!/Disallow: \/llms-full\.txt/.test(robots) || !/LLMS: https:\/\/documentation\.openclaw\.ai\/llms\.txt/.test(robots)) { + throw new Error("robots.txt: LLM directives missing"); +} const zhReactions = fs.readFileSync(path.join(site, "zh-CN/tools/reactions/index.html"), "utf8"); if (!/href="(?:\/docs)?\/zh-CN\/tools\/reactions"/.test(zhReactions)) { throw new Error("zh-CN reactions: language picker does not preserve current page"); diff --git a/workers/docs-router.ts b/workers/docs-router.ts index 2ad09c54a..20c5abb64 100644 --- a/workers/docs-router.ts +++ b/workers/docs-router.ts @@ -20,6 +20,17 @@ export default { }); } + if (isFullLlmsPath(url.pathname)) { + return new Response(request.method === "HEAD" ? null : "OpenClaw does not publish a full-site LLM corpus. Use /llms.txt and page-level Markdown instead.\n", { + status: 410, + headers: { + "Cache-Control": "public, max-age=300", + "Content-Type": "text/plain; charset=utf-8", + "X-OpenClaw-Docs-Origin": "worker", + }, + }); + } + if (url.pathname.endsWith(".md")) { return markdownResponse(env, ctx, request, url.pathname); } @@ -177,6 +188,11 @@ function isHtmlPath(pathname: string): boolean { return pathname.endsWith(".html") || !/\.[^/]+$/.test(pathname); } +function isFullLlmsPath(pathname: string): boolean { + const clean = pathname.replace(/\/+$/, ""); + return clean === "/llms-full.txt" || clean === "/.well-known/llms-full.txt"; +} + function appendVary(current: string | null, value: string): string { const parts = new Set((current ?? "").split(",").map((part) => part.trim()).filter(Boolean)); parts.add(value);