clawhub/convex/lib/githubImport.ts
2026-01-09 09:10:49 +01:00

426 lines
13 KiB
TypeScript

import { TEXT_FILE_EXTENSION_SET } from 'clawdhub-schema'
import { zipSync } from 'fflate'
import semver from 'semver'
import { parseFrontmatter } from './skills'
/**
 * A GitHub URL broken into its parts, as produced by `parseGitHubImportUrl`.
 */
export type GitHubImportUrl = {
/** First path segment of the URL, URL-decoded. */
owner: string
/** Second path segment, URL-decoded, with a trailing `.git` removed. */
repo: string
/** Segment that follows `tree`/`blob`; absent when the URL has no such part. */
ref?: string
/**
 * Directory inside the repository (for `blob` URLs, the parent directory of the file).
 * Absent when the URL points at the repository root.
 */
path?: string
/** The input string after trimming surrounding whitespace. */
originalUrl: string
}
/**
 * A parsed GitHub URL pinned to a concrete commit, as produced by `resolveGitHubCommit`.
 */
export type GitHubImportResolved = {
/** Repository owner, copied from the parsed URL. */
owner: string
/** Repository name, copied from the parsed URL. */
repo: string
/** The trimmed ref from the parsed URL, or `'HEAD'` when none was given. */
ref: string
/** Lowercase 40-character hexadecimal commit sha the ref resolved to. */
commit: string
/** Normalized directory inside the repository; empty string for the repository root. */
path: string
/** `https://github.com/<owner>/<repo>`. */
repoUrl: string
/** The original (trimmed) URL, copied from the parsed URL. */
originalUrl: string
}
/**
 * A directory that contains a skill file, as found by `detectGitHubImportCandidates`.
 */
export type GitHubImportCandidate = {
/** Normalized directory holding the skill file; empty string when it sits at the archive root. */
path: string
/** Normalized path of the skill file itself. */
readmePath: string
/** Trimmed `name` from the skill file's frontmatter, when it is a non-empty string. */
name?: string
/** Trimmed `description` from the skill file's frontmatter, when it is a non-empty string. */
description?: string
}
/**
 * One row of the file list built by `buildGitHubImportFileList`.
 */
export type GitHubImportFileEntry = {
/** Path of the file, as given in the input file list. */
path: string
/** Size of the file contents in bytes. */
size: number
/** Whether the path is among the default-selected paths. */
defaultSelected: boolean
}
// Upper bound on the number of requests `resolveHeadCommit` issues while following redirects.
const MAX_REDIRECTS = 6
// The only hostname `parseGitHubImportUrl` accepts; also used to build repository URLs.
const GITHUB_HOST = 'github.com'
// Host the commit archive is downloaded from; besides GITHUB_HOST, the only host redirects may target.
const CODELOAD_HOST = 'codeload.github.com'
// Lowercase file names that mark a directory as an import candidate (compared against lowercased paths).
const SKILL_FILENAMES = ['skill.md', 'skills.md']
/**
 * Parses a `https://github.com/<owner>/<repo>[/(tree|blob)/<ref>[/<path>]]` URL.
 *
 * - URLs whose third segment is neither `tree` nor `blob` resolve to the repository root.
 * - For `blob` URLs the returned `path` is the parent directory of the file.
 * - Only the single segment after `tree`/`blob` is treated as the ref.
 *
 * @param input - URL as entered by the user; surrounding whitespace is ignored.
 * @returns The owner, repo and (when present) ref and directory path.
 * @throws Error when the URL is malformed, is not an https github.com URL, has an
 *   owner/repo that is missing or contains characters outside `[A-Za-z0-9._-]`,
 *   or has a missing/invalid ref or path.
 */
export function parseGitHubImportUrl(input: string): GitHubImportUrl {
  const originalUrl = input.trim()
  let url: URL
  try {
    url = new URL(originalUrl)
  } catch {
    throw new Error('Invalid URL')
  }
  if (url.protocol !== 'https:') throw new Error('Only https:// URLs are supported')
  if (url.hostname !== GITHUB_HOST) throw new Error('Only github.com URLs are supported')
  const segments = url.pathname
    .split('/')
    .map((segment) => segment.trim())
    .filter(Boolean)
    .map((segment) => {
      try {
        return decodeURIComponent(segment)
      } catch {
        throw new Error('Invalid URL')
      }
    })
  const owner = segments[0] ?? ''
  const repo = (segments[1] ?? '').replace(/\.git$/, '')
  if (!owner || !repo) throw new Error('GitHub URL must be /<owner>/<repo>')
  // Segments were URL-decoded above, so they may now contain '/', '?', '#', whitespace
  // or be '.'/'..'. Such values would alter the request URLs that are later built from
  // owner/repo, so only plain name characters are accepted.
  const isPlainName = (value: string) =>
    /^[A-Za-z0-9._-]+$/.test(value) && value !== '.' && value !== '..'
  if (!isPlainName(owner) || !isPlainName(repo)) {
    throw new Error('GitHub URL must be /<owner>/<repo>')
  }
  const kind = segments[2] ?? ''
  if (!kind) return { owner, repo, originalUrl }
  if (kind !== 'tree' && kind !== 'blob') {
    // e.g. /issues, /pulls: fall back to the repository root.
    return { owner, repo, originalUrl }
  }
  const ref = segments[3] ?? ''
  if (!ref) throw new Error('Missing ref in GitHub URL')
  const rest = segments.slice(4).join('/')
  const normalizedRest = normalizeRepoPath(rest)
  if (kind === 'blob') {
    if (!rest) throw new Error('Missing path in GitHub URL')
    if (!normalizedRest) throw new Error('Invalid path in GitHub URL')
    // A blob URL names a file; the import works on its containing directory.
    const dir = normalizedRest.split('/').slice(0, -1).join('/')
    return { owner, repo, ref, path: dir || undefined, originalUrl }
  }
  if (rest && !normalizedRest) throw new Error('Invalid path in GitHub URL')
  return { owner, repo, ref, path: normalizedRest || undefined, originalUrl }
}
/**
 * Pins a parsed GitHub URL to a concrete commit.
 *
 * A missing or blank ref is treated as `HEAD`; any other ref is looked up by name.
 *
 * @param parsed - Result of `parseGitHubImportUrl`.
 * @param fetcher - `fetch` implementation used for the lookups.
 * @returns The parsed parts together with the resolved commit sha, the normalized
 *   path and the repository URL.
 */
export async function resolveGitHubCommit(
  parsed: GitHubImportUrl,
  fetcher: typeof fetch,
): Promise<GitHubImportResolved> {
  const { owner, repo, originalUrl } = parsed
  const requestedRef = parsed.ref?.trim()
  const ref = requestedRef ? requestedRef : 'HEAD'
  const path = normalizeRepoPath(parsed.path ?? '')

  let commit: string
  if (ref === 'HEAD') {
    commit = await resolveHeadCommit(parsed, fetcher)
  } else {
    commit = await resolveRefCommit(parsed, ref, fetcher)
  }

  return {
    owner,
    repo,
    ref,
    commit,
    path,
    repoUrl: `https://${GITHUB_HOST}/${owner}/${repo}`,
    originalUrl,
  }
}
/**
 * Resolves a named ref (branch, tag or sha) to a commit sha through the GitHub
 * commits API.
 *
 * @param parsed - Parsed URL supplying the owner and repository.
 * @param ref - Ref to resolve.
 * @param fetcher - `fetch` implementation used for the request.
 * @returns The lowercase 40-character commit sha.
 * @throws Error when the request is not successful or the response carries no valid sha.
 */
async function resolveRefCommit(parsed: GitHubImportUrl, ref: string, fetcher: typeof fetch) {
  // Encode every dynamic path segment (not only the ref) so characters such as
  // '/', '?' or '#' in owner/repo cannot change which API endpoint is requested.
  const owner = encodeURIComponent(parsed.owner)
  const repo = encodeURIComponent(parsed.repo)
  const apiUrl = `https://api.github.com/repos/${owner}/${repo}/commits/${encodeURIComponent(ref)}`
  const response = await fetcher(apiUrl, {
    headers: {
      Accept: 'application/vnd.github+json',
      'User-Agent': 'clawdhub/github-import',
    },
  })
  if (!response.ok) throw new Error('GitHub ref not found')
  const body = (await response.json()) as { sha?: unknown }
  const sha = typeof body.sha === 'string' ? body.sha : ''
  if (!/^[a-f0-9]{40}$/i.test(sha)) throw new Error('GitHub commit sha missing')
  return sha.toLowerCase()
}
/**
 * Resolves the commit behind `HEAD` by following the redirects of the repository's
 * `archive/HEAD.zip` URL and reading the sha from the last path segment of the
 * final URL.
 *
 * @param parsed - Parsed URL supplying the owner and repository.
 * @param fetcher - `fetch` implementation; called with `redirect: 'manual'`.
 * @returns The lowercase 40-character commit sha.
 * @throws Error when a redirect leaves https or the allowed GitHub hosts, or when
 *   the final URL does not end in a commit sha.
 */
async function resolveHeadCommit(parsed: GitHubImportUrl, fetcher: typeof fetch) {
  const owner = encodeURIComponent(parsed.owner)
  const repo = encodeURIComponent(parsed.repo)
  let url = `https://${GITHUB_HOST}/${owner}/${repo}/archive/HEAD.zip`
  for (let i = 0; i < MAX_REDIRECTS; i += 1) {
    const response = await fetcher(url, { redirect: 'manual' })
    const location = response.headers.get('location')
    // Only the headers matter here. Release the body (best effort) so the
    // archive is not left streaming on an unread response.
    try {
      await response.body?.cancel()
    } catch {
      // Ignored: cancellation is a courtesy, not a requirement.
    }
    if (!location) break
    const next = new URL(location, url)
    if (next.protocol !== 'https:') {
      throw new Error('Unexpected redirect protocol')
    }
    if (next.hostname !== GITHUB_HOST && next.hostname !== CODELOAD_HOST) {
      throw new Error('Unexpected redirect host')
    }
    url = next.toString()
  }
  // Read the sha from the path only, so a query string or fragment on the final
  // URL cannot hide it.
  const maybe = new URL(url).pathname.split('/').at(-1) ?? ''
  if (!/^[a-f0-9]{40}$/i.test(maybe)) {
    throw new Error('Could not resolve commit for HEAD')
  }
  return maybe.toLowerCase()
}
/**
 * Downloads the zip archive of a resolved commit, enforcing a size limit.
 *
 * The limit is checked against the `content-length` header when present and again
 * while streaming, so a missing or wrong header cannot bypass it.
 *
 * @param resolved - Commit to download.
 * @param fetcher - `fetch` implementation used for the download.
 * @param limits - Optional limits; `maxZipBytes` defaults to 25 MiB.
 * @returns The raw bytes of the archive.
 * @throws Error when the download fails or the archive exceeds the limit.
 */
export async function fetchGitHubZipBytes(
  resolved: GitHubImportResolved,
  fetcher: typeof fetch,
  limits?: { maxZipBytes?: number },
): Promise<Uint8Array> {
  const maxZipBytes = limits?.maxZipBytes ?? 25 * 1024 * 1024
  const owner = encodeURIComponent(resolved.owner)
  const repo = encodeURIComponent(resolved.repo)
  const url = `https://${CODELOAD_HOST}/${owner}/${repo}/zip/${resolved.commit}`
  const response = await fetcher(url, {
    headers: { 'User-Agent': 'clawdhub/github-import' },
  })
  // Best-effort release of an unread body so a rejected download does not keep
  // streaming in the background.
  const discardBody = async () => {
    try {
      await response.body?.cancel()
    } catch {
      // Ignored: the caller is about to throw the real error.
    }
  }
  if (!response.ok) {
    await discardBody()
    throw new Error('GitHub archive download failed')
  }
  const lengthHeader = response.headers.get('content-length')
  if (lengthHeader) {
    const contentLength = Number.parseInt(lengthHeader, 10)
    if (Number.isFinite(contentLength) && contentLength > maxZipBytes) {
      await discardBody()
      throw new Error('GitHub archive too large')
    }
  }
  const reader = response.body?.getReader()
  if (!reader) {
    // No stream available: fall back to buffering and check afterwards.
    const buffer = new Uint8Array(await response.arrayBuffer())
    if (buffer.byteLength > maxZipBytes) throw new Error('GitHub archive too large')
    return buffer
  }
  const chunks: Uint8Array[] = []
  let total = 0
  try {
    while (true) {
      const { done, value } = await reader.read()
      if (done) break
      if (!value) continue
      total += value.byteLength
      if (total > maxZipBytes) throw new Error('GitHub archive too large')
      chunks.push(value)
    }
  } catch (error) {
    // Stop the transfer before propagating; otherwise the connection stays open
    // and keeps delivering data nobody reads.
    try {
      await reader.cancel()
    } catch {
      // Ignored: the original error is the one worth reporting.
    }
    throw error
  }
  const out = new Uint8Array(total)
  let offset = 0
  for (const chunk of chunks) {
    out.set(chunk, offset)
    offset += chunk.byteLength
  }
  return out
}
/** Archive contents keyed by entry path, each mapped to that entry's raw bytes. */
export type ZipEntryMap = Record<string, Uint8Array>
/**
 * Builds a zip archive from in-memory text files; intended for tests.
 *
 * @param entries - Map of archive path to text content.
 * @returns The zipped bytes (compression level 1).
 */
export function buildGitHubZipForTests(entries: Record<string, string>) {
  const encoder = new TextEncoder()
  const encoded: Array<[string, Uint8Array]> = []
  for (const [path, text] of Object.entries(entries)) {
    encoded.push([path, encoder.encode(text)])
  }
  const archive = zipSync(Object.fromEntries(encoded), { level: 1 })
  return Uint8Array.from(archive)
}
/**
 * Removes the single top-level directory that wraps every entry of an archive.
 *
 * The map is returned unchanged when the entries do not all share the first
 * entry's top-level directory. The entry for the root directory itself is dropped.
 *
 * @param entries - Archive entries keyed by path.
 * @returns Entries re-keyed relative to the shared root, or `entries` itself.
 */
export function stripGitHubZipRoot(entries: ZipEntryMap): ZipEntryMap {
  const pairs = Object.entries(entries)
  if (pairs.length === 0) return {}

  const [[head = ''] = ['']] = pairs
  const slashAt = head.indexOf('/')
  const rootDir = slashAt === -1 ? head : head.slice(0, slashAt)
  if (rootDir === '') return entries

  const rootPrefix = rootDir + '/'
  for (const [name] of pairs) {
    if (!name.startsWith(rootPrefix)) return entries
  }

  const result: ZipEntryMap = {}
  for (const [name, bytes] of pairs) {
    const relative = name.slice(rootPrefix.length)
    if (relative !== '') result[relative] = bytes
  }
  return result
}
/**
 * Finds every skill file in an archive and describes the directory holding it.
 *
 * A skill file is an entry whose lowercased, normalized path equals or ends with
 * `/` plus one of `SKILL_FILENAMES`. Name and description come from the file's
 * frontmatter when they are non-empty strings.
 *
 * @param entries - Archive entries keyed by path.
 * @returns De-duplicated candidates sorted by directory path.
 */
export function detectGitHubImportCandidates(entries: ZipEntryMap): GitHubImportCandidate[] {
  const decoder = new TextDecoder()
  const nonEmptyString = (value: unknown): string | undefined =>
    typeof value === 'string' ? value.trim() || undefined : undefined

  const found: GitHubImportCandidate[] = []
  for (const [entryPath, bytes] of Object.entries(entries)) {
    const skillPath = normalizeRepoPath(entryPath)
    const lowered = skillPath.toLowerCase()
    const matchesSkillFile = SKILL_FILENAMES.some(
      (name) => lowered === name || lowered.endsWith(`/${name}`),
    )
    if (!matchesSkillFile) continue

    // Everything before the last slash is the containing directory.
    const lastSlash = skillPath.lastIndexOf('/')
    const dir = lastSlash === -1 ? '' : skillPath.slice(0, lastSlash)

    const frontmatter = parseFrontmatter(decoder.decode(bytes ?? new Uint8Array()))
    found.push({
      path: normalizeRepoPath(dir),
      readmePath: skillPath,
      name: nonEmptyString(frontmatter.name),
      description: nonEmptyString(frontmatter.description),
    })
  }
  return uniqCandidates(found)
}
/**
 * Drops candidates that repeat an earlier `path` + `readmePath` pair (first one
 * wins) and returns the remainder sorted by `path`.
 */
function uniqCandidates(candidates: GitHubImportCandidate[]) {
  const firstByKey = new Map<string, GitHubImportCandidate>()
  for (const entry of candidates) {
    const key = `${entry.path}::${entry.readmePath}`
    if (!firstByKey.has(key)) firstByKey.set(key, entry)
  }
  // Map iteration follows insertion order, so first-seen order feeds the sort.
  return [...firstByKey.values()].sort((left, right) => left.path.localeCompare(right.path))
}
/**
 * Lists the text files that live under a candidate directory.
 *
 * @param entries - Archive entries keyed by path.
 * @param candidatePath - Candidate directory; an empty path selects the whole archive.
 * @returns Matching files with normalized paths, sorted by path.
 */
export function listTextFilesUnderCandidate(
  entries: ZipEntryMap,
  candidatePath: string,
): Array<{ path: string; bytes: Uint8Array }> {
  const root = normalizeCandidateRoot(candidatePath)
  return Object.entries(entries)
    .map(([rawPath, bytes]) => ({ path: normalizeRepoPath(rawPath), bytes }))
    .filter((file) => isUnderRoot(file.path, root) && isTextPath(file.path))
    .sort((left, right) => left.path.localeCompare(right.path))
}
/**
 * Chooses the files to pre-select for import: the skill file plus every file
 * reachable from it through relative markdown links.
 *
 * Links are followed breadth-first through `.md` files only. Traversal stops
 * expanding at `maxDepth` and ends once `maxAdds` files are selected. Only files
 * present in `files` and located under the candidate directory are selected.
 *
 * @param params.candidate - Candidate whose skill file seeds the traversal.
 * @param params.files - Files available for selection.
 * @param params.maxDepth - Maximum link depth to expand (default 4).
 * @param params.maxAdds - Maximum number of selected files (default 200).
 * @returns Sorted list of selected paths.
 */
export function computeDefaultSelectedPaths(params: {
  candidate: GitHubImportCandidate
  files: Array<{ path: string; bytes: Uint8Array }>
  maxDepth?: number
  maxAdds?: number
}) {
  const { candidate, files } = params
  const maxDepth = params.maxDepth ?? 4
  const maxAdds = params.maxAdds ?? 200
  const byPath = new Map(files.map((file) => [file.path, file.bytes]))
  const candidateRoot = normalizeCandidateRoot(candidate.path)
  const decoder = new TextDecoder()

  // The set's size doubles as the count of files added so far.
  const selected = new Set<string>()
  const select = (path: string) => {
    const key = normalizeRepoPath(path)
    if (isUnderRoot(key, candidateRoot) && byPath.has(key)) selected.add(key)
  }
  select(candidate.readmePath)

  const visited = new Set<string>()
  // FIFO work list walked with a cursor; entries are appended while iterating.
  const pending: Array<{ path: string; depth: number }> = [
    { path: candidate.readmePath, depth: 0 },
  ]
  for (let cursor = 0; cursor < pending.length; cursor += 1) {
    const item = pending[cursor]
    if (!item) break
    if (item.depth >= maxDepth || visited.has(item.path)) continue
    visited.add(item.path)

    const bytes = byPath.get(item.path)
    if (!bytes || !item.path.toLowerCase().endsWith('.md')) continue

    for (const ref of extractMarkdownRelativeTargets(decoder.decode(bytes))) {
      if (selected.size >= maxAdds) break
      const target = resolveMarkdownTarget(item.path, ref)
      if (!target) continue
      select(target)
      if (target.toLowerCase().endsWith('.md') && byPath.has(target)) {
        pending.push({ path: target, depth: item.depth + 1 })
      }
    }
    if (selected.size >= maxAdds) break
  }
  return [...selected].sort()
}
/**
 * Turns the candidate's files into list rows carrying size and default selection.
 *
 * @param params.files - Files to list, in the order they should appear.
 * @param params.defaultSelectedPaths - Paths that start out selected.
 * @returns One entry per file, in input order.
 */
export function buildGitHubImportFileList(params: {
  candidate: GitHubImportCandidate
  files: Array<{ path: string; bytes: Uint8Array }>
  defaultSelectedPaths: string[]
}): GitHubImportFileEntry[] {
  const preselected = new Set(params.defaultSelectedPaths)
  const rows: GitHubImportFileEntry[] = []
  for (const { path, bytes } of params.files) {
    rows.push({ path, size: bytes.byteLength, defaultSelected: preselected.has(path) })
  }
  return rows
}
/**
 * Normalizes a repository-relative path: strips leading slashes and surrounding
 * whitespace and collapses empty segments.
 *
 * @param path - Path to normalize.
 * @returns The cleaned path, or an empty string when nothing remains or the path
 *   contains a backslash or `..` anywhere.
 */
export function normalizeRepoPath(path: string) {
  const body = path.replace(/^\/+/, '').trim()
  if (body.length === 0) return ''

  const segments: string[] = []
  for (const segment of body.split('/')) {
    if (segment !== '') segments.push(segment)
  }
  const joined = segments.join('/')

  const unsafe = joined === '' || joined.indexOf('\\') !== -1 || joined.indexOf('..') !== -1
  return unsafe ? '' : joined
}
/**
 * Normalizes a candidate directory into a prefix ending in `/`.
 *
 * @param candidatePath - Candidate directory path.
 * @returns The normalized path followed by `/`, or an empty string when the
 *   normalized path is empty.
 */
export function normalizeCandidateRoot(candidatePath: string) {
  const root = normalizeRepoPath(candidatePath)
  if (root === '') return ''
  return `${root}/`
}
/**
 * Tells whether `path` is the root directory itself or lies beneath it.
 * An empty root matches every path.
 *
 * @param path - Normalized path to test.
 * @param rootWithSlash - Root prefix ending in `/`, or an empty string.
 */
function isUnderRoot(path: string, rootWithSlash: string) {
  if (rootWithSlash.length === 0) return true
  if (path.startsWith(rootWithSlash)) return true
  // The root directory itself, written without its trailing slash.
  return path === rootWithSlash.slice(0, -1)
}
/**
 * Tells whether the text after the last `.` of the lowercased path (the whole
 * path when it has no dot) is a member of `TEXT_FILE_EXTENSION_SET`.
 */
function isTextPath(path: string) {
  const lowered = path.toLowerCase()
  // lastIndexOf yields -1 without a dot, so slice(0) keeps the whole string.
  const ext = lowered.slice(lowered.lastIndexOf('.') + 1)
  return ext !== '' && TEXT_FILE_EXTENSION_SET.has(ext)
}
/**
 * Suggests a human-readable title for a candidate.
 *
 * Uses the candidate's name, falling back to `fallbackBase`. Hyphens and
 * underscores become spaces, whitespace runs collapse, surrounding spaces are
 * removed, and the first letter of each word is upper-cased.
 *
 * @param candidate - Candidate whose `name` is preferred.
 * @param fallbackBase - Used when the candidate has no usable name.
 * @returns The suggested title, or an empty string when no base text is available.
 */
export function suggestDisplayName(candidate: GitHubImportCandidate, fallbackBase: string) {
  const base = candidate.name?.trim() || fallbackBase.trim()
  if (!base) return ''
  return (
    base
      .replace(/[-_]+/g, ' ')
      .replace(/\s+/g, ' ')
      // Leading/trailing separators would otherwise leave stray spaces.
      .trim()
      // Upper-case a letter only at the start of a word. Word characters are matched
      // with Unicode classes so accented letters do not split a word ("éclair"), and a
      // letter following an in-word apostrophe is left alone ("don't").
      .replace(
        /(?<![\p{L}\p{N}\p{M}]|[\p{L}\p{N}\p{M}]['’])\p{L}/gu,
        (char) => char.toUpperCase(),
      )
  )
}
/**
 * Suggests the next version to publish.
 *
 * @param latestVersion - Most recent published version, if any.
 * @returns The patch increment of `latestVersion` when it is valid semver,
 *   otherwise `0.1.0`.
 */
export function suggestVersion(latestVersion?: string | null) {
  const initial = '0.1.0'
  const current = latestVersion?.trim() ?? ''
  if (current === '' || !semver.valid(current)) return initial
  return semver.inc(current, 'patch') ?? initial
}
/**
 * Collects the destinations of markdown links and images that are not
 * `http:`, `https:` or `mailto:` URLs and not pure `#fragment` links.
 *
 * A destination wrapped in angle brackets is taken whole (it may contain spaces),
 * whether or not a title follows it. Otherwise the destination is the text up to
 * the first whitespace, which drops an optional title.
 *
 * @param markdown - Markdown source text.
 * @returns Destinations in document order; duplicates are kept.
 */
export function extractMarkdownRelativeTargets(markdown: string): string[] {
  const out: string[] = []
  const pattern = /!?\[[^\]]*]\(([^)]+)\)/g
  for (const match of markdown.matchAll(pattern)) {
    const raw = (match[1] ?? '').trim()
    if (!raw) continue
    let target: string
    // `<destination>` optionally followed by a title: keep exactly what is inside
    // the brackets instead of cutting at the first space.
    const angled = /^<([^<>]*)>/.exec(raw)
    if (angled) {
      target = (angled[1] ?? '').trim()
    } else {
      const cleaned = raw.replace(/^<|>$/g, '').trim()
      target = cleaned.split(/\s+/)[0] ?? ''
    }
    if (!target) continue
    if (target.startsWith('#')) continue
    const lower = target.toLowerCase()
    if (lower.startsWith('http:') || lower.startsWith('https:')) continue
    if (lower.startsWith('mailto:')) continue
    out.push(target)
  }
  return out
}
/**
 * Resolves a relative markdown link against the file that contains it.
 *
 * The fragment and query are dropped first. Absolute targets (leading `/`) and
 * targets containing a backslash or `..` are rejected.
 *
 * @param fromPath - Path of the markdown file holding the link.
 * @param target - Link destination as written in the markdown.
 * @returns The normalized repository path, or `null` when the target is empty,
 *   rejected, or normalizes to nothing.
 */
export function resolveMarkdownTarget(fromPath: string, target: string) {
  const [beforeHash = ''] = target.split('#')
  const [beforeQuery = ''] = beforeHash.split('?')
  const relative = beforeQuery.trim()

  const rejected =
    relative === '' ||
    relative.startsWith('/') ||
    relative.includes('\\') ||
    relative.includes('..')
  if (rejected) return null

  // Directory of the source file: every segment except the file name.
  const baseDir = normalizeRepoPath(fromPath).split('/').slice(0, -1)

  const parts: string[] = []
  for (const part of [...baseDir, ...relative.split('/')]) {
    if (part === '' || part === '.') continue
    if (part === '..') return null
    parts.push(part)
  }
  return normalizeRepoPath(parts.join('/')) || null
}