clawhub/scripts/security-dataset/normalize.ts

import { createHash } from "node:crypto";

/** Kind of source document an exported artifact came from. */
export type SourceKind = "skill" | "package";
/** Normalized security label shared by scan-result rows and label rows. */
export type DatasetLabel = "clean" | "suspicious" | "malicious" | "unknown";
/** Dataset partition names returned by `assignSplit`. */
export type DatasetSplit = "train" | "validation" | "test" | "eval_holdout";
/** Identifies which scanner produced a scan-result row. */
export type ScannerName = "static" | "virustotal" | "llm" | "moderation_consensus";

/** Metadata for one file contained in an exported artifact. */
export type ExportFileInput = {
  /** File path; the segment after the last `/` supplies the extension tally. */
  path: string;
  /** File size; summed across all files into the artifact row's `total_bytes`. */
  size: number;
  // Not read by the normalization code visible in this file.
  sha256: string;
  // Not read by the normalization code visible in this file.
  contentType: string | null;
};

/**
 * VirusTotal analysis attached to an artifact export.
 *
 * Label derivation in this file: a positive `engineStats.malicious` count wins, then a
 * positive `engineStats.suspicious` count; otherwise the text of `verdict` (or `status`
 * when `verdict` is null) is keyword-matched.
 */
export type VtAnalysisInput = {
  /** Copied verbatim into the scan-result row's `status`. */
  status: string;
  /** Copied verbatim into the scan-result row's `verdict`. */
  verdict: string | null;
  /** Free-text analysis; passed through `redactText` before it reaches any output row. */
  analysis: string | null;
  // Not read by the normalization code visible in this file.
  source: string | null;
  /** Copied into the scan-result row as `scanner_version`. */
  scanner: string | null;
  /** Per-category engine counts; copied unchanged into the scan-result row's `engine_stats`. */
  engineStats: {
    malicious?: number;
    suspicious?: number;
    undetected?: number;
    harmless?: number;
  } | null;
  /** Copied into the scan-result row's `checked_at`. */
  checkedAt: number;
};

/** Result of the static scanner for one artifact. */
export type StaticScanInput = {
  /** Scanner outcome; emitted as both `status` and `verdict` of the static scan-result row. */
  status: DatasetLabel;
  /** Reason codes; emitted as a sorted copy on the static scan-result and label rows. */
  reasonCodes: string[];
  /** Individual findings; each one becomes a `StaticFindingRow`. */
  findings: Array<{
    code: string;
    severity: "info" | "warn" | "critical";
    /** File path; only its SHA-256 hash and extension are emitted. */
    file: string;
    /** Line number; emitted only as a coarse bucket. */
    line: number;
    /** Emitted verbatim (not passed through `redactText`). */
    message: string;
    /** Passed through `redactText` before being emitted. */
    evidence: string;
  }>;
  /** Free-text summary; passed through `redactText` before being emitted. */
  summary: string;
  /** Copied into the scan-result row as `scanner_version`. */
  engineVersion: string;
  /** Copied into the scan-result row's `checked_at`. */
  checkedAt: number;
};

/**
 * LLM-based analysis attached to an artifact export.
 *
 * The label is keyword-matched from `verdict`, falling back to `status` only when
 * `verdict` is null or undefined.
 */
export type LlmAnalysisInput = {
  status: string;
  verdict: string | null;
  /** Copied to the scan-result row; on the label row, `"scanner"` is used when this is null. */
  confidence: string | null;
  /** Preferred source of the redacted summary/notes text. */
  summary: string | null;
  // Not read by the normalization code visible in this file.
  dimensions: Array<{
    name: string;
    label: string;
    rating: string;
    detail: string;
  }> | null;
  // Not read by the normalization code visible in this file.
  guidance: string | null;
  /** Used for the redacted summary/notes text only when `summary` is null or undefined. */
  findings: string | null;
  /** Copied into the scan-result row's `model`. */
  model: string | null;
  /** Copied into the scan-result row's `checked_at`. */
  checkedAt: number;
};

/** Explicit moderation consensus for an artifact, if one was recorded. */
export type ModerationConsensusInput = {
  /** Consensus label; when null (or otherwise falsy) no consensus scan-result or label row is emitted from this object. */
  verdict: DatasetLabel | null;
  /** Reason codes; emitted as a sorted copy. */
  reasonCodes: string[];
  /** Free-text summary; passed through `redactText` before being emitted. */
  summary: string | null;
  /** Copied into the scan-result row as `scanner_version`. */
  engineVersion: string | null;
  /** Copied into the scan-result row's `checked_at`. */
  evaluatedAt: number | null;
};

/** One exported artifact (a skill version or package release) with its scan results. */
export type ArtifactExportInput = {
  /** Selects the `source_table` value: `"skill"` maps to `skillVersions`, anything else to `packageReleases`. */
  sourceKind: SourceKind;
  /** Raw source document id; only hashed forms of it are written to output rows. */
  sourceDocId: string;
  /** Raw parent document id; emitted only as a SHA-256 hash. */
  parentDocId: string;
  publicName: string;
  publicSlug: string | null;
  version: string;
  /** Content hash; preferred basis for the artifact id and the split key. */
  artifactSha256: string | null;
  /** Skill markdown; run through `redactSkillContent` and emitted only when `sourceKind` is `"skill"` and this is non-empty. */
  skillMdContentRedacted?: string | null;
  /** Creation time; passed to `new Date(...)`, so it is interpreted as epoch milliseconds. */
  createdAt: number;
  /** `null` means the artifact is not soft-deleted (and is therefore reported as public). */
  softDeletedAt: number | null;
  files: ExportFileInput[];
  /** Emitted as a sorted copy. */
  capabilityTags: string[];
  packageFamily: string | null;
  packageChannel: string | null;
  packageExecutesCode: boolean | null;
  sourceRepoHost: string | null;
  vtAnalysis: VtAnalysisInput | null;
  staticScan: StaticScanInput | null;
  llmAnalysis: LlmAnalysisInput | null;
  moderationConsensus: ModerationConsensusInput | null;
};

/** Output row describing one artifact; built by `buildArtifactRow`. */
export type ArtifactRow = {
  /** Id produced by `buildArtifactId`; shared by every row derived from the same input. */
  artifact_id: string;
  source_kind: SourceKind;
  /** `"skillVersions"` when `source_kind` is `"skill"`, otherwise `"packageReleases"`. */
  source_table: "skillVersions" | "packageReleases";
  /** SHA-256 hex digest of the raw source document id. */
  source_doc_id_hash: string;
  /** SHA-256 hex digest of the raw parent document id. */
  parent_doc_id_hash: string;
  public_name: string;
  public_slug: string | null;
  version: string;
  artifact_sha256: string | null;
  /** Present only for skills whose input carried non-empty markdown content. */
  skill_md_content_redacted?: string | null;
  created_at: number;
  /** First seven characters of the ISO-8601 form of `created_at` (`YYYY-MM` for ordinary dates). */
  created_month: string;
  /** True when the input's `softDeletedAt` was not null. */
  soft_deleted: boolean;
  /** True when the input's `softDeletedAt` was null. */
  is_public: boolean;
  file_count: number;
  /** Sum of the `size` of every file. */
  total_bytes: number;
  /** File count per lower-cased extension (leading dot included; `""` when a file has none), keys sorted. */
  file_ext_counts: Record<string, number>;
  capability_tags: string[];
  package_family: string | null;
  package_channel: string | null;
  package_executes_code: boolean | null;
  source_repo_host: string | null;
  /** True when the input's `vtAnalysis` was not null. */
  has_vt_scan: boolean;
  /** True when the input's `staticScan` was not null. */
  has_static_scan: boolean;
  /** True when the input's `llmAnalysis` was not null. */
  has_llm_scan: boolean;
};

/** Output row holding one scanner's result for one artifact. */
export type ScanResultRow = {
  artifact_id: string;
  scanner: ScannerName;
  /** Engine version for static/consensus rows, the VirusTotal `scanner` field for VT rows, null for LLM rows. */
  scanner_version: string | null;
  /** Populated only on LLM rows. */
  model: string | null;
  status: string;
  verdict: string | null;
  /** LLM confidence on LLM rows, `"consensus"` on consensus rows, null otherwise. */
  confidence: string | null;
  checked_at: number | null;
  /** Sorted reason codes for static/consensus rows; empty for VT and LLM rows. */
  reason_codes: string[];
  /** Populated only on VirusTotal rows. */
  engine_stats: VtAnalysisInput["engineStats"];
  /** Scanner summary text after `redactText`. */
  summary_redacted: string | null;
  /** The scanner's result mapped onto the shared `DatasetLabel` vocabulary. */
  raw_status_family: DatasetLabel;
};

/** Output row for a single static-scan finding. */
export type StaticFindingRow = {
  artifact_id: string;
  /** `<artifact_id>:static:<index>:<12 hex chars>`; the suffix hashes code, file, line and message. */
  finding_id: string;
  code: string;
  severity: "info" | "warn" | "critical";
  /** SHA-256 hex digest of the finding's file path (the raw path is not emitted). */
  file_path_hash: string;
  /** Lower-cased extension of the finding's file, or `""`. */
  file_ext: string;
  /** Coarse line-number range such as `"1-20"` or `"251+"`. */
  line_bucket: string;
  /** Finding message, copied verbatim from the scanner output. */
  message: string;
  /** Finding evidence after `redactText`; `""` when redaction yields null. */
  evidence_redacted: string;
};

/** Output row assigning a label to an artifact from one label source. */
export type LabelRow = {
  artifact_id: string;
  label: DatasetLabel;
  /** `"moderation_consensus"` is used both for explicit verdicts and for consensus derived from scanner labels. */
  label_source: "static_scan" | "virustotal" | "llm_scan" | "moderation_consensus";
  /** `"scanner"`, the LLM's own confidence, `"consensus"`, or `"derived_consensus"`. */
  label_confidence: string;
  reason_codes: string[];
  /** Number of scanner labels (static, VirusTotal, LLM) for this artifact equal to `label`. */
  scanner_agreement: number;
  /** Source summary after `redactText`; null on derived-consensus rows. */
  notes_redacted: string | null;
};

/** Output row recording which dataset split an artifact belongs to. */
export type SplitRow = {
  artifact_id: string;
  split: DatasetSplit;
  /** Version tag of the split scheme that produced `split`. */
  split_version: string;
  /** SHA-256 hex digest of the key that was bucketed (the raw key is not emitted). */
  split_key: string;
};

/** All row collections produced by `normalizeArtifactExport`, one array per output table. */
export type NormalizedDatasetRows = {
  artifacts: ArtifactRow[];
  scanResults: ScanResultRow[];
  staticFindings: StaticFindingRow[];
  labels: LabelRow[];
  splits: SplitRow[];
};

/** Version tag written to every split row's `split_version`. */
const SPLIT_VERSION = "sha256-v1";
/** Default `maxLength` argument of `redactText`. */
const MAX_REDACTED_TEXT_LENGTH = 240;
/** `maxLength` that `redactSkillContent` passes to `redactText`. */
const MAX_REDACTED_SKILL_CONTENT_LENGTH = 120_000;

/**
 * Patterns whose matches `redactText` replaces with `[REDACTED_SECRET]`, applied in array
 * order. All carry the `g` flag so that every occurrence is replaced.
 */
const SECRET_PATTERNS: RegExp[] = [
  // Email addresses.
  /\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,}\b/gi,
  // GitHub-style tokens: `ghp_`, `gho_`, `ghu_`, `ghs_` or `ghr_` followed by 20+ token characters.
  /\bgh[pousr]_[A-Za-z0-9_]{20,}\b/g,
  // `sk-`-prefixed API keys with 20+ following characters.
  /\bsk-[A-Za-z0-9_-]{20,}\b/g,
  // `AKIA` followed by 16 uppercase alphanumerics (the AWS access key id shape).
  /\bAKIA[0-9A-Z]{16}\b/g,
  // `<secret keyword> : value` or `= value` assignments whose value is at least 6 characters long.
  /\b(?:api[_-]?key|token|secret|password|passwd|pwd|authorization code|auth code)\s*[:=]\s*["']?[^"',\s;)`]{6,}/gi,
  // `authorization` / `x-api-key` headers: optional `bearer`/`basic`, whitespace, then 12+ token characters.
  /\b(?:authorization|x-api-key)\s*[:=]\s*["']?(?:bearer|basic)?\s+[A-Za-z0-9._~+/=-]{12,}/gi,
  // Complete PEM blocks (private keys and certificates), from the BEGIN line through the END line.
  /-----BEGIN [A-Z0-9 ]*(?:PRIVATE KEY|CERTIFICATE)-----[\s\S]*?-----END [A-Z0-9 ]*(?:PRIVATE KEY|CERTIFICATE)-----/g,
  // http(s) URLs that embed `user:password@` credentials.
  /\bhttps?:\/\/[^/\s:@]+:[^/\s@]+@[^\s)'"`]+/gi,
  // Quoted runs of 32+ base64/URL-safe characters.
  // NOTE(review): the three `.*` lookaheads scan the remainder of the text, not just the
  // quoted token, so the upper/lower/digit requirement can be met by later characters.
  /(["'`])(?=[A-Za-z0-9+/=_-]{32,}\1)(?=.*[A-Z])(?=.*[a-z])(?=.*\d)[A-Za-z0-9+/=_-]+\1/g,
];

/**
 * Computes the SHA-256 digest of a string.
 *
 * @param value - Text to hash.
 * @returns The digest as a lowercase hexadecimal string.
 */
export function hashString(value: string) {
  const hasher = createHash("sha256");
  hasher.update(value);
  return hasher.digest("hex");
}

/**
 * Produces a single-line, secret-free, length-capped version of free text.
 *
 * Steps, in order: control characters (code units below 32, and 127) become spaces; every
 * `SECRET_PATTERNS` match becomes `[REDACTED_SECRET]`; whitespace runs collapse to one space
 * and the ends are trimmed; finally the text is truncated with a trailing `...` if needed.
 *
 * @param value - Text to redact. `null`, `undefined` and `""` yield `null`.
 * @param maxLength - Upper bound on the returned string's length, truncation marker included.
 * @returns The redacted text (never longer than `maxLength`), or `null` for empty input.
 */
export function redactText(value: string | null | undefined, maxLength = MAX_REDACTED_TEXT_LENGTH) {
  if (!value) return null;
  let redacted = "";
  for (let index = 0; index < value.length; index += 1) {
    const code = value.charCodeAt(index);
    // Newlines and tabs become spaces too, so multi-line secrets are matched on one line.
    redacted += code < 32 || code === 127 ? " " : value.charAt(index);
  }
  for (const pattern of SECRET_PATTERNS) {
    redacted = redacted.replace(pattern, "[REDACTED_SECRET]");
  }
  redacted = redacted.replace(/\s+/g, " ").trim();
  if (redacted.length <= maxLength) return redacted;

  const marker = "...";
  // A cap too small to hold any text plus the marker gets as much of the marker as fits.
  if (maxLength <= marker.length) return marker.slice(0, Math.max(0, maxLength));

  // Reserve room for the whole marker so the result respects `maxLength`.
  let kept = redacted.slice(0, maxLength - marker.length);
  const lastUnit = kept.charCodeAt(kept.length - 1);
  // Do not leave half of a surrogate pair at the cut point.
  if (lastUnit >= 0xd800 && lastUnit <= 0xdbff) kept = kept.slice(0, -1);
  return `${kept}${marker}`;
}

/**
 * Redacts skill markdown with the same rules as `redactText`, using the larger
 * skill-content length cap instead of the default one.
 *
 * @param value - Skill content to redact.
 * @returns Whatever `redactText` returns for `value` under the skill-content cap.
 */
export function redactSkillContent(value: string | null | undefined) {
  const lengthCap = MAX_REDACTED_SKILL_CONTENT_LENGTH;
  return redactText(value, lengthCap);
}

/**
 * Converts exported artifacts into the flat row collections of the dataset.
 *
 * Inputs are processed in order; each contributes one artifact row, one split row, and
 * zero or more scan-result, static-finding and label rows, all sharing one artifact id.
 *
 * @param inputs - Exported artifacts to normalize.
 * @returns The accumulated rows, grouped per output table.
 */
export function normalizeArtifactExport(inputs: ArtifactExportInput[]): NormalizedDatasetRows {
  const dataset: NormalizedDatasetRows = {
    artifacts: [],
    scanResults: [],
    staticFindings: [],
    labels: [],
    splits: [],
  };

  for (const entry of inputs) {
    const id = buildArtifactId(entry);
    dataset.artifacts.push(buildArtifactRow(entry, id));
    for (const scan of buildScanResultRows(entry, id)) dataset.scanResults.push(scan);
    for (const finding of buildStaticFindingRows(entry, id)) dataset.staticFindings.push(finding);
    for (const label of buildLabelRows(entry, id)) dataset.labels.push(label);
    dataset.splits.push(buildSplitRow(entry, id));
  }

  return dataset;
}

/**
 * Derives the identifier that ties together all rows of one artifact.
 *
 * @param input - Exported artifact.
 * @returns `<sourceKind>:<trimmed artifactSha256>` when a non-blank content hash exists,
 *   otherwise `<sourceKind>:doc:<first 24 hex chars of the hashed sourceDocId>`.
 */
export function buildArtifactId(input: ArtifactExportInput) {
  const { sourceKind, artifactSha256, sourceDocId } = input;
  const contentHash = artifactSha256?.trim();
  if (!contentHash) {
    // No usable content hash: fall back to a shortened digest of the document id.
    const docDigest = hashString(sourceDocId).slice(0, 24);
    return `${sourceKind}:doc:${docDigest}`;
  }
  return `${sourceKind}:${contentHash}`;
}

/**
 * Deterministically maps a split key to a dataset split.
 *
 * The first 8 hex digits of the key's SHA-256 digest are scaled into a fraction of
 * `0xffffffff` and compared against cumulative thresholds: below 0.7 is `train`, below
 * 0.85 is `validation`, below 0.95 is `test`, and everything else is `eval_holdout`.
 *
 * @param splitKey - Key identifying the artifact for splitting purposes.
 * @returns The split the key falls into.
 */
export function assignSplit(splitKey: string): DatasetSplit {
  const leadingHex = hashString(splitKey).slice(0, 8);
  const fraction = Number.parseInt(leadingHex, 16) / 0xffffffff;

  const upperBounds: Array<[number, DatasetSplit]> = [
    [0.7, "train"],
    [0.85, "validation"],
    [0.95, "test"],
  ];
  for (const [upperBound, split] of upperBounds) {
    if (fraction < upperBound) return split;
  }
  return "eval_holdout";
}

/**
 * Builds the artifact-level row for one exported artifact.
 *
 * Raw document ids are emitted only as hashes, capability tags as a sorted copy, and the
 * skill markdown (re-redacted) only for skills that actually carry content.
 *
 * @param input - Exported artifact.
 * @param artifactId - Identifier already derived for this artifact.
 * @returns The artifact row.
 */
function buildArtifactRow(input: ArtifactExportInput, artifactId: string): ArtifactRow {
  const isSkill = input.sourceKind === "skill";
  const notDeleted = input.softDeletedAt === null;

  let totalBytes = 0;
  for (const file of input.files) {
    totalBytes = totalBytes + file.size;
  }

  const sortedTags = input.capabilityTags
    .slice()
    .sort((left, right) => left.localeCompare(right));

  // Spread in place below so the optional key keeps its position in the row.
  const skillContent: Pick<ArtifactRow, "skill_md_content_redacted"> =
    isSkill && input.skillMdContentRedacted
      ? { skill_md_content_redacted: redactSkillContent(input.skillMdContentRedacted) }
      : {};

  return {
    artifact_id: artifactId,
    source_kind: input.sourceKind,
    source_table: isSkill ? "skillVersions" : "packageReleases",
    source_doc_id_hash: hashString(input.sourceDocId),
    parent_doc_id_hash: hashString(input.parentDocId),
    public_name: input.publicName,
    public_slug: input.publicSlug,
    version: input.version,
    artifact_sha256: input.artifactSha256,
    ...skillContent,
    created_at: input.createdAt,
    created_month: createdMonth(input.createdAt),
    soft_deleted: !notDeleted,
    is_public: notDeleted,
    file_count: input.files.length,
    total_bytes: totalBytes,
    file_ext_counts: countFileExtensions(input.files),
    capability_tags: sortedTags,
    package_family: input.packageFamily,
    package_channel: input.packageChannel,
    package_executes_code: input.packageExecutesCode,
    source_repo_host: input.sourceRepoHost,
    has_vt_scan: input.vtAnalysis !== null,
    has_static_scan: input.staticScan !== null,
    has_llm_scan: input.llmAnalysis !== null,
  };
}

/**
 * Builds one scan-result row per scanner result present on the input.
 *
 * Rows are emitted in a fixed order: static scan, VirusTotal, LLM, then moderation
 * consensus (only when it carries a verdict). Free-text summaries are always redacted.
 *
 * @param input - Exported artifact.
 * @param artifactId - Identifier already derived for this artifact.
 * @returns Zero to four scan-result rows.
 */
function buildScanResultRows(input: ArtifactExportInput, artifactId: string): ScanResultRow[] {
  const { staticScan, vtAnalysis, llmAnalysis, moderationConsensus } = input;
  const sortedCodes = (codes: string[]) =>
    codes.slice().sort((left, right) => left.localeCompare(right));
  const collected: ScanResultRow[] = [];

  if (staticScan) {
    // The static scanner's status doubles as its verdict.
    const { status } = staticScan;
    collected.push({
      artifact_id: artifactId,
      scanner: "static",
      scanner_version: staticScan.engineVersion,
      model: null,
      status,
      verdict: status,
      confidence: null,
      checked_at: staticScan.checkedAt,
      reason_codes: sortedCodes(staticScan.reasonCodes),
      engine_stats: null,
      summary_redacted: redactText(staticScan.summary),
      raw_status_family: normalizeLabel(status),
    });
  }

  if (vtAnalysis) {
    const family = labelFromVirusTotal(vtAnalysis);
    collected.push({
      artifact_id: artifactId,
      scanner: "virustotal",
      scanner_version: vtAnalysis.scanner,
      model: null,
      status: vtAnalysis.status,
      verdict: vtAnalysis.verdict,
      confidence: null,
      checked_at: vtAnalysis.checkedAt,
      reason_codes: [],
      engine_stats: vtAnalysis.engineStats,
      summary_redacted: redactText(vtAnalysis.analysis),
      raw_status_family: family,
    });
  }

  if (llmAnalysis) {
    const verdictText = llmAnalysis.verdict ?? llmAnalysis.status;
    const family = labelFromText(verdictText);
    collected.push({
      artifact_id: artifactId,
      scanner: "llm",
      scanner_version: null,
      model: llmAnalysis.model,
      status: llmAnalysis.status,
      verdict: llmAnalysis.verdict,
      confidence: llmAnalysis.confidence,
      checked_at: llmAnalysis.checkedAt,
      reason_codes: [],
      engine_stats: null,
      summary_redacted: redactText(llmAnalysis.summary ?? llmAnalysis.findings),
      raw_status_family: family,
    });
  }

  const consensusVerdict = moderationConsensus?.verdict;
  if (moderationConsensus && consensusVerdict) {
    collected.push({
      artifact_id: artifactId,
      scanner: "moderation_consensus",
      scanner_version: moderationConsensus.engineVersion,
      model: null,
      status: consensusVerdict,
      verdict: consensusVerdict,
      confidence: "consensus",
      checked_at: moderationConsensus.evaluatedAt,
      reason_codes: sortedCodes(moderationConsensus.reasonCodes),
      engine_stats: null,
      summary_redacted: redactText(moderationConsensus.summary),
      raw_status_family: consensusVerdict,
    });
  }

  return collected;
}

/**
 * Builds one row per static-scan finding, or none when there is no static scan.
 *
 * File paths are emitted only as a hash plus extension, line numbers only as a bucket,
 * and evidence only after redaction. The finding id combines the artifact id, the
 * finding's position, and a short hash of its code, file, line and message.
 *
 * @param input - Exported artifact.
 * @param artifactId - Identifier already derived for this artifact.
 * @returns The finding rows, in the scanner's original order.
 */
function buildStaticFindingRows(
  input: ArtifactExportInput,
  artifactId: string,
): StaticFindingRow[] {
  const findings = input.staticScan?.findings ?? [];

  const toRow = (
    finding: StaticScanInput["findings"][number],
    position: number,
  ): StaticFindingRow => {
    const fingerprintSource = `${finding.code}:${finding.file}:${finding.line}:${finding.message}`;
    const fingerprint = hashString(fingerprintSource).slice(0, 12);
    const redactedEvidence = redactText(finding.evidence);
    return {
      artifact_id: artifactId,
      finding_id: `${artifactId}:static:${position}:${fingerprint}`,
      code: finding.code,
      severity: finding.severity,
      file_path_hash: hashString(finding.file),
      file_ext: fileExtension(finding.file),
      line_bucket: lineBucket(finding.line),
      message: finding.message,
      evidence_redacted: redactedEvidence ?? "",
    };
  };

  return findings.map(toRow);
}

/**
 * Builds the label rows for one artifact.
 *
 * One row is emitted per scanner present (static, VirusTotal, LLM). A moderation consensus
 * row follows: the explicit verdict when the input has one, otherwise a consensus derived
 * from the scanner labels (skipped when that consensus is `unknown`). Every row's
 * `scanner_agreement` is the number of scanner labels equal to that row's label.
 *
 * @param input - Exported artifact.
 * @param artifactId - Identifier already derived for this artifact.
 * @returns The label rows, scanner rows first and the consensus row (if any) last.
 */
function buildLabelRows(input: ArtifactExportInput, artifactId: string): LabelRow[] {
  // Always hand out a sorted copy so output rows never alias or reorder the input arrays.
  const sortedCodes = (codes: readonly string[]) =>
    [...codes].sort((a, b) => a.localeCompare(b));

  const scannerLabels: DatasetLabel[] = [];
  const rows: LabelRow[] = [];
  if (input.staticScan) {
    const label = normalizeLabel(input.staticScan.status);
    scannerLabels.push(label);
    rows.push({
      artifact_id: artifactId,
      label,
      label_source: "static_scan",
      label_confidence: "scanner",
      reason_codes: sortedCodes(input.staticScan.reasonCodes),
      scanner_agreement: 0,
      notes_redacted: redactText(input.staticScan.summary),
    });
  }
  if (input.vtAnalysis) {
    const label = labelFromVirusTotal(input.vtAnalysis);
    scannerLabels.push(label);
    rows.push({
      artifact_id: artifactId,
      label,
      label_source: "virustotal",
      label_confidence: "scanner",
      reason_codes: [],
      scanner_agreement: 0,
      notes_redacted: redactText(input.vtAnalysis.analysis),
    });
  }
  if (input.llmAnalysis) {
    const label = labelFromText(input.llmAnalysis.verdict ?? input.llmAnalysis.status);
    scannerLabels.push(label);
    rows.push({
      artifact_id: artifactId,
      label,
      label_source: "llm_scan",
      label_confidence: input.llmAnalysis.confidence ?? "scanner",
      reason_codes: [],
      scanner_agreement: 0,
      notes_redacted: redactText(input.llmAnalysis.summary ?? input.llmAnalysis.findings),
    });
  }

  if (input.moderationConsensus?.verdict) {
    rows.push({
      artifact_id: artifactId,
      label: input.moderationConsensus.verdict,
      label_source: "moderation_consensus",
      label_confidence: "consensus",
      reason_codes: sortedCodes(input.moderationConsensus.reasonCodes),
      scanner_agreement: 0,
      notes_redacted: redactText(input.moderationConsensus.summary),
    });
  } else {
    // Without an explicit verdict, fall back to what the scanners collectively say.
    const consensus = consensusLabel(scannerLabels);
    if (consensus !== "unknown") {
      rows.push({
        artifact_id: artifactId,
        label: consensus,
        label_source: "moderation_consensus",
        label_confidence: "derived_consensus",
        reason_codes: sortedCodes(input.staticScan?.reasonCodes ?? []),
        scanner_agreement: 0,
        notes_redacted: null,
      });
    }
  }

  // All scanner labels are known only now, so agreement is filled in as a final pass; the
  // zeros written above are placeholders.
  return rows.map((row) => ({
    ...row,
    scanner_agreement: countAgreement(scannerLabels, row.label),
  }));
}

/**
 * Builds the split-assignment row for one artifact.
 *
 * The split key is the trimmed content hash when one exists, otherwise
 * `<sourceKind>:<sourceDocId>` — the same precedence `buildArtifactId` applies. Only the
 * SHA-256 of the key is emitted, never the key itself.
 *
 * @param input - Exported artifact.
 * @param artifactId - Identifier already derived for this artifact.
 * @returns The split row.
 */
function buildSplitRow(input: ArtifactExportInput, artifactId: string): SplitRow {
  const contentHash = input.artifactSha256?.trim();
  // `||` rather than `??`: an empty or blank hash must fall back to the document key,
  // otherwise every hash-less artifact would share one split key.
  const splitKey = contentHash || `${input.sourceKind}:${input.sourceDocId}`;
  return {
    artifact_id: artifactId,
    split: assignSplit(splitKey),
    split_version: SPLIT_VERSION,
    split_key: hashString(splitKey),
  };
}

/**
 * Collapses several scanner labels into one, letting the most severe label win.
 *
 * @param labels - Labels reported by the individual scanners.
 * @returns `malicious` if any label is malicious, else `suspicious` if any is suspicious,
 *   else `clean` only when every label is clean; `unknown` otherwise (including no labels).
 */
function consensusLabel(labels: DatasetLabel[]): DatasetLabel {
  if (labels.length === 0) return "unknown";

  let sawSuspicious = false;
  let allClean = true;
  for (const current of labels) {
    if (current === "malicious") return "malicious";
    if (current === "suspicious") sawSuspicious = true;
    if (current !== "clean") allClean = false;
  }

  if (sawSuspicious) return "suspicious";
  return allClean ? "clean" : "unknown";
}

/**
 * Counts how many entries of `labels` are exactly `label`.
 *
 * @param labels - Labels to inspect.
 * @param label - Label to count.
 * @returns Number of matching entries.
 */
function countAgreement(labels: DatasetLabel[], label: DatasetLabel) {
  let matches = 0;
  for (const candidate of labels) {
    if (candidate === label) matches += 1;
  }
  return matches;
}

/**
 * Derives a dataset label from a VirusTotal analysis.
 *
 * Engine counts take precedence over text: any malicious detection yields `malicious`,
 * otherwise any suspicious detection yields `suspicious`. With neither, the verdict text
 * (or the status when the verdict is null/undefined) is keyword-matched.
 *
 * @param analysis - VirusTotal analysis to label.
 * @returns The derived label.
 */
function labelFromVirusTotal(analysis: VtAnalysisInput): DatasetLabel {
  const stats = analysis.engineStats;
  const maliciousHits = stats?.malicious ?? 0;
  const suspiciousHits = stats?.suspicious ?? 0;

  if (maliciousHits > 0) return "malicious";
  if (suspiciousHits > 0) return "suspicious";

  const verdictText = analysis.verdict ?? analysis.status;
  return labelFromText(verdictText);
}

/**
 * Maps free-text scanner output (a verdict or status string) onto a dataset label.
 *
 * Matching is case-insensitive and keyword-based, checked in order of severity:
 * 1. `malicious` / `malware` yield `malicious`.
 * 2. `suspicious` yields `suspicious`.
 * 3. A negated clean term (`unsafe`, `unclean`, `not safe`, `non-benign`, ...) yields
 *    `unknown`: it must not count as clean, and it carries no severity of its own.
 * 4. `clean`, `safe`, `harmless` or `benign` yield `clean`.
 * 5. Anything else, including null/undefined/empty input, yields `unknown`.
 *
 * Threat keywords are deliberately checked first, so text mentioning both a threat term
 * and a clean term resolves to the more severe label.
 *
 * @param value - Text to classify.
 * @returns The matching label.
 */
function labelFromText(value: string | null | undefined): DatasetLabel {
  const normalized = value?.toLowerCase() ?? "";
  if (normalized.includes("malicious") || normalized.includes("malware")) return "malicious";
  if (normalized.includes("suspicious")) return "suspicious";

  // A plain substring test would read "unsafe" as "safe"; reject negated forms first.
  const negatedClean =
    /(?<![a-z])(?:un|non[\s_-]?|not[\s_-]+)(?:clean|safe|harmless|benign)/.test(normalized);
  if (negatedClean) return "unknown";

  if (/clean|safe|harmless|benign/.test(normalized)) return "clean";
  return "unknown";
}

/**
 * Coerces a status string to a dataset label.
 *
 * @param value - Status string to normalize.
 * @returns `value` itself when it is exactly `clean`, `suspicious` or `malicious`;
 *   otherwise the result of keyword-matching it with `labelFromText`.
 */
function normalizeLabel(value: string): DatasetLabel {
  switch (value) {
    case "clean":
    case "suspicious":
    case "malicious":
      return value;
    default:
      return labelFromText(value);
  }
}

/**
 * Tallies files by extension.
 *
 * @param files - Files of one artifact.
 * @returns An object mapping each extension (as produced by `fileExtension`) to its
 *   number of files, with keys inserted in `localeCompare` order.
 */
function countFileExtensions(files: ExportFileInput[]) {
  const tally = new Map<string, number>();
  for (const { path } of files) {
    const ext = fileExtension(path);
    tally.set(ext, (tally.get(ext) ?? 0) + 1);
  }
  const ordered = [...tally.entries()].sort(([left], [right]) => left.localeCompare(right));
  return Object.fromEntries(ordered);
}

/**
 * Extracts the lower-cased extension (leading dot included) from a `/`-separated path.
 *
 * @param path - File path.
 * @returns The extension of the last path segment, or `""` when that segment has no dot,
 *   starts with its only dot (e.g. `.env`), or ends with a dot.
 */
function fileExtension(path: string) {
  const baseName = path.slice(path.lastIndexOf("/") + 1);
  const dotAt = baseName.lastIndexOf(".");
  const hasExtension = dotAt > 0 && dotAt < baseName.length - 1;
  return hasExtension ? baseName.substring(dotAt).toLowerCase() : "";
}

/**
 * Maps a line number to a coarse range label.
 *
 * @param line - Line number of a finding.
 * @returns The label of the first range whose upper bound is at least `line`
 *   (`"1-20"`, `"21-50"`, `"51-100"`, `"101-250"`), or `"251+"` when none applies.
 */
function lineBucket(line: number) {
  const ranges = [
    [20, "1-20"],
    [50, "21-50"],
    [100, "51-100"],
    [250, "101-250"],
  ] as const;
  const match = ranges.find(([upperBound]) => line <= upperBound);
  return match ? match[1] : "251+";
}

/**
 * Formats a timestamp as its UTC year and month.
 *
 * @param timestamp - Value passed to `new Date(...)` (epoch milliseconds).
 * @returns The first seven characters of the ISO-8601 string, e.g. `"2024-02"`.
 * @throws RangeError when the timestamp does not form a valid date.
 */
function createdMonth(timestamp: number) {
  const isoTimestamp = new Date(timestamp).toISOString();
  return isoTimestamp.substring(0, 7);
}