feat(mcp): expose capture tool

This commit is contained in:
Peter Steinberger 2026-06-07 05:46:39 +01:00
parent 87d4721e29
commit 3608d9c782
No known key found for this signature in database
16 changed files with 253 additions and 46 deletions

View File

@ -118,10 +118,10 @@ struct CLIRuntimeSmokeTests {
#expect(tools?.isEmpty == false)
#expect((dataPayload?["count"] as? Int ?? 0) > 0)
#expect(names.contains("clipboard"))
#expect(names.contains("capture"))
#expect(names.contains("paste"))
#expect(names.contains("set_value"))
#expect(names.contains("perform_action"))
#expect(!names.contains("capture"))
}
@Test

View File

@ -2,6 +2,9 @@
## [3.3.1] - Unreleased
### Added
- MCP now exposes the bounded `capture` tool for live/video frame capture, contact sheets, metadata, and optional MP4 output. Thanks @coygeek for #169.
### Changed
- Documented background vs. foreground input delivery across the README, automation guide, quickstart, permissions, and interaction command docs.
- Clarified that `peekaboo tools` lists the MCP/agent tool catalog rather than top-level CLI commands. Thanks @lonexreb for #174.

View File

@ -6,8 +6,7 @@ import PeekabooFoundation
/// Frame source that samples frames from a video asset.
public final class VideoFrameSource: CaptureFrameSource {
private let generator: AVAssetImageGenerator
private let times: [CMTime]
private var index: Int = 0
private var timeline: VideoFrameTimeline
private let mode: CaptureMode = .screen
public let effectiveFPS: Double
@ -40,22 +39,16 @@ public final class VideoFrameSource: CaptureFrameSource {
interval = CMTime(milliseconds: everyMs)
self.effectiveFPS = everyMs > 0 ? min(240, max(0.1, 1000.0 / Double(everyMs))) : 2.0
} else {
let fps = sampleFps ?? 2.0
let fps = min(240, max(sampleFps ?? 2.0, 0.1))
interval = CMTime(seconds: 1.0 / max(fps, 0.1), preferredTimescale: 1_000_000)
self.effectiveFPS = fps
}
var cursor = start
var requested: [CMTime] = []
while cursor <= end {
requested.append(cursor)
cursor = CMTimeAdd(cursor, interval)
}
if requested.count < 2 {
requested.append(end)
}
self.timeline = VideoFrameTimeline(
start: start,
end: end,
interval: interval)
self.times = requested
self.generator = AVAssetImageGenerator(asset: asset)
self.generator.appliesPreferredTrackTransform = true
self.generator.requestedTimeToleranceBefore = .zero
@ -67,9 +60,7 @@ public final class VideoFrameSource: CaptureFrameSource {
@MainActor
public func nextFrame() async throws -> (cgImage: CGImage?, metadata: CaptureMetadata)? {
guard self.index < self.times.count else { return nil }
let time = self.times[self.index]
self.index += 1
guard let time = self.timeline.next() else { return nil }
var actual = CMTime.zero
do {
@ -108,6 +99,37 @@ public final class VideoFrameSource: CaptureFrameSource {
}
}
/// Lazily yields the timestamps at which a video should be sampled.
///
/// The first call to `next()` returns `start`. Each later call returns the previous
/// timestamp advanced by `interval`, clamped so it never passes `end`; once a timestamp
/// at or beyond `end` has been handed out, the timeline is finished and returns `nil`.
/// No array of timestamps is ever built up front.
struct VideoFrameTimeline {
    /// Timestamp that the next call to `next()` will hand out.
    private var cursor: CMTime
    private let end: CMTime
    private let interval: CMTime
    /// Set once the final timestamp has been handed out.
    private var isDrained = false

    init(start: CMTime, end: CMTime, interval: CMTime) {
        self.cursor = start
        self.end = end
        self.interval = interval
    }

    /// Returns the next sample time, or `nil` after the final sample has been returned.
    mutating func next() -> CMTime? {
        if self.isDrained { return nil }

        let emitted = self.cursor

        // Already at (or past) the end: this is the last sample.
        guard emitted < self.end else {
            self.isDrained = true
            return emitted
        }

        let candidate = CMTimeAdd(emitted, self.interval)
        if candidate.isNumeric, candidate > emitted {
            // Clamp so the last step lands exactly on `end`.
            self.cursor = candidate < self.end ? candidate : self.end
        } else {
            // The step did not move forward (or is not a numeric time); stop after
            // this sample instead of repeating the same timestamp.
            self.isDrained = true
        }
        return emitted
    }
}
extension CMTime {
fileprivate init(milliseconds: Int) {
self.init(value: CMTimeValue(milliseconds), timescale: 1000)

View File

@ -15,8 +15,9 @@ struct WatchCaptureSessionStore {
func performAutoclean() -> WatchWarning? {
guard self.managedAutoclean else { return nil }
guard self.autocleanMinutes > 0 else { return nil }
let root = self.outputRoot.deletingLastPathComponent()
guard root.lastPathComponent == "watch-sessions" else { return nil }
guard Self.autocleanRootNames.contains(root.lastPathComponent) else { return nil }
guard let contents = try? self.fileManager.contentsOfDirectory(
at: root,
includingPropertiesForKeys: [.contentModificationDateKey],
@ -26,6 +27,7 @@ struct WatchCaptureSessionStore {
let deadline = Date().addingTimeInterval(TimeInterval(-self.autocleanMinutes) * 60)
var removed = 0
for url in contents {
guard url.standardizedFileURL != self.outputRoot.standardizedFileURL else { continue }
guard let attrs = try? url.resourceValues(forKeys: [.contentModificationDateKey]),
let modified = attrs.contentModificationDate else { continue }
if modified < deadline {
@ -38,7 +40,7 @@ struct WatchCaptureSessionStore {
guard removed > 0 else { return nil }
return WatchWarning(
code: .autoclean,
message: "Autoclean removed \(removed) old watch sessions",
message: "Autoclean removed \(removed) old capture sessions",
details: ["session": self.sessionId])
}
@ -46,4 +48,6 @@ struct WatchCaptureSessionStore {
let data = try JSONEncoder().encode(value)
try data.write(to: url, options: .atomic)
}
private static let autocleanRootNames: Set<String> = ["watch-sessions", "capture-sessions"]
}

View File

@ -25,6 +25,7 @@ public enum MCPToolCatalog {
[
// Core tools
ImageTool(context: context),
CaptureTool(context: context),
AnalyzeTool(),
BrowserTool(context: context),
ListTool(context: context),

View File

@ -3,7 +3,34 @@ import PeekabooAutomationKit
enum CaptureMetaBuilder {
static func buildMeta(from summary: CaptureMetaSummary) -> Value {
let meta: [String: Value] = [
.object(self.summaryMeta(from: summary))
}
/// Builds the MCP metadata object for a finished capture session.
///
/// Starts from the summary metadata derived from `result`, then adds the capture
/// source, the optional input/output video paths, the session statistics, and the
/// list of warnings.
static func buildMeta(from result: CaptureSessionResult) -> Value {
    let stats = result.stats
    let statsMeta: [String: Value] = [
        "duration_ms": .int(stats.durationMs),
        "fps_idle": .double(stats.fpsIdle),
        "fps_active": .double(stats.fpsActive),
        "fps_effective": .double(stats.fpsEffective),
        "frames_kept": .int(stats.framesKept),
        "frames_dropped": .int(stats.framesDropped),
        "max_frames_hit": .bool(stats.maxFramesHit),
        "max_mb_hit": .bool(stats.maxMbHit),
    ]
    let warningsMeta: [Value] = result.warnings.map { self.warningMeta($0) }

    var meta = self.summaryMeta(from: .make(from: result))
    meta["source"] = .string(result.source.rawValue)

    // Video paths are only emitted when the session actually has them.
    let optionalPaths: [(key: String, path: String?)] = [
        ("video_in", result.videoIn),
        ("video_out", result.videoOut),
    ]
    for entry in optionalPaths {
        guard let path = entry.path else { continue }
        meta[entry.key] = .string(path)
    }

    meta["stats"] = .object(statsMeta)
    meta["warnings"] = .array(warningsMeta)
    return .object(meta)
}
private static func summaryMeta(from summary: CaptureMetaSummary) -> [String: Value] {
[
"frames": .array(summary.frames.map { .string($0) }),
"contact": .string(summary.contactPath),
"metadata": .string(summary.metadataPath),
@ -15,6 +42,16 @@ enum CaptureMetaBuilder {
"contact_thumb_height": .string("\(summary.contactThumbSize.height)"),
"contact_sampled_indexes": .array(summary.contactSampledIndexes.map { .string("\($0)") }),
]
}
/// Converts a capture warning into its metadata object: `code` and `message` are
/// always present, and `details` is added only when the warning carries details.
private static func warningMeta(_ warning: CaptureWarning) -> Value {
    let base: [String: Value] = [
        "code": .string(warning.code.rawValue),
        "message": .string(warning.message),
    ]
    guard let details = warning.details else {
        return .object(base)
    }

    var enriched = base
    enriched["details"] = .object(details.mapValues { Value.string($0) })
    return .object(enriched)
}
}

View File

@ -18,11 +18,11 @@ struct CaptureRequest {
let videoOut: String?
init(arguments: ToolArguments, windows: any WindowManagementServiceProtocol) async throws {
let input = try arguments.decode(CaptureInput.self)
let input = try CaptureInput(arguments: arguments)
self.source = try CaptureToolArgumentResolver.source(from: input.source)
let constraints = try CaptureRequest.constraints(from: input)
let outputDir = if let dir = input.output_dir {
let outputDir = if let dir = input.outputDir {
CaptureToolPathResolver.outputDirectory(from: dir)
} else {
URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true)
@ -30,7 +30,7 @@ struct CaptureRequest {
}
self.outputDirectory = outputDir
self.autocleanMinutes = input.autocleanMinutes ?? 120
self.usesDefaultOutput = input.output_dir == nil
self.usesDefaultOutput = input.outputDir == nil
self.videoOut = CaptureToolPathResolver.filePath(from: input.videoOut)
switch self.source {
@ -82,11 +82,11 @@ private struct CaptureInput: Codable {
let mode: String?
let app: String?
let pid: Int?
let window_title: String?
let window_index: Int?
let screen_index: Int?
let windowTitle: String?
let windowIndex: Int?
let screenIndex: Int?
let region: String?
let capture_focus: String?
let captureFocus: String?
let durationSeconds: Double?
let idleFps: Double?
@ -108,9 +108,16 @@ private struct CaptureInput: Codable {
let resolutionCap: Double?
let diffStrategy: String?
let diffBudgetMs: Int?
let output_dir: String?
let outputDir: String?
let autocleanMinutes: Int?
let videoOut: String?
/// Decodes the capture input from raw tool arguments.
///
/// The arguments' raw value is round-tripped through JSON so it can be decoded with
/// `convertFromSnakeCase`, which maps snake_case keys onto the camelCase properties
/// declared on this type.
/// - Throws: Any error raised while encoding the raw value or decoding it into `Self`.
init(arguments: ToolArguments) throws {
    let snakeCaseDecoder = JSONDecoder()
    snakeCaseDecoder.keyDecodingStrategy = .convertFromSnakeCase

    let payload = try JSONEncoder().encode(arguments.rawValue)
    self = try snakeCaseDecoder.decode(Self.self, from: payload)
}
}
extension CaptureRequest {
@ -131,8 +138,8 @@ extension CaptureRequest {
{
let modeStr = input.mode
let explicitApp = input.app
let windowTitle = input.window_title
let windowIndex = input.window_index
let windowTitle = input.windowTitle
let windowIndex = input.windowIndex
let mode = try CaptureToolArgumentResolver.mode(
from: modeStr,
@ -141,7 +148,7 @@ extension CaptureRequest {
switch mode {
case .screen:
let screenIndex = input.screen_index
let screenIndex = input.screenIndex
return CaptureScope(
kind: .screen,
screenIndex: screenIndex,
@ -190,7 +197,7 @@ extension CaptureRequest {
let quiet = max(Int(input.quietMs ?? 1000), 0)
let maxFrames = max(constraints.maxFrames, 1)
let maxMbAdjusted = constraints.maxMb.flatMap { $0 > 0 ? $0 : nil }
let focus = try CaptureToolArgumentResolver.captureFocus(from: input.capture_focus)
let focus = try CaptureToolArgumentResolver.captureFocus(from: input.captureFocus)
return CaptureOptions(
duration: duration,

View File

@ -104,19 +104,28 @@ public struct CaptureTool: MCPTool {
configuration: configuration)
let result = try await session.run()
let summary = """
capture kept \(result.stats.framesKept) frames (dropped \(result.stats.framesDropped)),
contact sheet \(result.contactSheet.path)
"""
var summaryLines = [
"capture kept \(result.stats.framesKept) frames (dropped \(result.stats.framesDropped))",
"contact: \(result.contactSheet.path)",
"metadata: \(result.metadataFile)",
"frames: \(result.frames.count) files",
]
if let videoOut = result.videoOut {
summaryLines.insert("video: \(videoOut)", at: 3)
}
if !result.warnings.isEmpty {
let warnings = result.warnings.map(\.message).joined(separator: "; ")
summaryLines.append("warnings: \(warnings)")
}
let summary = summaryLines.joined(separator: "\n")
let meta = ToolEventSummary(
actionDescription: "Capture",
notes: summary)
let metaSummary = CaptureMetaSummary.make(from: result)
return ToolResponse.text(
summary,
meta: ToolEventSummary.merge(
summary: meta,
into: CaptureMetaBuilder.buildMeta(from: metaSummary)))
into: CaptureMetaBuilder.buildMeta(from: result)))
}
}

View File

@ -1,3 +1,4 @@
@preconcurrency import AVFoundation
import CoreGraphics
import Foundation
import ImageIO
@ -98,6 +99,65 @@ struct WatchCaptureSessionTests {
#expect(result.boundingBoxes.count <= 5)
}
/// A 60-second span sampled every millisecond must hand out its first timestamps
/// one call at a time, starting at zero and stepping by the interval.
@Test
func `Video frame timeline samples lazily without precomputed frame cap`() {
    let millisecond = CMTime(value: 1, timescale: 1000)
    let oneMinute = CMTime(seconds: 60, preferredTimescale: 1000)
    var timeline = VideoFrameTimeline(start: .zero, end: oneMinute, interval: millisecond)

    let firstSample = timeline.next()
    let secondSample = timeline.next()
    let thirdSample = timeline.next()

    #expect(firstSample == .zero)
    #expect(secondSample == millisecond)
    #expect(thirdSample == CMTime(value: 2, timescale: 1000))
}
/// Autoclean on a default `capture-sessions` root must delete a sibling session whose
/// modification date is older than the retention window, report an `.autoclean`
/// warning, and leave the session that is currently in use on disk.
@Test
func `Autoclean removes old default capture sessions`() throws {
    let fileManager = FileManager.default
    let sandbox = URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true)
        .appendingPathComponent("peekaboo-autoclean-\(UUID().uuidString)", isDirectory: true)
    // Remove the per-test sandbox on every exit path (including thrown errors) so
    // repeated test runs do not accumulate directories in the temp folder.
    defer { try? fileManager.removeItem(at: sandbox) }

    let root = sandbox.appendingPathComponent("capture-sessions", isDirectory: true)
    let oldSession = root.appendingPathComponent("capture-old", isDirectory: true)
    let currentSession = root.appendingPathComponent("capture-current", isDirectory: true)
    try fileManager.createDirectory(at: oldSession, withIntermediateDirectories: true)
    try fileManager.createDirectory(at: currentSession, withIntermediateDirectories: true)
    // Backdate the old session by an hour so it falls outside the 1-minute retention.
    try fileManager.setAttributes(
        [.modificationDate: Date().addingTimeInterval(-3600)],
        ofItemAtPath: oldSession.path)

    let store = WatchCaptureSessionStore(
        outputRoot: currentSession,
        autocleanMinutes: 1,
        managedAutoclean: true,
        sessionId: "capture-current")
    let warning = store.performAutoclean()

    #expect(warning?.code == .autoclean)
    #expect(!fileManager.fileExists(atPath: oldSession.path))
    #expect(fileManager.fileExists(atPath: currentSession.path))
}
/// With a retention of zero minutes autoclean must do nothing: no warning is returned
/// and the current session directory stays on disk even though its modification date
/// has been backdated.
@Test
func `Autoclean ignores non-positive retention and keeps current session`() throws {
    let fileManager = FileManager.default
    let sandbox = URL(fileURLWithPath: NSTemporaryDirectory(), isDirectory: true)
        .appendingPathComponent("peekaboo-autoclean-current-\(UUID().uuidString)", isDirectory: true)
    // Remove the per-test sandbox on every exit path (including thrown errors) so
    // repeated test runs do not accumulate directories in the temp folder.
    defer { try? fileManager.removeItem(at: sandbox) }

    let root = sandbox.appendingPathComponent("capture-sessions", isDirectory: true)
    let currentSession = root.appendingPathComponent("capture-current", isDirectory: true)
    try fileManager.createDirectory(at: currentSession, withIntermediateDirectories: true)
    try fileManager.setAttributes(
        [.modificationDate: Date().addingTimeInterval(-3600)],
        ofItemAtPath: currentSession.path)

    let store = WatchCaptureSessionStore(
        outputRoot: currentSession,
        autocleanMinutes: 0,
        managedAutoclean: true,
        sessionId: "capture-current")
    let warning = store.performAutoclean()

    #expect(warning == nil)
    #expect(fileManager.fileExists(atPath: currentSession.path))
}
@Test
@MainActor
func `Stops at max-frames cap and keeps first frame`() async throws {

View File

@ -2,6 +2,7 @@ import Foundation
import PeekabooAutomation
import PeekabooAutomationKit
import PeekabooFoundation
import TachikomaMCP
import Testing
@testable import PeekabooAgentRuntime
@ -70,6 +71,53 @@ struct CaptureToolPathResolverTests {
}
}
/// Feeds a full set of snake_case MCP arguments into `CaptureRequest` and checks that
/// every option is decoded into the corresponding request field.
@Test
func `request decodes snake case MCP capture options`() async throws {
    let outputDirArgument = "~/Desktop/mcp-capture"
    let videoOutArgument = "~/Desktop/mcp-capture.mp4"
    let arguments = ToolArguments(raw: [
        "source": "live",
        "mode": "area",
        "region": "1,2,30,40",
        "duration_seconds": 2.5,
        "idle_fps": 0.5,
        "active_fps": 3.0,
        "threshold_percent": 0.25,
        "heartbeat_sec": 0,
        "quiet_ms": 250,
        "capture_focus": "background",
        "highlight_changes": true,
        "max_frames": 3,
        "max_mb": 1,
        "resolution_cap": 320,
        "diff_strategy": "quality",
        "diff_budget_ms": 12,
        "output_dir": "~/Desktop/mcp-capture",
        "autoclean_minutes": 5,
        "video_out": "~/Desktop/mcp-capture.mp4",
    ])
    let windowService = CaptureWindowResolverWindowService(windows: [])

    let request = try await CaptureRequest(arguments: arguments, windows: windowService)
    let scope = request.scope
    let options = request.options

    // Source and scope.
    #expect(request.source == .live)
    #expect(scope.kind == .region)
    #expect(scope.region == CGRect(x: 1, y: 2, width: 30, height: 40))

    // Capture options.
    #expect(options.duration == 2.5)
    #expect(options.idleFps == 0.5)
    #expect(options.activeFps == 3.0)
    #expect(options.changeThresholdPercent == 0.25)
    #expect(options.heartbeatSeconds == 0)
    #expect(options.quietMsToIdle == 250)
    #expect(options.captureFocus == .background)
    #expect(options.highlightChanges)
    #expect(options.maxFrames == 3)
    #expect(options.maxMegabytes == 1)
    #expect(options.resolutionCap == 320)
    #expect(options.diffStrategy == .quality)
    #expect(options.diffBudgetMs == 12)

    // Output locations are expected with the leading tilde expanded.
    #expect(request.outputDirectory.path == NSString(string: outputDirArgument).expandingTildeInPath)
    #expect(request.autocleanMinutes == 5)
    #expect(request.videoOut == NSString(string: videoOutArgument).expandingTildeInPath)
}
@Test
func `window resolver maps app title selection to stable window id`() async throws {
let windows = CaptureWindowResolverWindowService(windows: [

View File

@ -183,13 +183,13 @@ struct MCPToolRegistryIntegrationTests {
filters: noToolFilters)
let names = Set(tools.map(\.name))
#expect(tools.count == 26)
#expect(tools.count == 27)
#expect(names.contains("clipboard"))
#expect(names.contains("paste"))
#expect(names.contains("set_value"))
#expect(names.contains("perform_action"))
#expect(names.contains("inspect_ui"))
#expect(!names.contains("capture"))
#expect(names.contains("capture"))
}
@Test
@ -204,7 +204,7 @@ struct MCPToolRegistryIntegrationTests {
filters: noToolFilters))
let tools = registry.allTools()
#expect(tools.count == 26)
#expect(tools.count == 27)
// Verify some key tools are present
let imageToolExists = registry.tool(named: "image") != nil
@ -212,12 +212,14 @@ struct MCPToolRegistryIntegrationTests {
let agentToolExists = registry.tool(named: "agent") != nil
let clipboardToolExists = registry.tool(named: "clipboard") != nil
let inspectUIToolExists = registry.tool(named: "inspect_ui") != nil
let captureToolExists = registry.tool(named: "capture") != nil
#expect(imageToolExists)
#expect(clickToolExists)
#expect(agentToolExists)
#expect(clipboardToolExists)
#expect(inspectUIToolExists)
#expect(captureToolExists)
}
@Test

View File

@ -10,8 +10,9 @@ struct PeekabooMCPServerTests {
let server = try await makeServer()
let names = await server.registeredToolNamesForTesting()
#expect(names.count == 26)
#expect(names.count == 27)
#expect(names == names.sorted())
#expect(names.contains("capture"))
#expect(names.contains("image"))
#expect(names.contains("inspect_ui"))
#expect(names.contains("click"))
@ -19,7 +20,6 @@ struct PeekabooMCPServerTests {
#expect(names.contains("paste"))
#expect(names.contains("set_value"))
#expect(names.contains("perform_action"))
#expect(!names.contains("capture"))
}
@Test

View File

@ -12,6 +12,8 @@ read_when:
- `capture live` — adaptive PNG burst capture of screens/windows/regions with idle/active FPS, diff-based frame keeping, contact sheet, and metadata.
- `capture video` — ingest an existing video, sample frames (by FPS or interval), optionally skip diff filtering, and emit the same outputs.
The MCP server exposes the same primitive as the `capture` tool. MCP arguments use snake_case names such as `duration_seconds`, `active_fps`, `threshold_percent`, `output_dir`, and `video_out`.
A hidden alias `capture watch` maps to `capture live` for backwards compatibility. The old standalone `watch` command/tool is removed.
## Common Outputs

View File

@ -17,6 +17,7 @@ read_when:
## Implementation notes
- `serve` instantiates `PeekabooMCPServer` and maps the transport string to `PeekabooCore.TransportType`. Stdio is the default for Claude Code integrations.
- HTTP/SSE server transports are stubbed; they currently throw “not implemented.”
- The native tool catalog includes bounded `capture` for live screen/window/region recording or video ingest. It writes retained frames, `contact.png`, `metadata.json`, and optional MP4 output, so use tool allow/deny filters when exposing MCP to untrusted clients.
- UI automation tools include action-first additions: `set_value` directly mutates a settable accessibility value, and `perform_action` invokes a named accessibility action on an element from `see`.
- `click` preserves element IDs and queries when forwarding to automation, so action-first policy can use accessibility actions before synthetic fallback.

View File

@ -26,7 +26,7 @@ Use this checklist to exercise the Swift MCP server with mcporter. It mirrors th
```
$MCPORTER list --stdio "$PEEKABOO_BIN mcp serve" --name peekaboo-local --schema --timeout 30000
```
Expect: tool catalog prints Peekaboo-native tools (image, see, list, permissions, click, type, drag, window, menu, dock, space, swipe, hotkey, clipboard, agent, sleep). Any transport/auth errors here block the rest of the suite.
Expect: tool catalog prints Peekaboo-native tools (image, capture, see, list, permissions, click, type, drag, window, menu, dock, space, swipe, hotkey, clipboard, agent, sleep). Any transport/auth errors here block the rest of the suite.
2) **Permissions sanity**
```
@ -57,7 +57,17 @@ Use this checklist to exercise the Swift MCP server with mcporter. It mirrors th
```
Expect `📸 Captured …` text plus a saved file path. Open the PNG to confirm the active window is captured without the shadow frame.
6) **Image + analysis (optional, needs AI keys)**
6) **Bounded live capture smoke**
```
$MCPORTER call --stdio "$PEEKABOO_BIN mcp serve" --name peekaboo-local \
capture source:live mode:area region:100,100,640,360 \
duration_seconds:2 active_fps:4 threshold_percent:0 \
output_dir:/tmp/peekaboo-mcp/live video_out:/tmp/peekaboo-mcp/live.mp4 \
--timeout 45000
```
Expect kept-frame text plus `contact.png`, `metadata.json`, one or more frame PNGs, and a non-empty MP4 when `video_out` is set.
7) **Image + analysis (optional, needs AI keys)**
```
$MCPORTER call --stdio "$PEEKABOO_BIN mcp serve" --name peekaboo-local \
image path:/tmp/peekaboo-mcp/frontmost-analysis.png format:png \
@ -67,7 +77,7 @@ Use this checklist to exercise the Swift MCP server with mcporter. It mirrors th
Expect an analysis paragraph plus `savedFiles` metadata; failures here usually mean provider config or permissions issues.
Note: the OpenAI Responses API (GPT-5.x) requires `image_url` to be a string (URL or data URL). Peekaboo normalizes legacy `{ url, detail }` objects internally, but upstream tools should prefer the string form to avoid HTTP 400 errors.
7) **List cached tools after reuse (daemon/keep-alive sanity)**
8) **List cached tools after reuse (daemon/keep-alive sanity)**
```
$MCPORTER list --stdio "$PEEKABOO_BIN mcp serve" --name peekaboo-local --timeout 15000
```

View File

@ -56,6 +56,7 @@ If you disable the `clipboard` tool via allow/deny filters, the injected DESKTOP
- Any audio capture path (`AudioInputService`, voice command helpers) that transcribes speech through `PeekabooAIService`.
Disable by clearing `PEEKABOO_AI_PROVIDERS`, removing API keys, or adding these names to your deny list when running offline.
- **Medium risk** can manipulate apps or data
- `capture`: captures screen/window/region frames and writes the retained frames, contact sheets, metadata, and optional MP4 files to disk. Disable it when MCP or agent clients should not persist screen contents.
- `click`, `type`, `hotkey`, `press`, and `paste`: can trigger actions in foreground apps or send process-targeted events to a background app by default when a target process is known. Use `--foreground` for focused foreground delivery. Background delivery still requires macOS event-posting access and does not prove the target app handled the event.
- `scroll`, `swipe`, `drag`, `move`: can trigger pointer actions in foreground apps.
- `window`, `app`, `menu_click`, `dock_launch`, `space`: can close apps, move windows, switch spaces.