From 922258be6941901465ee86a526be91f4dee6045d Mon Sep 17 00:00:00 2001 From: "openclaw-docs-sync[bot]" Date: Wed, 6 May 2026 01:42:33 +0000 Subject: [PATCH] chore(sync): mirror docs from openclaw/openclaw@9b1d28edf1cf5e5ea9c30c3f4cf1203acc2cef68 --- .openclaw-sync/source.json | 4 +- docs/.generated/config-baseline.sha256 | 4 +- .../.generated/plugin-sdk-api-baseline.sha256 | 4 +- docs/.i18n/glossary.zh-CN.json | 4 + docs/gateway/config-agents.md | 12 + docs/gateway/doctor.md | 3 +- docs/gateway/index.md | 36 -- docs/gateway/protocol.md | 14 +- docs/nodes/index.md | 3 + docs/nodes/talk.md | 38 +- docs/platforms/ios.md | 4 + docs/plugins/sdk-migration.md | 114 +++- docs/plugins/sdk-provider-plugins.md | 15 +- docs/refactor/talk-api-contract.md | 320 +++++++++++ docs/refactor/talk-execution.md | 229 ++++++++ docs/refactor/talk-surfaces.md | 128 +++++ docs/refactor/talk.md | 499 ++++++++++++++++++ docs/tools/media-overview.md | 12 + docs/tools/tts.md | 10 + docs/web/control-ui.md | 6 +- 20 files changed, 1404 insertions(+), 55 deletions(-) create mode 100644 docs/refactor/talk-api-contract.md create mode 100644 docs/refactor/talk-execution.md create mode 100644 docs/refactor/talk-surfaces.md create mode 100644 docs/refactor/talk.md diff --git a/.openclaw-sync/source.json b/.openclaw-sync/source.json index 7dfa37437..ac9e11f3e 100644 --- a/.openclaw-sync/source.json +++ b/.openclaw-sync/source.json @@ -1,5 +1,5 @@ { "repository": "openclaw/openclaw", - "sha": "1f7d0ef310d7f7e273eeb5746e474f0caaf16d72", - "syncedAt": "2026-05-06T01:38:34.677Z" + "sha": "9b1d28edf1cf5e5ea9c30c3f4cf1203acc2cef68", + "syncedAt": "2026-05-06T01:40:33.089Z" } diff --git a/docs/.generated/config-baseline.sha256 b/docs/.generated/config-baseline.sha256 index 69c6cbcc4..0269d73d1 100644 --- a/docs/.generated/config-baseline.sha256 +++ b/docs/.generated/config-baseline.sha256 @@ -1,4 +1,4 @@ -c93176f87a1e4576f5951b82037394c4bc9628bb6e056b6b24f96e662d6d636c config-baseline.json 
-92cbb12ca382f7424e7bd52df21798b10a57621f5c266909fa74e23f6cb973d7 config-baseline.core.json +60fe8b70598ccd0cf41875b1615106583a466694de2bf50019a9a251b58fa02e config-baseline.json +28d86173c32d17ce6348c7af028a00118e32eb9d344a0b19f9132c606da210c0 config-baseline.core.json cd7c0c7fb1435bc7e59099e9ac334462d5ad444016e9ab4512aae63a238f78dc config-baseline.channel.json 6871e789b74722e4ff2c877940dac256c232433ae26b305fc6ca782b90662097 config-baseline.plugin.json diff --git a/docs/.generated/plugin-sdk-api-baseline.sha256 b/docs/.generated/plugin-sdk-api-baseline.sha256 index c356254ce..f9b34b570 100644 --- a/docs/.generated/plugin-sdk-api-baseline.sha256 +++ b/docs/.generated/plugin-sdk-api-baseline.sha256 @@ -1,2 +1,2 @@ -1a06492fe05d1c9dc3194677f52d57ec90468b93023b70d0852ef01d87c7eae3 plugin-sdk-api-baseline.json -c950a1923c0dc7d31120a3010e24217bcf22fd9cacbe102d3ae19b0120c0f648 plugin-sdk-api-baseline.jsonl +7684d1cc87a531a1490bf9f5a9eab8a30518b95d93884ef8a025486aba6e29b1 plugin-sdk-api-baseline.json +e297ee3f86e2753ede344ff9ef8d1e063ce5fc95816f574062e2a974e4f82601 plugin-sdk-api-baseline.jsonl diff --git a/docs/.i18n/glossary.zh-CN.json b/docs/.i18n/glossary.zh-CN.json index 4090e11e4..05284fc38 100644 --- a/docs/.i18n/glossary.zh-CN.json +++ b/docs/.i18n/glossary.zh-CN.json @@ -35,6 +35,10 @@ "source": "Channel message API", "target": "频道消息 API" }, + { + "source": "Talk mode", + "target": "Talk 模式" + }, { "source": "Azure Speech", "target": "Azure Speech" diff --git a/docs/gateway/config-agents.md b/docs/gateway/config-agents.md index e2b7f3473..c42bce27e 100644 --- a/docs/gateway/config-agents.md +++ b/docs/gateway/config-agents.md @@ -1384,6 +1384,18 @@ Defaults for Talk mode (macOS/iOS/Android). 
speechLocale: "ru-RU", silenceTimeoutMs: 1500, interruptOnSpeech: true, + realtime: { + provider: "openai", + providers: { + openai: { + model: "gpt-realtime", + voice: "alloy", + }, + }, + mode: "realtime", + transport: "webrtc", + brain: "agent-consult", + }, }, } ``` diff --git a/docs/gateway/doctor.md b/docs/gateway/doctor.md index 50751246f..82cb183db 100644 --- a/docs/gateway/doctor.md +++ b/docs/gateway/doctor.md @@ -166,7 +166,7 @@ That stages grounded durable candidates into the short-term dreaming store while If the config contains legacy value shapes (for example `messages.ackReaction` without a channel-specific override), doctor normalizes them into the current schema. - That includes legacy Talk flat fields. Current public Talk config is `talk.provider` + `talk.providers.<provider>`. Doctor rewrites old `talk.voiceId` / `talk.voiceAliases` / `talk.modelId` / `talk.outputFormat` / `talk.apiKey` shapes into the provider map. + That includes legacy Talk flat fields. Current public Talk speech config is `talk.provider` + `talk.providers.<provider>`, and realtime voice config is `talk.realtime.*`. Doctor rewrites old `talk.voiceId` / `talk.voiceAliases` / `talk.modelId` / `talk.outputFormat` / `talk.apiKey` shapes into the provider map, and rewrites legacy top-level realtime selectors (`talk.mode`, `talk.transport`, `talk.brain`, `talk.model`, `talk.voice`) into `talk.realtime`. Doctor also warns when `plugins.allow` is non-empty and tool policy uses wildcard or plugin-owned tool entries. 
`tools.allow: ["*"]` only matches tools @@ -199,6 +199,7 @@ That stages grounded durable candidates into the short-term dreaming store while - `routing.bindings` → top-level `bindings` - `routing.agents`/`routing.defaultAgentId` → `agents.list` + `agents.list[].default` - legacy `talk.voiceId`/`talk.voiceAliases`/`talk.modelId`/`talk.outputFormat`/`talk.apiKey` → `talk.provider` + `talk.providers.<provider>` + - legacy top-level realtime Talk selectors (`talk.mode`/`talk.transport`/`talk.brain`/`talk.model`/`talk.voice`) + `talk.provider`/`talk.providers` → `talk.realtime` - `routing.agentToAgent` → `tools.agentToAgent` - `routing.transcribeAudio` → `tools.media.audio.models` - `messages.tts.<provider>` (`openai`/`elevenlabs`/`microsoft`/`edge`) → `messages.tts.providers.<provider>` diff --git a/docs/gateway/index.md b/docs/gateway/index.md index 407adebb6..0b500b075 100644 --- a/docs/gateway/index.md +++ b/docs/gateway/index.md @@ -184,42 +184,6 @@ OPENCLAW_CONFIG_PATH=~/.openclaw/b.json OPENCLAW_STATE_DIR=~/.openclaw-b opencla Detailed setup: [/gateway/multiple-gateways](/gateway/multiple-gateways). -## VoiceClaw real-time brain endpoint - -OpenClaw exposes a VoiceClaw-compatible real-time WebSocket endpoint at -`/voiceclaw/realtime`. Use it when a VoiceClaw desktop client should talk -directly to a real-time OpenClaw brain instead of going through a separate relay -process. - -The endpoint uses Gemini Live for real-time audio and calls OpenClaw as the -brain by exposing OpenClaw tools directly to Gemini Live. Tool calls return an -immediate `working` result to keep the voice turn responsive, then OpenClaw -executes the actual tool asynchronously and injects the result back into the -live session. Set `GEMINI_API_KEY` in the gateway process environment. If -gateway auth is enabled, the desktop client sends the gateway token or password -in its first `session.config` message. - -Real-time brain access runs owner-authorized OpenClaw agent commands. 
Keep -`gateway.auth.mode: "none"` limited to loopback-only test instances. Non-local -real-time brain connections require gateway auth. - -For an isolated test gateway, run a separate instance with its own port, config, -and state: - -```bash -OPENCLAW_CONFIG_PATH=/path/to/openclaw-realtime/openclaw.json \ -OPENCLAW_STATE_DIR=/path/to/openclaw-realtime/state \ -OPENCLAW_SKIP_CHANNELS=1 \ -GEMINI_API_KEY=... \ -openclaw gateway --port 19789 -``` - -Then configure VoiceClaw to use: - -```text -ws://127.0.0.1:19789/voiceclaw/realtime -``` - ## Remote access Preferred: Tailscale/VPN. diff --git a/docs/gateway/protocol.md b/docs/gateway/protocol.md index 04db84450..342d3e4bb 100644 --- a/docs/gateway/protocol.md +++ b/docs/gateway/protocol.md @@ -253,7 +253,8 @@ base method scope: Nodes declare capability claims at connect time: -- `caps`: high-level capability categories. +- `caps`: high-level capability categories such as `camera`, `canvas`, `screen`, + `location`, `voice`, and `talk`. - `commands`: command allowlist for invoke. - `permissions`: granular toggles (e.g. `screen.record`, `camera.capture`). @@ -361,8 +362,19 @@ enumeration of `src/gateway/server-methods/*.ts`. + - `talk.catalog` returns the read-only Talk provider catalog for speech, streaming transcription, and realtime voice. It includes provider ids, labels, configured state, exposed model/voice ids, canonical modes, transports, brain strategies, and realtime audio/capability flags without returning provider secrets or mutating global config. - `talk.config` returns the effective Talk config payload; `includeSecrets` requires `operator.talk.secrets` (or `operator.admin`). + - `talk.session.create` creates a Gateway-owned Talk session for `realtime/gateway-relay`, `transcription/gateway-relay`, or `stt-tts/managed-room`. `brain: "direct-tools"` requires `operator.admin`. 
+ - `talk.session.join` validates a managed-room session token, emits `session.ready` or `session.replaced` events as needed, and returns room/session metadata plus recent Talk events without the plaintext token or stored token hash. + - `talk.session.appendAudio` appends base64 PCM input audio to Gateway-owned realtime relay and transcription sessions. + - `talk.session.startTurn`, `talk.session.endTurn`, and `talk.session.cancelTurn` drive managed-room turn lifecycle with stale-turn rejection before state is cleared. + - `talk.session.cancelOutput` stops assistant audio output, primarily for VAD-gated barge-in in Gateway relay sessions. + - `talk.session.submitToolResult` completes a provider tool call emitted by a Gateway-owned realtime relay session. + - `talk.session.close` closes a Gateway-owned relay, transcription, or managed-room session and emits terminal Talk events. - `talk.mode` sets/broadcasts the current Talk mode state for WebChat/Control UI clients. + - `talk.client.create` creates a client-owned realtime provider session using `webrtc` or `provider-websocket` while the Gateway owns config, credentials, instructions, and tool policy. + - `talk.client.toolCall` lets client-owned realtime transports forward provider tool calls to Gateway policy. The first supported tool is `openclaw_agent_consult`; clients receive a run id and wait for normal chat lifecycle events before submitting the provider-specific tool result. + - `talk.event` is the single Talk event channel for realtime, transcription, STT/TTS, managed-room, telephony, and meeting adapters. - `talk.speak` synthesizes speech through the active Talk speech provider. - `tts.status` returns TTS enabled state, active provider, fallback providers, and provider config state. - `tts.providers` returns the visible TTS provider inventory. 
diff --git a/docs/nodes/index.md b/docs/nodes/index.md index e5ce896fb..20f4a7540 100644 --- a/docs/nodes/index.md +++ b/docs/nodes/index.md @@ -197,6 +197,9 @@ Node commands must pass two gates before they can be invoked: Windows and macOS companion nodes allow safe declared commands such as `canvas.*`, `camera.list`, `location.get`, and `screen.snapshot` by default. +Trusted nodes that advertise the `talk` capability or declare `talk.*` commands +also allow declared push-to-talk commands (`talk.ptt.start`, `talk.ptt.stop`, +`talk.ptt.cancel`, `talk.ptt.once`) by default, independent of platform label. Dangerous or privacy-heavy commands such as `camera.snap`, `camera.clip`, and `screen.record` still require explicit opt-in with `gateway.nodes.allowCommands`. `gateway.nodes.denyCommands` always wins over diff --git a/docs/nodes/talk.md b/docs/nodes/talk.md index fac213100..4bc69ef82 100644 --- a/docs/nodes/talk.md +++ b/docs/nodes/talk.md @@ -1,18 +1,28 @@ --- -summary: "Talk mode: continuous speech conversations with configured TTS providers" +summary: "Talk mode: continuous speech conversations across local STT/TTS and realtime voice" read_when: - Implementing Talk mode on macOS/iOS/Android - Changing voice/TTS/interrupt behavior title: "Talk mode" --- -Talk mode is a continuous voice conversation loop: +Talk mode has three runtime shapes: + +- Native macOS/iOS/Android Talk uses local speech recognition, Gateway chat, and `talk.speak` TTS. Nodes advertise the `talk` capability and declare the `talk.*` commands they support. +- Browser Talk uses `talk.client.create` for client-owned `webrtc` and `provider-websocket` sessions, or `talk.session.create` for Gateway-owned `gateway-relay` sessions. `managed-room` is reserved for Gateway handoff and walkie-talkie rooms. 
+- Transcription-only clients use `talk.session.create({ mode: "transcription", transport: "gateway-relay", brain: "none" })`, then `talk.session.appendAudio`, `talk.session.cancelTurn`, and `talk.session.close` when they need captions or dictation without an assistant voice response. + +Native Talk is a continuous voice conversation loop: 1. Listen for speech -2. Send transcript to the model (main session, chat.send) +2. Send transcript to the model through the active session 3. Wait for the response 4. Speak it via the configured Talk provider (`talk.speak`) +Browser realtime Talk forwards provider tool calls through `talk.client.toolCall`; browser clients do not call `chat.send` directly for realtime consults. + +Transcription-only Talk emits the same common Talk event envelope as realtime and STT/TTS sessions, but uses `mode: "transcription"` and `brain: "none"`. It is for captions, dictation, and observe-only speech capture; one-shot uploaded voice notes still use the media/audio path. + ## Behavior (macOS) - **Always-on overlay** while Talk mode is enabled. @@ -66,6 +76,19 @@ Supported keys: speechLocale: "ru-RU", silenceTimeoutMs: 1500, interruptOnSpeech: true, + realtime: { + provider: "openai", + providers: { + openai: { + apiKey: "openai_api_key", + model: "gpt-realtime", + voice: "alloy", + }, + }, + mode: "realtime", + transport: "webrtc", + brain: "agent-consult", + }, }, } ``` @@ -79,6 +102,11 @@ Defaults: - `providers.elevenlabs.modelId`: defaults to `eleven_v3` when unset. - `providers.mlx.modelId`: defaults to `mlx-community/Soprano-80M-bf16` when unset. - `providers.elevenlabs.apiKey`: falls back to `ELEVENLABS_API_KEY` (or gateway shell profile if available). +- `realtime.provider`: selects the active browser/server realtime voice provider. Use `openai` for WebRTC, `google` for provider WebSocket, or a bridge-only provider through Gateway relay. +- `realtime.providers.<provider>` stores provider-owned realtime config. 
The browser receives only ephemeral or constrained session credentials, never a standard API key. +- `realtime.brain`: `agent-consult` routes realtime tool calls through Gateway policy; `direct-tools` is owner-only compatibility behavior; `none` is for transcription or external orchestration. +- `talk.catalog` exposes each provider's valid modes, transports, brain strategies, realtime audio formats, and capability flags so first-party Talk clients can avoid unsupported combinations. +- Streaming transcription providers are discovered through `talk.catalog.transcription`. The current Gateway relay uses the Voice Call streaming provider config until the dedicated Talk transcription config surface is added. - `speechLocale`: optional BCP 47 locale id for on-device Talk speech recognition on iOS/macOS. Leave unset to use the device default. - `outputFormat`: defaults to `pcm_44100` on macOS/iOS and `pcm_24000` on Android (set `mp3_*` to force MP3 streaming) @@ -103,7 +131,9 @@ Defaults: ## Notes - Requires Speech + Microphone permissions. -- Uses `chat.send` against session key `main`. +- Native Talk uses the active Gateway session and only falls back to history polling when response events are unavailable. +- Browser realtime Talk uses `talk.client.toolCall` for `openclaw_agent_consult` instead of exposing `chat.send` to provider-owned browser sessions. +- Transcription-only Talk uses `talk.session.create`, `talk.session.appendAudio`, `talk.session.cancelTurn`, and `talk.session.close`; clients subscribe to `talk.event` for partial/final transcript updates. - The gateway resolves Talk playback through `talk.speak` using the active Talk provider. Android falls back to local system TTS only when that RPC is unavailable. - macOS local MLX playback uses the bundled `openclaw-mlx-tts` helper when present, or an executable on `PATH`. Set `OPENCLAW_MLX_TTS_BIN` to point at a custom helper binary during development. 
- `stability` for `eleven_v3` is validated to `0.0`, `0.5`, or `1.0`; other models accept `0..1`. diff --git a/docs/platforms/ios.md b/docs/platforms/ios.md index 7ea4153cd..2b8d01c86 100644 --- a/docs/platforms/ios.md +++ b/docs/platforms/ios.md @@ -263,6 +263,10 @@ openclaw nodes invoke --node "iOS Node" --command canvas.snapshot --params '{"ma ## Voice wake + talk mode - Voice wake and talk mode are available in Settings. +- Talk-capable iOS nodes advertise the `talk` capability and can declare + `talk.ptt.start`, `talk.ptt.stop`, `talk.ptt.cancel`, and `talk.ptt.once`; + the Gateway allows those push-to-talk commands by default for trusted + Talk-capable nodes. - iOS may suspend background audio; treat voice features as best-effort when the app is not active. ## Common errors diff --git a/docs/plugins/sdk-migration.md b/docs/plugins/sdk-migration.md index 0db0a8de0..153edf109 100644 --- a/docs/plugins/sdk-migration.md +++ b/docs/plugins/sdk-migration.md @@ -77,6 +77,118 @@ Current bundled provider examples: - OpenRouter keeps provider builder and onboarding/config helpers in its own `api.ts` +## Talk and realtime voice migration plan + +Realtime voice, telephony, meeting, and browser Talk code is moving from +surface-local turn bookkeeping to a shared Talk session controller exported by +`openclaw/plugin-sdk/realtime-voice`. The new controller owns the common Talk +event envelope, active turn state, capture state, output-audio state, recent +event history, and stale-turn rejection. Provider plugins should keep owning +vendor-specific realtime sessions; surface plugins should keep owning capture, +playback, telephony, and meeting quirks. + +This Talk migration is intentionally breaking-clean: + +1. Keep the shared controller/runtime primitives in + `plugin-sdk/realtime-voice`. +2. Move bundled surfaces onto the shared controller: browser relay, + managed-room handoff, voice-call realtime, voice-call streaming STT, Google + Meet realtime, and native push-to-talk. 
+3. Replace old Talk RPC families with the final `talk.session.*` and + `talk.client.*` API. +4. Advertise one live Talk event channel in Gateway + `hello-ok.features.events`: `talk.event`. +5. Delete the old realtime HTTP endpoint and any request-time instruction + override path. + +New code should not call `createTalkEventSequencer(...)` directly unless it is +implementing a low-level adapter or test fixture. Prefer the shared controller +so turn-scoped events cannot be emitted without a turn id, stale `turnEnd` / +`turnCancel` calls cannot clear a newer active turn, and output-audio lifecycle +events stay consistent across telephony, meetings, browser relay, managed-room +handoff, and native Talk clients. + +The target public API shape is: + +```typescript +// Gateway-owned Talk session API. +await gateway.request("talk.session.create", { + mode: "realtime", + transport: "gateway-relay", + brain: "agent-consult", + sessionKey: "main", +}); +await gateway.request("talk.session.appendAudio", { sessionId, audioBase64 }); +await gateway.request("talk.session.cancelOutput", { sessionId, reason: "barge-in" }); +await gateway.request("talk.session.submitToolResult", { sessionId, callId, result }); +await gateway.request("talk.session.close", { sessionId }); + +// Client-owned provider session API. +await gateway.request("talk.client.create", { + mode: "realtime", + transport: "webrtc", + brain: "agent-consult", + sessionKey: "main", +}); +await gateway.request("talk.client.toolCall", { sessionKey, callId, name, args }); +``` + +Browser-owned WebRTC/provider-websocket sessions use `talk.client.create`, +because the browser owns the provider negotiation and media transport while the +Gateway owns credentials, instructions, and tool policy. `talk.session.*` is the +common Gateway-managed surface for gateway-relay realtime, gateway-relay +transcription, and managed-room native STT/TTS sessions. 
+ +Legacy configs that placed realtime selectors beside `talk.provider` / +`talk.providers` should be repaired with `openclaw doctor --fix`; runtime Talk +does not reinterpret speech/TTS provider config as realtime provider config. + +The supported `talk.session.create` combinations are intentionally small: + +| Mode | Transport | Brain | Owner | Notes | +| --------------- | --------------- | --------------- | ------------------ | ------------------------------------------------------------------------------------------------------------------ | +| `realtime` | `gateway-relay` | `agent-consult` | Gateway | Full-duplex provider audio bridged through the Gateway; tool calls are routed through the agent-consult tool. | +| `transcription` | `gateway-relay` | `none` | Gateway | Streaming STT only; callers send input audio and receive transcript events. | +| `stt-tts` | `managed-room` | `agent-consult` | Native/client room | Push-to-talk and walkie-talkie style rooms where the client owns capture/playback and the Gateway owns turn state. | +| `stt-tts` | `managed-room` | `direct-tools` | Native/client room | Admin-only room mode for trusted first-party surfaces that execute Gateway tool actions directly. 
| + +Removed method map: + +| Old | New | +| -------------------------------- | -------------------------------------------------------- | +| `talk.realtime.session` | `talk.client.create` | +| `talk.realtime.toolCall` | `talk.client.toolCall` | +| `talk.realtime.relayAudio` | `talk.session.appendAudio` | +| `talk.realtime.relayCancel` | `talk.session.cancelOutput` or `talk.session.cancelTurn` | +| `talk.realtime.relayToolResult` | `talk.session.submitToolResult` | +| `talk.realtime.relayStop` | `talk.session.close` | +| `talk.transcription.session` | `talk.session.create({ mode: "transcription" })` | +| `talk.transcription.relayAudio` | `talk.session.appendAudio` | +| `talk.transcription.relayCancel` | `talk.session.cancelTurn` | +| `talk.transcription.relayStop` | `talk.session.close` | +| `talk.handoff.create` | `talk.session.create({ transport: "managed-room" })` | +| `talk.handoff.join` | `talk.session.join` | +| `talk.handoff.revoke` | `talk.session.close` | + +The unified control vocabulary is also deliberately narrow: + +| Method | Applies to | Contract | +| ------------------------------- | ------------------------------------------------------- | --------------------------------------------------------------------------------------------- | +| `talk.session.appendAudio` | `realtime/gateway-relay`, `transcription/gateway-relay` | Append a base64 PCM audio chunk to the provider session owned by the same Gateway connection. | +| `talk.session.startTurn` | `stt-tts/managed-room` | Start a managed-room user turn. | +| `talk.session.endTurn` | `stt-tts/managed-room` | End the active turn after stale-turn validation. | +| `talk.session.cancelTurn` | all Gateway-owned sessions | Cancel active capture/provider/agent/TTS work for a turn. | +| `talk.session.cancelOutput` | `realtime/gateway-relay` | Stop assistant audio output without necessarily ending the user turn. 
| +| `talk.session.submitToolResult` | `realtime/gateway-relay` | Complete a provider tool call emitted by the relay. | +| `talk.session.close` | all unified sessions | Stop relay sessions or revoke managed-room state, then forget the unified session id. | + +Do not introduce provider or platform special cases in core to make this work. +Core owns Talk session semantics. Provider plugins own vendor session setup. +Voice-call and Google Meet own telephony/meeting adapters. Browser and native +apps own device capture/playback UX. + +The detailed implementation plan lives in [Talk refactor plan](/refactor/talk). + ## Compatibility policy For external plugins, compatibility work follows this order: @@ -497,7 +609,7 @@ releases. | `plugin-sdk/speech` | Speech helpers | Speech provider types plus provider-facing directive, registry, validation helpers, and OpenAI-compatible TTS builder | | `plugin-sdk/speech-core` | Shared speech core | Speech provider types, registry, directives, normalization | | `plugin-sdk/realtime-transcription` | Realtime transcription helpers | Provider types, registry helpers, and shared WebSocket session helper | - | `plugin-sdk/realtime-voice` | Realtime voice helpers | Provider types, registry/resolution helpers, and bridge session helpers | + | `plugin-sdk/realtime-voice` | Realtime voice helpers | Provider types, registry/resolution helpers, bridge session helpers, shared agent talk-back queues, transcript/event health, echo suppression, and fast context consult helpers | | `plugin-sdk/image-generation` | Image-generation helpers | Image generation provider types plus image asset/data URL helpers and the OpenAI-compatible image provider builder | | `plugin-sdk/image-generation-core` | Shared image-generation core | Image-generation types, failover, auth, and registry helpers | | `plugin-sdk/music-generation` | Music-generation helpers | Music-generation provider/request/result types | diff --git a/docs/plugins/sdk-provider-plugins.md 
b/docs/plugins/sdk-provider-plugins.md index cfe7cb209..2a4c058f7 100644 --- a/docs/plugins/sdk-provider-plugins.md +++ b/docs/plugins/sdk-provider-plugins.md @@ -588,6 +588,13 @@ API key auth, and dynamic model resolution. api.registerRealtimeVoiceProvider({ id: "acme-ai", label: "Acme Realtime Voice", + capabilities: { + transports: ["gateway-relay"], + inputAudioFormats: [{ encoding: "pcm16", sampleRateHz: 24000, channels: 1 }], + outputAudioFormats: [{ encoding: "pcm16", sampleRateHz: 24000, channels: 1 }], + supportsBargeIn: true, + supportsToolCalls: true, + }, isConfigured: ({ providerConfig }) => Boolean(providerConfig.apiKey), createBridge: (req) => ({ // Set this only if the provider accepts multiple tool responses for @@ -606,9 +613,11 @@ API key auth, and dynamic model resolution. }); ``` - Implement `handleBargeIn` when a transport can detect that a human is - interrupting assistant playback and the provider supports truncating or - clearing the active audio response. + Declare `capabilities` so `talk.catalog` can expose valid modes, + transports, audio formats, and feature flags to browser and native Talk + clients. Implement `handleBargeIn` when a transport can detect that a + human is interrupting assistant playback and the provider supports + truncating or clearing the active audio response. 
```typescript diff --git a/docs/refactor/talk-api-contract.md b/docs/refactor/talk-api-contract.md new file mode 100644 index 000000000..d39d8d0c6 --- /dev/null +++ b/docs/refactor/talk-api-contract.md @@ -0,0 +1,320 @@ +--- +summary: "Detailed API, event, runtime, cancellation, and tool-policy contract for the Talk refactor" +read_when: + - Implementing Talk Gateway methods or protocol schemas + - Changing Talk config, events, cancellation, or provider tool policy + - Reviewing whether a Talk behavior belongs in core or an adapter +title: "Talk API and runtime contract" +--- + +# Talk API And Runtime Contract + +This is the detailed contract for [Talk refactor plan](/refactor/talk). + +## Config Contract + +Config stays under the existing `talk` object. Do not add `talk.speech` in this +refactor. + +```ts +type TalkConfig = { + provider?: string; + providers?: Record; + realtime?: { + provider?: string; + model?: string; + voice?: string; + mode?: TalkMode; + transport?: TalkTransport; + brain?: TalkBrain; + providers?: Record; + }; + input?: { + interruptOnSpeech?: boolean; + silenceTimeoutMs?: number; + }; +}; +``` + +Rules: + +- `talk.provider` and `talk.providers.*` remain speech/STT/TTS provider config. +- `talk.realtime.provider` and `talk.realtime.providers.*` are realtime voice provider config. +- `talk.config` returns effective config without secrets unless privileged. +- `talk.catalog` returns capabilities, not inferred provider-id guesses. +- Doctor migrates old realtime selectors into `talk.realtime`. +- Runtime does not silently reinterpret Voice Call or TTS config as realtime config. + +## Method Semantics + +### `talk.catalog` + +Returns effective Talk capabilities: + +- modes +- transports +- brain strategies +- providers +- models +- voices +- input audio formats +- output audio formats +- browser-safe client session support +- Gateway relay support +- managed-room support +- local STT/TTS support + +Provider capability declarations drive this. 
Core must not infer support from +provider ids. + +### `talk.speak` + +One-shot TTS: + +```ts +await gateway.request("talk.speak", { + text: "Ready.", + voice: "alloy", +}); +``` + +`talk.speak` does not create live session state, turn state, transcript state, +barge-in state, or provider realtime state. + +### `talk.client.create` + +Creates a client-owned provider session while Gateway still owns config, +instructions, credentials, and tool policy. + +Use it for browser WebRTC, browser provider WebSocket, and native provider media +sessions that require client-owned sockets. Reject `gateway-relay` and +`managed-room`; the error points clients to `talk.session.create`. + +### `talk.client.toolCall` + +Forwards provider tool calls from client-owned provider sessions to Gateway +policy: + +```ts +await gateway.request("talk.client.toolCall", { + sessionId, + callId, + name, + argumentsJson, +}); +``` + +Validate session identity, caller ownership, brain strategy, and policy. Pass an +`AbortSignal` into agent/tool runtime, reject stale or closed sessions, and never +accept request-time instructions. + +### `talk.session.create` + +Creates a Gateway-owned live Talk session. + +| Mode | Transport | Brain | Owner | +| --------------- | --------------- | --------------- | ------------------- | +| `realtime` | `gateway-relay` | `agent-consult` | Gateway | +| `transcription` | `gateway-relay` | `none` | Gateway | +| `stt-tts` | `managed-room` | `agent-consult` | Gateway/client room | +| `stt-tts` | `managed-room` | `direct-tools` | trusted room | + +Reject `webrtc` and `provider-websocket`; the error points clients to +`talk.client.create`. + +### `talk.session.join` + +Joins or reconnects to a Gateway-owned managed room. Validate session id and +token, never expose token hashes, emit `session.replaced` to the displaced +client, and emit `session.ready` to the new owner. 
+ +### `talk.session.appendAudio` + +Appends an input audio frame to a Gateway-owned relay session: + +```ts +await gateway.request("talk.session.appendAudio", { + sessionId, + audioBase64, + timestamp, +}); +``` + +Use for realtime Gateway relay and streaming transcription. Do not use this for +managed-room native push-to-talk when the native node captures audio locally and +returns transcript/output through node command results. + +### Turn Verbs + +Use explicit verbs instead of generic controls: + +```ts +await gateway.request("talk.session.startTurn", { sessionId }); +await gateway.request("talk.session.endTurn", { sessionId, turnId }); +await gateway.request("talk.session.cancelTurn", { sessionId, turnId, reason }); +await gateway.request("talk.session.cancelOutput", { sessionId, turnId, reason }); +``` + +`endTurn` rejects stale `turnId` before clearing active state. `cancelTurn` +aborts capture, STT, provider response, agent consult, tools, TTS, relay output, +and room streams tied to that turn. `cancelOutput` stops assistant audio without +necessarily ending the user turn. Barge-in must be speech/VAD gated. + +### `talk.session.submitToolResult` + +Completes a provider tool call emitted inside a Gateway-owned relay session: + +```ts +await gateway.request("talk.session.submitToolResult", { + sessionId, + callId, + output, +}); +``` + +### `talk.session.close` + +Closes a Gateway-owned session. Close emits one terminal event, stops capture and +playback, aborts provider and agent work, drains TTS, revokes room join state, +and removes retained state after its replay/debug window. 
+ +## Event Contract + +All live Talk paths emit one public event channel: + +```ts +talk.event; +``` + +Every event uses this envelope: + +```ts +type TalkEvent<TPayload = unknown> = { + id: string; + type: TalkEventType; + sessionId: string; + turnId?: string; + captureId?: string; + seq: number; + timestamp: string; + mode: TalkMode; + transport: TalkTransport; + brain: TalkBrain; + provider?: string; + final?: boolean; + callId?: string; + itemId?: string; + parentId?: string; + source?: string; + payload: TPayload; +}; +``` + +Core event types include `session.*`, `turn.*`, `capture.*`, `input.audio.*`, +`transcript.*`, `output.text.*`, `output.audio.*`, `tool.*`, `usage.metrics`, +`latency.metrics`, and `health.changed`. + +Rules: + +- `sessionId` is required for every event. +- `turnId` is required for turn-bound input, output, transcript, tool, and cancellation events. +- `captureId` is required while capture is active. +- `seq` monotonically increases per session. +- `timestamp` uses ISO 8601 UTC. +- `callId`, `itemId`, and `parentId` correlate provider responses, tool calls, TTS jobs, and relay frames. +- payloads must not duplicate large raw audio frames when transport already carries them. +- consumers should rely on envelope fields instead of provider-specific payloads. + +Text-ready is not audio-ready. Clients may show text after `output.text.done`, +but should not enter speaking/playback state until `output.audio.started` or +`output.audio.delta`. + +## Shared Runtime Target + +Keep one provider-agnostic runtime under `src/talk`.
The first pass keeps names +close to the old runtime modules so the move stays reviewable: + +```text +src/talk/ + audio-codec.ts + agent-consult-runtime.ts + agent-consult-tool.ts + agent-talkback-runtime.ts + fast-context-runtime.ts + provider-registry.ts + provider-resolver.ts + provider-types.ts + session-log-runtime.ts + session-runtime.ts + talk-events.ts + talk-session-controller.ts +``` + +New code should import the shared runtime from `src/talk` inside core. Plugins +that already use the stable SDK subpath keep importing +`openclaw/plugin-sdk/realtime-voice`; that facade re-exports the Talk runtime +contract without exposing core file layout. + +Responsibilities: + +- normalize modes, transports, brains, codecs, and audio metadata +- create, close, and replace session records +- allocate turn ids and capture ids +- reject stale turn ids before mutation +- sequence events +- retain recent events for replay, reconnect, and diagnostics +- track active input capture and assistant output +- coordinate barge-in and output cancellation +- propagate abort signals +- register provider tool calls and bind tool results +- expose test builders for session/event assertions + +Gateway method files should become thin adapters: + +```text +src/gateway/server-methods/ + talk.ts + talk-client.ts + talk-session.ts +``` + +Internal Gateway helpers may exist only as staging files while code moves to +`src/talk`. + +## Cancellation Contract + +Cancellation must abort underlying work, not only ignore stale output. 
+ +When a turn or session is cancelled: + +- provider realtime response is cancelled when supported +- provider session is closed or reset when cancellation cannot be scoped +- streaming STT receives abort +- agent consult receives abort +- queued tools do not start after abort +- already-started side-effecting tools receive abort and report cancellation +- pending TTS jobs are drained +- playback sources are stopped +- relay streams are cleared +- managed-room capture and output state reset +- stale finals and stale audio deltas are ignored +- one terminal cancellation event is emitted + +Barge-in uses VAD or provider speech-started signals, ignores silence and echo, +cancels output only after real user speech, and starts or ensures a turn before +emitting `turn.cancelled`. + +## Tool Policy Contract + +Gateway owns Talk tool policy. + +Client-owned flow: `talk.client.create`, provider tool call to client, +`talk.client.toolCall`, Gateway policy validation, agent/direct-tool execution, +client result submission to provider. + +Gateway-owned flow: `talk.session.create`, provider tool call to Gateway, +Gateway policy validation, agent/direct-tool execution, provider result +submission, `talk.event` emission. + +No Talk path accepts caller-provided instructions. Gateway builds instructions +from trusted config and session context. 
diff --git a/docs/refactor/talk-execution.md b/docs/refactor/talk-execution.md new file mode 100644 index 000000000..1a17bcbc5 --- /dev/null +++ b/docs/refactor/talk-execution.md @@ -0,0 +1,229 @@ +--- +summary: "Implementation packages, deletion checklist, test matrix, and verification commands for the Talk refactor" +read_when: + - Implementing the Talk refactor plan + - Deleting legacy Talk RPCs, event channels, or realtime endpoint code + - Verifying browser, native, telephony, meeting, STT, or TTS Talk behavior after refactor work +title: "Talk refactor execution checklist" +--- + +# Talk Refactor Execution Checklist + +Use this as the PR tracker for [Talk refactor plan](/refactor/talk). + +## Implementation Packages + +### Package 1: Protocol + +- update `src/gateway/protocol/schema/channels.ts` +- update `src/gateway/protocol/schema/protocol-schemas.ts` +- update `src/gateway/protocol/schema/types.ts` +- update `src/gateway/protocol/index.ts` +- regenerate generated protocol clients +- remove old schemas from generated metadata +- update protocol tests + +Done when old RPC/event names are absent from generated protocol output. + +### Package 2: Gateway Methods + +- split client-owned methods into `talk-client.ts` +- keep session-owned methods in `talk-session.ts` +- keep catalog/config/speak/mode in `talk.ts` +- classify every new method in method scopes +- advertise only `talk.event` in hello event features +- remove old method list entries +- update authorization tests + +Done when every public Talk method has an explicit scope. + +### Package 3: Session Runtime + +- add `src/talk` primitives +- move event sequencing into shared runtime +- move stale-turn rejection into shared runtime +- move active output state into shared runtime +- move cancellation bookkeeping into shared runtime +- expose small test helpers + +Done when relay, transcription, handoff, telephony, and meetings do not each +invent event and turn bookkeeping. 
+ +### Package 4: Browser UI + +- update realtime startup to `talk.client.create` +- update realtime tool consult to `talk.client.toolCall` +- update relay startup to `talk.session.create` +- update relay audio to `talk.session.appendAudio` +- update relay tool result to `talk.session.submitToolResult` +- update relay output cancel to `talk.session.cancelOutput` +- update relay close to `talk.session.close` +- listen only to `talk.event` +- remove relay mark RPC + +Done when UI tests prove no removed RPC names remain. + +### Package 5: Native And Nodes + +- route native Talk through session events +- map push-to-talk commands to managed-room turn lifecycle +- clean capture state on failed start +- keep local STT/TTS as adapter behavior +- remove chat-history polling from the success path +- keep fallback polling only if explicitly needed + +Done when native voice success path is event-driven. + +### Package 6: Voice Call + +- map telephony realtime events into `talk.event` +- map local speech detection to `startTurn`, `cancelOutput`, and `cancelTurn` +- pass abort through agent consult and tools +- keep marks, clear, u-law, and call lifecycle in the plugin +- add tests for early speech before provider speech-started + +Done when Voice Call shares event and cancellation semantics without leaking +telephony into core. + +### Package 7: Meetings + +- map meeting speech and transcript state into `talk.event` +- keep participant and room state in meeting adapter +- add echo-suppression aware barge-in tests +- ensure meeting adapters can choose realtime, transcription, or `stt-tts` + +Done when meeting behavior is an adapter over Talk, not a parallel realtime loop. 
+ +### Package 8: Doctor And Migration + +- detect old realtime selectors outside `talk.realtime` +- write explicit `talk.realtime.provider`, `model`, `voice`, `transport`, and `brain` +- report removed RPC names when logs show old clients +- keep startup free of hidden config rewrites +- update SDK migration, Gateway protocol, Talk node, Control UI, and TTS docs + +Done when runtime config is explicit and docs mention removed API only in +migration notes. + +## Deletion Checklist + +Delete or prove absent: + +- `src/gateway/voiceclaw-realtime/` +- `/voiceclaw/realtime` +- `instructionsOverride` +- `talk.realtime.*` public RPCs +- `talk.transcription.*` public RPCs +- `talk.handoff.*` public RPCs +- `talk.session.inputAudio` +- `talk.session.control` +- `talk.session.toolResult` +- `talk.realtime.relay` +- `talk.transcription.relay` +- old generated protocol models +- old UI relay method calls + +Keep these old names only in explicit migration tables. + +## Test Matrix + +Protocol: + +- final methods exist in protocol schemas +- removed methods are absent from protocol schemas +- final event is advertised in hello features +- removed events are absent from broadcast guards +- generated clients match schema +- request-time instruction override is rejected or impossible by schema + +Gateway: + +- `talk.client.create` creates WebRTC session result +- `talk.client.create` creates provider WebSocket session result +- `talk.client.create` rejects Gateway-owned transports +- `talk.client.toolCall` validates caller, session, brain, and policy +- `talk.session.create` creates realtime Gateway relay +- `talk.session.create` creates transcription relay +- `talk.session.create` creates STT/TTS managed room +- `talk.session.create` rejects client-owned transports +- `talk.session.join` replacement notifies displaced client +- `talk.session.appendAudio` routes to relay/transcription session +- `talk.session.startTurn` starts managed-room turn +- `talk.session.endTurn` rejects
stale turn ids +- `talk.session.cancelTurn` aborts provider, agent, tools, TTS, and streams +- `talk.session.cancelOutput` cancels playback only +- `talk.session.submitToolResult` binds to provider call id +- `talk.session.close` emits terminal event and releases resources + +Browser: + +- WebRTC path calls `talk.client.create` +- provider WebSocket path calls `talk.client.create` +- provider tool calls use `talk.client.toolCall` +- Gateway relay uses only `talk.session.*` +- Gateway relay listens only to `talk.event` +- barge-in requires speech/VAD +- relay close rejects or aborts pending consult runs +- no removed RPC names in UI tests + +Native: + +- push-to-talk start emits capture/turn events +- failed push-to-talk start cleans capture state +- cancel clears capture and output state +- STT/TTS success path is event-driven +- fallback polling is explicit and tested if kept +- node policy rejects untrusted Talk commands + +Telephony: + +- early speech before provider speech-started creates or guards turn before cancellation +- marks and clear events map to output state +- u-law codec stays adapter-owned +- cancellation aborts consult run +- closed call prevents stale tool result submission + +Meetings: + +- participant context appears as metadata, not core branching +- echo suppression prevents false barge-in +- transcript events use common envelope +- meeting close aborts active work + +Architecture: + +- no removed public RPC names in protocol metadata +- no retired realtime endpoint route +- no retired realtime folder +- no request-time instruction override field +- no core branches on app platform names +- provider behavior comes from capabilities + +## Verification Commands + +Focused local loop: + +```sh +pnpm test src/gateway/protocol/index.test.ts +pnpm test src/gateway/server-methods/talk.test.ts +pnpm test src/gateway/method-scopes.test.ts src/gateway/server-methods-list.test.ts +pnpm test src/gateway/talk-realtime-relay.test.ts 
src/gateway/talk-transcription-relay.test.ts +pnpm test ui/src/ui/realtime-talk.test.ts ui/src/ui/realtime-talk-gateway-relay.test.ts ui/src/ui/realtime-talk-webrtc.test.ts ui/src/ui/realtime-talk-google-live.test.ts +pnpm exec oxfmt --check --threads=1 docs/refactor/talk.md docs/refactor/talk-api-contract.md docs/refactor/talk-execution.md docs/refactor/talk-surfaces.md +``` + +Generation and docs: + +```sh +pnpm protocol:gen && pnpm protocol:gen:swift +pnpm docs:check-mdx +pnpm plugin-sdk:api:check +``` + +Broad gate before push: + +```sh +pnpm check:changed +``` + +Use Testbox for broad gates on maintainer machines. diff --git a/docs/refactor/talk-surfaces.md b/docs/refactor/talk-surfaces.md new file mode 100644 index 000000000..52c31dc1b --- /dev/null +++ b/docs/refactor/talk-surfaces.md @@ -0,0 +1,128 @@ +--- +summary: "Surface adapter plan for browser, native, walkie-talkie, telephony, and meeting Talk refactor work" +read_when: + - Updating browser realtime Talk, native Talk, walkie-talkie handoff, Voice Call, or meeting voice code + - Deciding whether a Talk behavior belongs in an adapter or shared runtime +title: "Talk surface mapping" +--- + +# Talk Surface Mapping + +This maps product surfaces into [Talk refactor plan](/refactor/talk) primitives.
+ +## Browser + +WebRTC: + +- call `talk.client.create` +- open provider media connection in browser +- forward provider tool calls through `talk.client.toolCall` +- receive provider audio through provider media/data channel + +Provider WebSocket: + +- call `talk.client.create` +- connect using constrained provider result +- keep provider-specific framing in the browser adapter +- forward tool calls through `talk.client.toolCall` + +Gateway relay: + +- call `talk.session.create` +- send PCM frames with `talk.session.appendAudio` +- listen only to `talk.event` +- submit tool results with `talk.session.submitToolResult` +- barge-in with `talk.session.cancelOutput` +- close with `talk.session.close` + +## Native And Nodes + +Native apps map local audio lifecycle into Talk primitives. + +Native realtime: + +- use `talk.client.create` when the app owns provider media +- use `talk.session.create` when Gateway owns provider relay + +Native STT/TTS: + +- use `talk.session.create({ mode: "stt-tts", transport: "managed-room" })` +- keep local STT and local TTS behind native adapters +- drive success path from Talk events +- keep history polling only as a degraded fallback if explicitly tested + +Native push-to-talk: + +- press maps to `talk.session.startTurn` +- release maps to `talk.session.endTurn` +- cancel maps to `talk.session.cancelTurn` +- node capture commands emit capture events +- failed start cleans capture state +- opening voice UI never mutates global Talk config + +Trusted node command adapters may remain: + +```ts +talk.ptt.start; +talk.ptt.stop; +talk.ptt.cancel; +talk.ptt.once; +``` + +## Walkie-Talkie + +Walkie-talkie is managed-room Talk: + +```ts +await gateway.request("talk.session.create", { + mode: "stt-tts", + transport: "managed-room", + brain: "agent-consult", + sessionKey, +}); +``` + +Then: + +- client joins with `talk.session.join` +- press calls `talk.session.startTurn` +- release calls `talk.session.endTurn` +- cancel calls 
`talk.session.cancelTurn` +- assistant speech emits `output.text.*` and `output.audio.*` +- replacement emits `session.replaced` to old owner +- close calls `talk.session.close` + +Room state includes canonical session id, route/channel target, caller identity, +mode, transport, brain, provider, model, voice, locale, expiry, token hash, +active client id, active turn id, and replacement state. + +Two simultaneous rooms must not share turn ids, transcripts, audio output, or +cancellation tokens. + +## Telephony + +Voice Call becomes a telephony adapter over Talk semantics. + +Keep telephony-owned: Twilio/Plivo WebSocket contracts, stream ids, call ids, +G.711 u-law, marks, clear events, backpressure, phone call lifecycle, and inbound +speech detection quirks. + +Move shared behavior to Talk: event envelope, turn ids, cancellation, agent +consult abort, tool policy, usage and latency metrics, and output state. + +Telephony should emit `talk.event` for observability, even if phone media +remains plugin-owned. + +## Meetings + +Google Meet and future meeting integrations become meeting adapters over Talk +semantics. + +Keep meeting-owned: meeting join/leave, participant identity, room permissions, +echo suppression, transcript context, and meeting-specific mute/deafen behavior. + +Move shared behavior to Talk: turn lifecycle, transcript events, assistant output +events, tool policy, cancellation, and metrics. + +Meeting adapters may run `transcription`, `stt-tts`, or `realtime` depending on +provider support. 
diff --git a/docs/refactor/talk.md b/docs/refactor/talk.md new file mode 100644 index 000000000..485448e0d --- /dev/null +++ b/docs/refactor/talk.md @@ -0,0 +1,499 @@ +--- +summary: "Breaking refactor plan for one Talk architecture across realtime voice, STT/TTS, browser, native, telephony, meetings, and walkie-talkie handoff" +read_when: + - Refactoring Talk mode, realtime voice, voice-call, Google Meet, browser realtime voice, native push-to-talk, STT, or TTS + - Changing Talk Gateway protocol, provider contracts, realtime transports, managed rooms, audio events, cancellation, or tool policy + - Deciding whether a voice feature belongs in core, a provider plugin, a native app, a meeting adapter, or a telephony adapter +title: "Talk refactor plan" +--- + +# Talk Refactor Plan + +This is the breaking-clean plan for unifying every live voice path behind one +Talk architecture. + +The old architecture grew by product surface: browser realtime, Gateway relay, +managed native handoff, streaming transcription, Voice Call, Google Meet, local +STT/TTS, one-shot TTS, and a retired realtime WebSocket endpoint each learned +their own names for sessions, turns, capture, output, barge-in, tool calls, +cancellation, and transcript events. + +The new architecture grows by primitive. There is one public Talk API, one +event envelope, one turn model, one cancellation contract, one provider policy +boundary, and one place for shared runtime state. Browser, native, telephony, +meetings, and walkie-talkie become adapters over those primitives. 
+ +## Product Target + +OpenClaw supports three Talk products: + +| Product | User experience | Mode | +| --------------------- | ----------------------------------------------------------------------- | --------------- | +| Realtime conversation | Low-latency duplex speech with interruption and provider tool calls | `realtime` | +| Walkie-talkie | Press or hold to speak, release, then hear OpenClaw answer | `stt-tts` | +| Transcription | Live captions, dictation, notes, meeting transcript, no assistant audio | `transcription` | + +All three products share session identity, join/reconnect state, turn and +capture ids, input audio metadata, output text/audio state, transcript finality, +tool-call correlation, cancellation, replay, provider capabilities, policy, +auth, and observability. + +One-shot uploaded audio and one-shot TTS do not need live Talk session state +unless they participate in live capture, turns, interruption, replay, or +cancellation. + +## Hard Decisions + +This refactor intentionally removes compatibility that would keep the design +muddy: + +- remove public `talk.realtime.*` RPCs +- remove public `talk.transcription.*` RPCs +- remove public `talk.handoff.*` RPCs +- remove generic `talk.session.inputAudio`, `talk.session.control`, and + `talk.session.toolResult` +- remove old relay event channels +- remove `/voiceclaw/realtime` +- remove `src/gateway/voiceclaw-realtime/` +- remove request-time instruction overrides +- keep `talk.speak` as one-shot TTS, not a live session API +- keep legacy realtime config repair in doctor, not startup +- keep platform and product names out of core branching + +## Vocabulary + +Keep mode, transport, brain, and surface separate. 
+ +```ts +type TalkMode = "realtime" | "stt-tts" | "transcription"; + +type TalkTransport = "webrtc" | "provider-websocket" | "gateway-relay" | "managed-room"; + +type TalkBrain = "agent-consult" | "direct-tools" | "none"; +``` + +### Modes + +`realtime` means a provider owns a live voice session. Audio goes in, audio +comes out, interruptions are possible, and provider tool calls may happen during +one provider session. + +`stt-tts` means input speech is transcribed, OpenClaw answers as text, and TTS +renders the answer. This is the native Talk and walkie-talkie path when a full +duplex provider session is not the right shape. + +`transcription` means speech-to-text without assistant audio output. It covers +captions, dictation, notes, meeting transcript capture, and live voice-note +ingestion. + +### Transports + +`webrtc` is client-owned SDP/media/data-channel transport. It fits browser-owned +OpenAI Realtime sessions with ephemeral credentials. + +`provider-websocket` is client-owned provider JSON and audio framing. It fits +browser-owned Google Live style sessions. + +`gateway-relay` means the Gateway owns the provider connection. The client sends +authenticated audio frames to the Gateway and receives `talk.event` plus audio +output through Gateway-managed relay state. + +`managed-room` means the Gateway owns a room-like session that clients can join, +replace, and drive with explicit turn verbs. It is the primitive for +walkie-talkie and native handoff. + +Telephony and meetings are not core transports. They are adapters that map +phone or meeting media into the `gateway-relay` or `managed-room` transport, in +`realtime`, `transcription`, or `stt-tts` mode, while keeping call and meeting +lifecycle outside core. + +### Brain Strategies + +`agent-consult` means provider tool calls or session turns consult an OpenClaw +agent. Gateway owns prompt construction, context selection, authorization, abort +signals, and final result delivery.
+ +`direct-tools` means a trusted first-party surface can call selected OpenClaw +tools directly through Gateway policy. Keep this privileged. + +`none` means transcription-only, external orchestration, or no OpenClaw tool +access. + +## Ownership Boundaries + +Core owns generic Talk semantics: + +- mode, transport, brain, codec, and audio descriptors +- session records and session ownership +- turn ids and capture ids +- event envelope, sequencing, replay, and stale-output suppression +- active capture state +- active assistant output state +- replacement and reconnect state +- cancellation propagation +- tool policy and tool-call correlation +- usage, latency, and health events + +Provider plugins own vendor behavior: + +- OpenAI Realtime SDP and data-channel details +- Google Live WebSocket framing +- streaming STT provider details +- TTS provider details +- provider auth, model, voice, codec, and resume quirks +- provider capability declarations + +Surface adapters own IO and product quirks: + +- browser capture and playback +- native audio sessions, local speech engines, and foreground Talk UX +- node command dispatch +- telephony media streams, marks, clear events, u-law, and call lifecycle +- meeting join/leave, participants, echo suppression, and authorization + +Core may store optional surface metadata for diagnostics. Core must not branch +on browser, iOS, Android, macOS, Google Meet, Voice Call, or any retired product +name. + +## Final Gateway API + +The public Gateway surface is deliberately small: + +```ts +// Discovery and configuration. +talk.catalog; +talk.config; + +// One-shot speech output. +talk.speak; + +// Client-owned provider sessions. +talk.client.create; +talk.client.toolCall; + +// Gateway-owned live sessions. 
+talk.session.create; +talk.session.join; +talk.session.appendAudio; +talk.session.startTurn; +talk.session.endTurn; +talk.session.cancelTurn; +talk.session.cancelOutput; +talk.session.submitToolResult; +talk.session.close; + +// Events and foreground node mode. +talk.event; +talk.mode; +``` + +Use `talk.client.*` when the client owns provider media transport. Use +`talk.session.*` when the Gateway owns live session state. + +`talk.mode` is the existing foreground node mode broadcast. It can stay, but it +is not part of the Talk session control API. + +### Supported Creation Matrix + +| Method | Mode | Transport | Brain | Owner | +| --------------------- | --------------- | -------------------- | --------------- | ------- | +| `talk.client.create` | `realtime` | `webrtc` | `agent-consult` | client | +| `talk.client.create` | `realtime` | `provider-websocket` | `agent-consult` | client | +| `talk.session.create` | `realtime` | `gateway-relay` | `agent-consult` | Gateway | +| `talk.session.create` | `transcription` | `gateway-relay` | `none` | Gateway | +| `talk.session.create` | `stt-tts` | `managed-room` | `agent-consult` | Gateway | +| `talk.session.create` | `stt-tts` | `managed-room` | `direct-tools` | Gateway | + +Reject combinations that blur ownership. `talk.client.create` must reject +Gateway-owned transports. `talk.session.create` must reject client-owned +transports. 
+ +## Removed API + +Remove these names from handlers, method lists, scopes, protocol schemas, +generated clients, broadcast guards, tests, and docs except explicit migration +tables: + +| Removed | Replacement | +| ------------------------------- | -------------------------------------------------------- | +| `talk.realtime.session` | `talk.client.create` | +| `talk.realtime.toolCall` | `talk.client.toolCall` | +| `talk.realtime.relayAudio` | `talk.session.appendAudio` | +| `talk.realtime.relayCancel` | `talk.session.cancelOutput` or `talk.session.cancelTurn` | +| `talk.realtime.relayMark` | internal relay output state | +| `talk.realtime.relayToolResult` | `talk.session.submitToolResult` | +| `talk.realtime.relayClose` | `talk.session.close` | +| `talk.realtime.relay` | `talk.event` | +| `talk.transcription.session` | `talk.session.create({ mode: "transcription" })` | +| `talk.transcription.audio` | `talk.session.appendAudio` | +| `talk.transcription.cancel` | `talk.session.cancelTurn` | +| `talk.transcription.close` | `talk.session.close` | +| `talk.transcription.relay` | `talk.event` | +| `talk.handoff.create` | `talk.session.create({ transport: "managed-room" })` | +| `talk.handoff.join` | `talk.session.join` | +| `talk.handoff.revoke` | `talk.session.close` | +| `talk.session.inputAudio` | `talk.session.appendAudio` | +| `talk.session.control` | explicit turn/output verbs | +| `talk.session.toolResult` | `talk.session.submitToolResult` | + +Delete this endpoint: + +```text +/voiceclaw/realtime +``` + +Delete this folder: + +```text +src/gateway/voiceclaw-realtime/ +``` + +Do not leave a compatibility namespace around retired code. 
+ +## Target Source Layout + +Shared runtime: + +```text +src/talk/ + audio-codec.ts + agent-consult-runtime.ts + agent-consult-tool.ts + agent-talkback-runtime.ts + fast-context-runtime.ts + provider-registry.ts + provider-resolver.ts + provider-types.ts + session-log-runtime.ts + session-runtime.ts + talk-events.ts + talk-session-controller.ts +``` + +Gateway adapters: + +```text +src/gateway/server-methods/ + talk.ts # catalog, config, speak, mode, composition + talk-client.ts # client-owned provider sessions + talk-session.ts # Gateway-owned live sessions +``` + +Gateway relay helpers can exist while the code moves, but the long-term shape +is that relay, transcription, and handoff state use `src/talk` primitives +instead of each reimplementing turns and events. + +Public SDK: + +```text +src/plugin-sdk/realtime-voice.ts +``` + +Keep this SDK subpath as the stable plugin import facade. It may re-export +Talk runtime contracts, but plugin authors should not import core file layout. + +## Event Contract + +All live paths emit `talk.event` with the envelope defined in +[Talk API and runtime contract](/refactor/talk-api-contract). The required +shape is: `id`, `type`, `sessionId`, `seq`, `timestamp`, `mode`, `transport`, +`brain`, and `payload`, with `turnId`, `captureId`, `callId`, `itemId`, and +`parentId` when the event is tied to turn, capture, provider item, tool call, or +TTS output. + +Core event families are `session.*`, `turn.*`, `capture.*`, `input.audio.*`, +`transcript.*`, `output.text.*`, `output.audio.*`, `tool.*`, `usage.metrics`, +`latency.metrics`, and `health.changed`. Payloads must not duplicate large raw +audio frames when the transport already carries them. Text-ready is not +audio-ready; clients enter playback state only on audio events. + +## Cancellation Contract + +Cancellation must abort underlying work, not only ignore stale output. 
+ +When a turn or session is cancelled: + +- provider realtime response is cancelled when supported +- provider session is closed or reset when cancellation cannot be scoped +- streaming STT receives abort +- agent consult receives abort +- queued tools do not start after abort +- already-started side-effecting tools receive abort and report cancellation +- pending TTS jobs are drained +- playback sources are stopped +- relay streams are cleared +- managed-room capture and output state reset +- stale finals and stale audio deltas are ignored +- one terminal cancellation event is emitted + +Barge-in requires real speech: provider speech-started, local VAD, or an +adapter-owned speech detector. Silence, echo, or microphone buffers alone must +not cancel assistant output. + +## Config Contract + +Config stays under `talk`; do not add `talk.speech`. `talk.provider` and +`talk.providers.*` remain speech/STT/TTS provider config. Realtime selectors +live under `talk.realtime.provider`, `talk.realtime.providers.*`, `model`, +`voice`, `mode`, `transport`, and `brain`. + +`talk.config` returns effective config without secrets unless privileged. +`talk.catalog` returns provider capabilities, not inferred provider-id guesses. +Doctor migrates old realtime placement into `talk.realtime`; runtime startup +does not reinterpret Voice Call, STT, or TTS config as realtime config. 
+ +## Surface Mapping + +| Surface | Talk mapping | +| ------------------------------- | ----------------------------------------------------------------------------------------------------- | +| Browser WebRTC | `talk.client.create`, client-owned provider media, `talk.client.toolCall` for provider tool calls | +| Browser provider WebSocket | `talk.client.create`, browser-owned provider framing, Gateway-owned credentials and policy | +| Browser Gateway relay | `talk.session.create`, `appendAudio`, `submitToolResult`, `cancelOutput`, `close`, and `talk.event` | +| Native push-to-talk | `stt-tts` plus `managed-room`; press/startTurn, release/endTurn, cancel/cancelTurn | +| Walkie-talkie | managed-room join/replacement plus shared turn/output events | +| Voice Call | telephony adapter over Talk events; call ids, stream ids, u-law, marks, clear events stay plugin side | +| Google Meet and future meetings | meeting adapter over Talk events; participant state, permissions, mute, and echo suppression stay out | + +See [Talk surface mapping](/refactor/talk-surfaces) for the adapter-level +rules. + +## Detailed Refactor Phases + +### Phase 1: Protocol Is The Source Of Truth + +- define final `talk.client.*`, `talk.session.*`, `talk.event`, `talk.catalog`, `talk.config`, `talk.speak`, and `talk.mode` +- delete removed RPCs from method lists and generated metadata +- delete removed event channels from hello feature advertising +- classify every final method in `METHOD_SCOPE_GROUPS` +- regenerate TypeScript and Swift protocol clients +- add protocol tests proving removed names are absent + +Exit criteria: generated clients expose only the final public Talk API. 
+ +### Phase 2: Shared Runtime Becomes `src/talk` + +- move provider-agnostic realtime voice modules into `src/talk` +- keep the plugin SDK facade at `openclaw/plugin-sdk/realtime-voice` +- rename logs and tests from realtime-voice wording to Talk wording where that improves clarity +- centralize event sequencing, active turn state, capture state, output state, stale-turn rejection, and replay history +- keep provider adapters out of this folder + +Exit criteria: core and bundled surfaces import shared semantics from `src/talk` +or the SDK facade, not from surface-local helpers. + +### Phase 3: Gateway Method Split + +- make `talk.ts` a composition point for catalog, config, speak, mode, client, and session handlers +- put client-owned provider session methods in `talk-client.ts` +- put Gateway-owned session methods in `talk-session.ts` +- make relay, transcription, and managed-room handlers thin adapters over shared runtime primitives +- route session replacement notifications to the displaced connection +- reject stale turn completion before mutating active room state + +Exit criteria: public RPC handlers read like API adapters, not separate Talk +implementations. + +### Phase 4: Browser UI Uses The Final API + +- update WebRTC and provider WebSocket startup to `talk.client.create` +- update browser provider tool calls to `talk.client.toolCall` +- update Gateway relay startup to `talk.session.create` +- update relay audio to `talk.session.appendAudio` +- update relay tool result submission to `talk.session.submitToolResult` +- update relay close to `talk.session.close` +- listen only to `talk.event` +- handle aborted consult runs immediately instead of timing out +- gate relay barge-in on speech or VAD + +Exit criteria: UI tests contain no calls to removed Talk RPC names. 
+ +### Phase 5: Native And Nodes Become Event-Driven + +- map native push-to-talk into managed-room sessions +- start, end, cancel, and replace turns through explicit session verbs +- clean capture state when push-to-talk start fails +- keep local STT and TTS as native adapter behavior +- remove chat-history polling from the success path +- keep fallback polling only if there is an explicit degraded-mode test + +Exit criteria: native Talk success path is driven by `talk.event`, not hidden +chat side effects. + +### Phase 6: Telephony And Meetings Become Adapters + +- map Voice Call realtime and streaming STT into Talk event/cancellation semantics +- create or guard a turn before early speech cancellation events +- keep telephony codec, marks, clear events, and call lifecycle outside core +- map Google Meet transcript and assistant output into `talk.event` +- keep participant and echo-suppression behavior in the meeting adapter +- pass abort signals into agent consult and tool runtime + +Exit criteria: Voice Call and meetings share event and cancellation semantics +without introducing telephony or meeting branches in core. + +### Phase 7: Config And Doctor Cleanup + +- keep `talk.provider` and `talk.providers.*` as speech/STT/TTS config +- keep realtime voice selectors under `talk.realtime` +- make `talk.config` return only resolved effective provider data +- repair legacy realtime placement in doctor +- document that runtime startup does not guess or rewrite config +- update SDK migration, Gateway protocol, Talk node, Control UI, and TTS docs + +Exit criteria: no second speech namespace, no startup migrations, and no +ambiguous active provider in `talk.config`. 
+ +### Phase 8: Delete The Retired Stack + +- remove `/voiceclaw/realtime` +- delete `src/gateway/voiceclaw-realtime/` +- remove request-time `instructionsOverride` +- remove old RPC handlers, scopes, broadcast guards, protocol schemas, generated clients, docs, and UI calls +- keep old names only in explicit migration tables and negative tests + +Exit criteria: repository search finds removed public names only in migration +notes or tests that assert absence. + +## Test And Verification Plan + +The full matrix lives in +[Talk refactor execution checklist](/refactor/talk-execution). The required +proof areas are: + +- protocol and generated clients expose only the final Talk API +- Gateway tests cover every `talk.client.*` and `talk.session.*` method +- UI tests prove browser WebRTC, provider WebSocket, and relay paths use the final API +- native tests prove managed-room push-to-talk cleanup, replacement, and event flow +- Voice Call and meeting tests prove early speech, barge-in, output state, and cancellation behavior +- config tests prove `talk.config` reports only resolved effective provider data +- architecture searches prove removed RPCs, events, endpoint, folder, and instruction override stay gone +- docs, protocol generation, SDK API checks, Android tests, build, and `pnpm check:changed` pass before push + +## Definition Of Done + +The refactor is complete when: + +- final API is the only advertised public API +- removed RPCs are gone from handlers, scopes, method lists, schemas, generated clients, docs, and UI +- removed event channels are gone +- retired realtime HTTP endpoint is gone +- retired realtime folder is gone +- browser Talk works through `talk.client.*` or `talk.session.*` +- native Talk works through session events +- streaming STT works through `talk.session.*` +- TTS one-shot remains `talk.speak` +- walkie-talkie works through managed-room sessions +- Voice Call and meetings use shared events and cancellation semantics +- cancellation aborts 
underlying work +- event envelopes are consistent +- config migration is handled by doctor +- tests prove the deleted API cannot accidentally return + +Supporting details: + +- [Talk API and runtime contract](/refactor/talk-api-contract) +- [Talk surface mapping](/refactor/talk-surfaces) +- [Talk refactor execution checklist](/refactor/talk-execution) + +The end state: one Talk system, a small public API, provider-owned vendor +logic, surface-owned IO, and a Gateway core that owns policy, events, sessions, +turns, cancellation, and observability. diff --git a/docs/tools/media-overview.md b/docs/tools/media-overview.md index b1bea44b6..3b4e5df80 100644 --- a/docs/tools/media-overview.md +++ b/docs/tools/media-overview.md @@ -14,6 +14,12 @@ media capabilities are tool-driven: the agent decides when to use them based on the conversation, and each tool only appears when at least one backing provider is configured. +Live speech uses the Talk session contract instead of the one-shot media tool +path. Talk has three modes: provider-native `realtime`, local or streaming +`stt-tts`, and `transcription` for observe-only speech capture. Those modes +share provider catalogs, event envelopes, and cancellation semantics with +telephony, meetings, browser realtime, and native push-to-talk clients. + ## Capabilities @@ -110,6 +116,11 @@ Deepgram, ElevenLabs, Mistral, OpenAI, and xAI also register Voice Call streaming STT providers, so live phone audio can be forwarded to the selected vendor without waiting for a completed recording. +For live user conversations, prefer [Talk mode](/nodes/talk). Batch audio +attachments stay on the media path; browser realtime, native push-to-talk, +telephony, and meeting audio should use Talk events and the session-scoped +catalogs returned by the Gateway. + ## Provider mappings (how vendors split across surfaces) @@ -144,3 +155,4 @@ vendor without waiting for a completed recording. 
- [Text-to-speech](/tools/tts) - [Media understanding](/nodes/media-understanding) - [Audio nodes](/nodes/audio) +- [Talk mode](/nodes/talk) diff --git a/docs/tools/tts.md b/docs/tools/tts.md index 78bfc10bb..1aafb2ebe 100644 --- a/docs/tools/tts.md +++ b/docs/tools/tts.md @@ -12,6 +12,11 @@ OpenClaw can convert outbound replies into audio across **14 speech providers** and deliver native voice messages on Feishu, Matrix, Telegram, and WhatsApp, audio attachments everywhere else, and PCM/Ulaw streams for telephony and Talk. +TTS is the speech-output half of Talk's `stt-tts` mode. Provider-native +`realtime` Talk sessions synthesize speech inside the realtime provider instead +of calling this TTS path, while `transcription` sessions do not synthesize an +assistant voice response. + ## Quick start @@ -586,6 +591,11 @@ attempted provider: The whole TTS request only fails when **every** attempted provider is skipped or fails. +Talk session provider selection is session-scoped. A Talk client should choose +provider ids, model ids, voice ids, and locales from `talk.catalog` and pass +them through the Talk session or handoff request. Opening a voice session should +not mutate `messages.tts` or global Talk provider defaults. + ## Model-driven directives By default, the assistant **can** emit `[[tts:...]]` directives to override diff --git a/docs/web/control-ui.md b/docs/web/control-ui.md index d3a1e9032..5b0eb77a7 100644 --- a/docs/web/control-ui.md +++ b/docs/web/control-ui.md @@ -96,7 +96,7 @@ Imported themes are stored only in the current browser profile. They are not wri - Chat with the model via Gateway WS (`chat.history`, `chat.send`, `chat.abort`, `chat.inject`). - - Talk through browser realtime sessions. OpenAI uses direct WebRTC, Google Live uses a constrained one-use browser token over WebSocket, and backend-only realtime voice plugins use the Gateway relay transport. 
The relay keeps provider credentials on the Gateway while the browser streams microphone PCM through `talk.realtime.relay*` RPCs and sends `openclaw_agent_consult` tool calls back through `chat.send` for the larger configured OpenClaw model. + - Talk through browser realtime sessions. OpenAI uses direct WebRTC, Google Live uses a constrained one-use browser token over WebSocket, and backend-only realtime voice plugins use the Gateway relay transport. Client-owned provider sessions start with `talk.client.create`; Gateway relay sessions start with `talk.session.create`. The relay keeps provider credentials on the Gateway while the browser streams microphone PCM through `talk.session.appendAudio` and forwards `openclaw_agent_consult` provider tool calls through `talk.client.toolCall` for Gateway policy and the larger configured OpenClaw model. - Stream tool calls + live tool output cards in Chat (agent events). @@ -168,9 +168,9 @@ Imported themes are stored only in the current browser profile. They are not wri - Talk mode uses a registered realtime voice provider. Configure OpenAI with `talk.provider: "openai"` plus `talk.providers.openai.apiKey`, or configure Google with `talk.provider: "google"` plus `talk.providers.google.apiKey`; Voice Call realtime provider config can still be reused as the fallback. The browser never receives a standard provider API key. OpenAI receives an ephemeral Realtime client secret for WebRTC. Google Live receives a one-use constrained Live API auth token for a browser WebSocket session, with instructions and tool declarations locked into the token by the Gateway. Providers that only expose a backend realtime bridge run through the Gateway relay transport, so credentials and vendor sockets stay server-side while browser audio moves through authenticated Gateway RPCs. The Realtime session prompt is assembled by the Gateway; `talk.realtime.session` does not accept caller-provided instruction overrides. 
+ Talk mode uses a registered realtime voice provider. Configure OpenAI with `talk.realtime.provider: "openai"` plus `talk.realtime.providers.openai.apiKey`, or configure Google with `talk.realtime.provider: "google"` plus `talk.realtime.providers.google.apiKey`. The browser never receives a standard provider API key. OpenAI receives an ephemeral Realtime client secret for WebRTC. Google Live receives a one-use constrained Live API auth token for a browser WebSocket session, with instructions and tool declarations locked into the token by the Gateway. Providers that only expose a backend realtime bridge run through the Gateway relay transport, so credentials and vendor sockets stay server-side while browser audio moves through authenticated Gateway RPCs. The Realtime session prompt is assembled by the Gateway; `talk.client.create` does not accept caller-provided instruction overrides. - In the Chat composer, the Talk control is the waves button next to the microphone dictation button. When Talk starts, the composer status row shows `Connecting Talk...`, then `Talk live` while audio is connected, or `Asking OpenClaw...` while a realtime tool call is consulting the configured larger model through `chat.send`. + In the Chat composer, the Talk control is the waves button next to the microphone dictation button. When Talk starts, the composer status row shows `Connecting Talk...`, then `Talk live` while audio is connected, or `Asking OpenClaw...` while a realtime tool call is consulting the configured larger model through `talk.client.toolCall`. Maintainer live smoke: `OPENAI_API_KEY=... GEMINI_API_KEY=... node --import tsx scripts/dev/realtime-talk-live-smoke.ts` verifies the OpenAI browser WebRTC SDP exchange, Google Live constrained-token browser WebSocket setup, and the Gateway relay browser adapter with fake microphone media. The command prints provider status only and does not log secrets.