Compare commits

...

106 Commits

Author SHA1 Message Date
Scott Hanselman
89fc8bb053 security: harden system.run approval checks
Closes #184 by blocking dangerous environment overrides and by re-evaluating nested shell-wrapper payloads and chained commands against the exec approval policy.

This extends the partial env-only approach discussed in PR #186 so the Windows node closes both vectors called out in the issue.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
2026-04-19 21:32:26 -07:00
Nich Overend
7b4854175e Merge remote-tracking branch 'upstream/master' into feature/voice-mode
# Conflicts:
#	tests/OpenClaw.Shared.Tests/OpenClawGatewayClientTests.cs
2026-04-08 23:25:40 +01:00
Nich Overend
57ae5322fd
Merge pull request #6 from NichUK/codex/fix-pr120-ci
Add tests for voice provider configuration helpers
2026-04-04 18:40:39 +01:00
Nich
5534e9399e Add tests for voice provider configuration helpers
Cover the pure shared logic in VoiceProviderConfigurationStoreExtensions with focused unit tests for case-insensitive provider lookup, case-insensitive setting lookup, SetValue creation/update behavior, and removal of blank or null values.
2026-04-04 18:23:05 +01:00
Nich Overend
88caa1a632
Merge pull request #5 from NichUK/codex/fix-pr120-ci
Refactor tray voice code into OpenClaw.Tray.Shared
2026-04-03 16:49:53 +01:00
Nich Overend
7e7bb329fe
Update src/OpenClaw.Tray.WinUI/Helpers/AppIconHelper.cs
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-04-03 16:26:57 +01:00
Nich Overend
4f3c8c6b3b
Update src/OpenClaw.Tray.WinUI/App.xaml.cs
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-04-03 16:26:41 +01:00
Nich Overend
fd6c89f0d7
Update src/OpenClaw.Tray.WinUI/App.xaml.cs
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-04-03 16:25:18 +01:00
Nich
91006ce0bb Refactor tray voice code into OpenClaw.Tray.Shared
Move voice-mode test-targeted logic out of the WinUI app and into a dedicated shared project so tray tests no longer need to reference OpenClaw.Tray.WinUI directly.

This restores the original CI assumption that the tray test project can be built on its own without transitively building a Windows App SDK application with an implicit architecture. It also keeps the voice/chat extraction scoped away from the broader OpenClaw.Shared library, which remains general-purpose and non-tray-specific.

The new OpenClaw.Tray.Shared project now contains the shared voice/chat surface used by both the tray app and tray tests, including voice transport helpers, provider catalog loading, cloud TTS support, chat coordination, and the web chat DOM bridge. The WinUI app retains the UI shell pieces, including DispatcherQueueAdapter and the app-level icon path helper.

As a follow-up cleanup during the extraction, split the previous IconHelper into AppIconHelper in the WinUI project and VoiceTrayIconHelper in the shared tray project so the new shared library stays focused on voice-related behavior rather than wider tray infrastructure.
2026-04-03 16:06:22 +01:00
Nich Overend
1820901ea4 Remove incorrect project binding to x64 2026-04-02 22:22:29 +01:00
Nich Overend
83c79835c3 Revert "Fix SupportsSpeechToTextRuntime test assertions to match streaming provider route kinds"
Reverts CoPilot fix
This reverts commit 78d0a3daea.
2026-04-02 22:11:48 +01:00
Nich Overend
a6592e8699 Merge branch 'feature/voice-mode' of github.com:NichUK/openclaw-windows-node into feature/voice-mode 2026-04-02 22:06:33 +01:00
Nich Overend
3d5b4e79b2 Fix incorrect test project binding to x64 2026-04-02 22:06:18 +01:00
copilot-swe-agent[bot]
78d0a3daea
Fix SupportsSpeechToTextRuntime test assertions to match streaming provider route kinds
Agent-Logs-Url: https://github.com/NichUK/openclaw-windows-node/sessions/f2ae3d04-4f08-49c2-8095-9e801a4ccf6d

Co-authored-by: NichUK <346792+NichUK@users.noreply.github.com>
2026-04-02 20:59:33 +00:00
Nich Overend
777088cb41
Update src/OpenClaw.Tray.WinUI/Services/Voice/VoiceCloudTextToSpeechClient.cs
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-04-02 21:56:38 +01:00
Nich Overend
6c9680f253 Fix tests after master merge 2026-04-02 21:35:16 +01:00
Nich Overend
db108f4820 Merge origin/master into feature/voice-mode 2026-04-02 21:19:53 +01:00
Nich Overend
fc66745f39 Merge remote feature branch into feature/voice-mode 2026-03-30 18:43:57 +01:00
Nich Overend
c9dc8e88c5 Merge branch 'codex/webchat-direct-send-restore' into feature/voice-mode 2026-03-30 18:42:27 +01:00
Nich Overend
4756563dc3 Tweak voice mode README credit wording 2026-03-30 18:16:10 +01:00
Nich Overend
67771c2bc3 Address remaining voice review comments 2026-03-30 17:05:11 +01:00
Nich Overend
a78764006b Add voice mode feature icon asset 2026-03-30 12:12:19 +01:00
Nich Overend
dc8651c1c6 Refine voice mode controls and docs 2026-03-30 11:15:56 +01:00
Nich Overend
5443dc31f3 Refine voice transport and webchat draft bridge 2026-03-28 09:29:31 +00:00
Nich Overend
89ccb0800e Add compact voice repeater window 2026-03-28 09:29:07 +00:00
Nich Overend
c6be9007c0 Ship v0.1rc3 voice chat and docs fixes 2026-03-26 23:33:06 +00:00
Nich Overend
ad471b23bc Polish talk mode recovery and settings 2026-03-26 22:30:03 +00:00
Nich Overend
03e7e39643 Catalog future STT providers without exposing them 2026-03-26 17:50:31 +00:00
Nich Overend
7d074d7d45 Stop rebuilding Windows STT on idle timeouts 2026-03-26 17:37:37 +00:00
Nich Overend
1515e438ab Split Talk Mode STT into route classes 2026-03-26 16:57:52 +00:00
Nich Overend
f1db7b7c17 Fix stale talk mode restart latch 2026-03-26 10:49:59 +00:00
Nich Overend
8b81870e9e Log talk mode recognizer restart decisions 2026-03-26 10:37:42 +00:00
Nich Overend
ea3fb7df52 Merge remote-tracking branch 'origin/feature/voice-mode' into feature/voice-mode
# Conflicts:
#	src/OpenClaw.Tray.WinUI/Services/Voice/VoiceCloudTextToSpeechClient.cs
2026-03-26 10:31:49 +00:00
Nich Overend
5628675781
Merge pull request #3 from NichUK/copilot/sub-pr-2
Thread CancellationToken + timeout through WebSocket TTS operations
2026-03-26 10:25:21 +00:00
copilot-swe-agent[bot]
c8f5ba1c2f Add VoiceCloudTextToSpeechClient cancellation and decode tests
Co-authored-by: NichUK <346792+NichUK@users.noreply.github.com>
Agent-Logs-Url: https://github.com/NichUK/openclaw-windows-node/sessions/368e6f83-a2f3-412c-bac7-47d57ddd4d92
2026-03-26 10:12:26 +00:00
copilot-swe-agent[bot]
2a93e5a74b Simplify to single CancellationTokenSource with linked token and timeout
Co-authored-by: NichUK <346792+NichUK@users.noreply.github.com>
Agent-Logs-Url: https://github.com/NichUK/openclaw-windows-node/sessions/556f63ce-3524-4508-9ac9-5b05a7697956
2026-03-26 10:05:07 +00:00
copilot-swe-agent[bot]
7c9c8b6852 Merge remote-tracking branch 'origin/feature/voice-mode' into copilot/sub-pr-2
# Conflicts:
#	src/OpenClaw.Tray.WinUI/Services/Voice/VoiceCloudTextToSpeechClient.cs
2026-03-26 10:00:25 +00:00
Nich Overend
40b1aa6cdd
Merge pull request #4 from NichUK/copilot/sub-pr-2-again
Propagate CancellationToken through WebSocket TTS call chain
2026-03-26 09:56:38 +00:00
copilot-swe-agent[bot]
5bad5420bf Thread CancellationToken through WebSocket TTS operations to prevent indefinite hangs
Co-authored-by: NichUK <346792+NichUK@users.noreply.github.com>
Agent-Logs-Url: https://github.com/NichUK/openclaw-windows-node/sessions/0cc237fa-b2b8-427a-83e8-4375e2c3f2fc
2026-03-26 09:47:31 +00:00
copilot-swe-agent[bot]
be22a393f9 Propagate CancellationToken through WebSocket TTS call chain
Co-authored-by: NichUK <346792+NichUK@users.noreply.github.com>
Agent-Logs-Url: https://github.com/NichUK/openclaw-windows-node/sessions/b0f37bbe-5816-430c-9069-9ebbdd02b0a1
2026-03-26 09:46:54 +00:00
Nich Overend
0a0c3bec3e
Update src/OpenClaw.Tray.WinUI/Windows/WebChatWindow.xaml.cs
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-03-26 09:44:19 +00:00
Nich Overend
7291d7846c
Update src/OpenClaw.Tray.WinUI/Services/Voice/VoiceProviderCatalogService.cs
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-03-26 09:43:48 +00:00
Nich Overend
91de10068c
Update src/OpenClaw.Shared/OpenClawGatewayClient.cs
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
2026-03-26 09:41:40 +00:00
copilot-swe-agent[bot]
14f42794d9 Initial plan 2026-03-26 09:41:32 +00:00
copilot-swe-agent[bot]
e8fefaf661 Initial plan 2026-03-26 09:41:22 +00:00
Nich Overend
8499202a8b Match ElevenLabs websocket generation flow 2026-03-25 23:19:32 +00:00
Nich Overend
39623a66c7 Stop premature ElevenLabs websocket EOS 2026-03-25 23:12:05 +00:00
Nich Overend
e22474bd58 Tune ElevenLabs websocket turn flushing 2026-03-25 23:06:25 +00:00
Nich Overend
5dadf15f31 Adjust voice tray icon states 2026-03-25 22:49:41 +00:00
Nich Overend
cd013ac099 Add ElevenLabs websocket TTS provider 2026-03-25 22:30:42 +00:00
Nich Overend
93f01dd6b3 Fix talk mode playback failure handling 2026-03-25 22:02:47 +00:00
Nich Overend
4bf802aa43 Fix overlapping talk mode recovery watchdogs 2026-03-25 21:52:07 +00:00
Nich Overend
bcdaca84fe Delay deaf recognizer recovery until silence 2026-03-25 21:42:10 +00:00
Nich Overend
5ca4af28a0 Stop recycling talk mode on silence 2026-03-25 21:33:17 +00:00
Nich Overend
a3059027a9 Inline parity exceeded notes 2026-03-25 21:17:38 +00:00
Nich Overend
cc1ab5c956 Document macOS voice parity backlog 2026-03-25 21:06:01 +00:00
Nich Overend
ee64cb9f38 Gate talk mode listening on capture readiness 2026-03-25 21:01:40 +00:00
Nich Overend
339a969a8e Fix AudioGraph frame buffer interop 2026-03-25 20:51:55 +00:00
Nich Overend
7046346e4e Add AudioGraph capture backbone for talk mode 2026-03-25 20:39:58 +00:00
Nich Overend
13174bab11 Document AudioGraph voice input architecture 2026-03-25 18:26:56 +00:00
Nich Overend
0709911c25 Avoid stale voice preview replies 2026-03-25 18:21:38 +00:00
Nich Overend
b7ea999edd Support selected playback devices 2026-03-25 18:01:24 +00:00
Nich Overend
cbb4fdcb5a Refresh talk mode on default mic changes 2026-03-25 17:10:54 +00:00
Nich Overend
dcddec2bdd Fix truncated voice input and MiniMax output 2026-03-25 16:56:09 +00:00
Nich Overend
40383a1f75 Add streaming playback backlog story 2026-03-25 16:44:20 +00:00
Nich Overend
3c6224415c Fix voice draft clearing and playback start 2026-03-25 16:42:12 +00:00
Nich Overend
e07234b97e Rebuild talk mode recognizer after deaf sessions 2026-03-25 16:33:42 +00:00
Nich Overend
c212c3f034 Fix stalled talk mode recognizer recycle 2026-03-25 16:19:10 +00:00
Nich Overend
7b8f118c1c Add configurable tray chat memory stripping 2026-03-25 15:52:45 +00:00
Nich Overend
d569f71b4f Revert talk mode to direct chat send 2026-03-25 11:12:58 +00:00
Nich Overend
d1c365567c Recycle stalled talk mode recognition sessions 2026-03-25 10:57:39 +00:00
Nich Overend
d61d82d0ee Delay talk mode ready state until recognizer warm-up 2026-03-25 10:41:04 +00:00
Nich Overend
d8cd664e42 Add voice mode commit timeline to docs 2026-03-25 00:58:10 +00:00
Nich Overend
06d508fd4d Accept late talk mode replies after timeout 2026-03-25 00:57:10 +00:00
Nich Overend
82e2958795 Add voice control and configuration APIs 2026-03-25 00:32:42 +00:00
Nich Overend
c3ded30d47 Queue talk mode replies for sequential playback 2026-03-25 00:10:16 +00:00
Nich Overend
ffa3fa234f Keep talk mode alive after input failures 2026-03-23 23:45:16 +00:00
Nich Overend
2ff57fc017 Add pre-response voice latency timing logs 2026-03-23 23:41:39 +00:00
Nich Overend
91ccec377f Add dynamic tray icons for voice states 2026-03-23 23:28:10 +00:00
Nich Overend
71d0de4286 Fix MiniMax websocket voice playback routing 2026-03-23 23:03:09 +00:00
Nich Overend
45ff8f8c0a Fix voice restart after settings save 2026-03-23 22:57:02 +00:00
Nich Overend
5efcebfe31 Add catalog-driven MiniMax WebSocket TTS 2026-03-23 22:47:04 +00:00
Nich Overend
05d7bae896 Use MiniMax api-uw endpoint for lower TTS latency 2026-03-23 22:36:02 +00:00
Nich Overend
d1374092d9 Tighten talk mode speech recognition filtering 2026-03-23 21:56:18 +00:00
Nich Overend
83f05ee7a0 Instrument voice output latency and reduce TTS buffering 2026-03-23 21:50:11 +00:00
Nich Overend
c1cc0ffcfc Ship voice provider catalog with the tray app 2026-03-23 21:31:39 +00:00
Nich Overend
85d7b906f1 Make cloud TTS voice settings fully catalog-driven 2026-03-23 21:16:27 +00:00
Nich Overend
47efc3e741 Move voice settings below node mode toggle 2026-03-23 20:27:07 +00:00
Nich Overend
199e534dd3 Rename voice modes to VoiceWake and TalkMode 2026-03-23 13:38:54 +00:00
Nich Overend
ded41a2cfe Generalize cloud TTS providers through catalog contracts 2026-03-23 13:26:56 +00:00
Nich Overend
6dba89bbf8 Extract hosted voice settings panel from settings window 2026-03-23 13:09:56 +00:00
Nich Overend
907a1a0d37 Move voice settings into main settings window 2026-03-23 12:58:53 +00:00
Nich Overend
c64f16851f Add editable TTS provider settings to voice mode 2026-03-23 12:28:36 +00:00
Nich Overend
7f31c12d4f Implement MiniMax TTS for voice mode 2026-03-23 10:54:37 +00:00
Nich Overend
b556c647ec Group voice runtime services under Services/Voice 2026-03-23 10:18:58 +00:00
Nich Overend
fdbf48e040 Fix voice transport connection task reuse 2026-03-23 02:37:41 +00:00
Nich Overend
2c8a46d6d9 Harden tray chat voice message handling 2026-03-23 02:37:21 +00:00
Nich Overend
0f1028a052 Document required Minimax and ElevenLabs provider support 2026-03-23 02:14:22 +00:00
Nich Overend
13364724ec Address voice mode review findings and harden runtime 2026-03-23 01:40:47 +00:00
Nich Overend
25dd06bd81 Add focused coordinator coverage for tray voice chat 2026-03-23 01:34:36 +00:00
Nich Overend
aed8cb84be Remove stale always-on autosubmit setting 2026-03-23 01:34:36 +00:00
Nich Overend
1340bde768 Fix tray voice startup and chat window submission 2026-03-23 01:34:36 +00:00
Nich Overend
197a89b741 Integrate always-on voice mode with tray chat workflow 2026-03-23 01:34:36 +00:00
Nich Overend
a81d31ea70 Add configurable voice mode settings and setup UI 2026-03-23 01:34:36 +00:00
Nich Overend
f40ffc3450 Fix voice chat transport and reply routing 2026-03-23 01:34:36 +00:00
Nich Overend
be624fe452 Add Windows voice mode foundation and AlwaysOn runtime 2026-03-23 01:34:36 +00:00
72 changed files with 12189 additions and 311 deletions

7
.gitignore vendored
View File

@ -62,6 +62,7 @@ BenchmarkDotNet.Artifacts/
project.lock.json
project.fragment.lock.json
artifacts/
.env
# ASP.NET Scaffolding
ScaffoldingReadMe.txt
@ -344,3 +345,9 @@ MigrationBackup/
# Fody - auto-generated XML schema
FodyWeavers.xsd
Output/
# Repo-local tool caches and workspace metadata
.claude/
.dotnet-cli/
.playwright-cli/
output/playwright/

View File

@ -105,6 +105,7 @@ Modern Windows 11-style system tray companion that connects to your local OpenCl
- 🚀 **Auto-start** - Launch with Windows
- ⚙️ **Settings** - Full configuration dialog
- 🎯 **First-run experience** - Welcome dialog guides new users
- <img src="src/OpenClaw.Tray.WinUI/Assets/voice-mode-feature.png" alt="Voice Mode" width="20" height="20" /> **Voice Mode (new)** - Talk to your Claw via your Windows node
#### Quick Send scope requirement
@ -122,13 +123,14 @@ If Quick Send fails with `pairing required` / `NOT_PAIRED`, that is a **device a
### Menu Sections
- **Status** - Gateway connection status with click-to-view details
- **Voice** - Access to Voice controls
- **Sessions** - Active agent sessions with preview and per-session controls
- **Usage** - Provider/cost summary with quick jump to activity details
- **Channels** - Telegram/WhatsApp status with toggle control
- **Nodes** - Online/offline node inventory and copyable summary
- **Recent Activity** - Timestamped event stream for sessions, usage, nodes, and notifications
- **Actions** - Dashboard, Web Chat, Quick Send, Activity Stream, History
- **Settings** - Configuration, auto-start, logs
- **Settings** - Configuration, auto-start, logs, voice
### Mac Parity Status
@ -148,6 +150,7 @@ Comparing against [openclaw-menubar](https://github.com/magimetal/openclaw-menub
| Refresh | ✅ | ✅ | Auto-refresh on menu open |
| Launch at Login | ✅ | ✅ | |
| Notifications toggle | ✅ | ✅ | |
| Voice Mode | ✅ | 🟡 | Talk Mode implemented (half-duplex), WakeWord, Interrupt, etc. in progress |
### Windows-Only Features
@ -250,7 +253,7 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t
> 🔒 **Exec Policy**: `system.run` is gated by an approval policy on the Windows node at `%LOCALAPPDATA%\OpenClawTray\exec-policy.json` (schema: `{ "defaultAction": "...", "rules": [...] }`). This is separate from gateway-side `~/.openclaw/exec-approvals.json`.
>
> Rules are matched against the `command` token (`argv[0]`). If your call runs `powershell.exe -File script.ps1`, allow `powershell.exe`/`pwsh.exe` (not just the script path), or you'll get `No matching rule; default policy applied`.
> Rules are matched against the full command line. Known wrapper payloads such as `cmd /c ...`, `powershell -Command ...`, `pwsh -EncodedCommand ...`, and `bash -c ...` are also evaluated before execution. Dangerous environment overrides like `PATH`, `PATHEXT`, `NODE_OPTIONS`, `GIT_SSH_COMMAND`, `LD_*`, and `DYLD_*` are rejected.
>
> ```bash
> openclaw nodes invoke --node <id> --command system.execApprovals.set --params '{"rules":[{"pattern":"powershell.exe","action":"allow"},{"pattern":"pwsh.exe","action":"allow"},{"pattern":"echo *","action":"allow"},{"pattern":"*","action":"deny"}],"defaultAction":"deny"}'
@ -281,6 +284,14 @@ OpenClaw registers the `openclaw://` URL scheme for automation and integration:
Deep links work even when Molty is already running - they're forwarded via IPC.
### Voice Mode
*contributed by NichUK and his colleagues @codex and @copilot*
Currently supports Talk Mode - always-on, hands-free conversation with your Claw! Wakeword and PTT modes coming soon.
- Uses internal Windows STT (cloud providers coming soon)
- Windows/Minimax/Eleven Labs TTS voices
- Give your Claw a voice!
## 📦 OpenClaw.CommandPalette
PowerToys Command Palette extension for quick OpenClaw access.

988
docs/VOICE-MODE.md Normal file
View File

@ -0,0 +1,988 @@
# Voice Mode Architecture
*Author: Nich Overend (NichUK@GitHub) - with @codex and @copilot*
https://github.com/openclaw/openclaw-windows-node
This document defines the voice subsystem for the Windows node only. It introduces the command surface, persisted settings schema, and minimum runtime boundaries needed to add Windows voice support without reshaping the existing node architecture.
## Goals
- Add a node-local voice mode with two activation modes: `VoiceWake` and `TalkMode`
- Utilise minimal touch points to the existing app to reduce the potential for screw-ups
- Use NanoWakeWord for wakeword detection on-device
- Present the user-facing mode names as `Voice Wake` and `Talk Mode`
- Keep STT/TTS provider selection configurable, with Windows implementations as the default built-in baseline
- Implement `MiniMax` TTS and `ElevenLabs` TTS as required non-Windows providers after the Windows baseline
- Make adding new voice providers an update to a JSON catalog, rather than requiring code changes where possible
- Reuse the existing node capability pattern instead of introducing a parallel control path
- Ensure that the voice sub-system is extensible
- Ensure that the voice sub-system is controllable from other applications
## Non-Goals
- True full-duplex or chunk-streaming audio transport between node and gateway
- Substantial changes to the existing project
## Design Position
The Windows node should own device-local audio concerns:
- microphone capture
- wakeword detection
- silence detection / utterance segmentation
- speaker playback
- device enumeration and persisted local settings
OpenClaw remains responsible for conversation/session routing and upstream voice orchestration.
This keeps the Windows node lean for the first implementation and avoids introducing provider-routing settings before they are needed.
## Visible Mode Names
The tray app now uses user-facing names (borrowed from the macOS app) rather than exposing the internal enum names directly:
| Internal Mode | Visible Name | Availability |
|---|---|---|
| `Off` | Off | available |
| `VoiceWake` | Voice Wake | visible but disabled for now |
| `TalkMode` | Talk Mode | available |
The contracts and persisted settings now use `VoiceWake` and `TalkMode` as well.
## Transport Boundary
`TalkMode` follows the current talk-mode style control flow:
- the node captures audio locally
- local speech recognition turns that audio into transcript text on the active STT route
- interim hypotheses are surfaced live, but only final `Medium` or `High` confidence recognizer results are submitted
- if speech activity ends without any usable final transcript surviving, Talk Mode now clears the draft and gives a short local repeat prompt instead of silently doing nothing
- the compact voice repeater window, when open, shows the live transcript draft plus local sent/received turns in a single scrolling surface
- the tray chat window, when open, mirrors the live transcript draft into the compose box only
- the finalized transcript is always sent to OpenClaw via direct `chat.send` on the voice mode target session, which is currently hardcoded in the tray app to `agent:main:main`
- OpenClaw returns the assistant reply as normal chat output
- the node performs local or remote TTS playback of that reply
- assistant replies are queued locally and spoken sequentially, with a short (500 ms currently) pause between queued replies so overlapping responses are not lost
- if a reply arrives after the normal 45-second wait timeout, the tray still accepts and speaks that late reply for a short bounded grace window (currently 120s) so slow upstream responses are not silently lost
- assistant replies are currently accepted from either `agent:main:main` or the `main` alias so the tray can tolerate upstream session-key normalisation differences
To avoid obvious duplicate sends from the Windows recognizer, exact duplicate final transcripts are suppressed within a short 750 ms window.
The current Windows implementation uses a voice-local operator connection inside the tray app while node mode is active. That connection carries assistant chat events for `TalkMode`, while the recognized transcript is always sent through the tray app's direct `chat.send` path.
## Voice APIs
The Windows tray implementation now has two API layers:
- shared node-capability commands in `OpenClaw.Shared`
- in-process tray interfaces used by the windows/forms
### Shared Capability Commands
The node capability command surface is:
- `voice.devices.list`
- `voice.settings.get`
- `voice.settings.set`
- `voice.status.get`
- `voice.start`
- `voice.stop`
- `voice.pause`
- `voice.resume`
- `voice.response.skip`
These commands are defined in [VoiceModeSchema.cs](../src/OpenClaw.Shared/VoiceModeSchema.cs) and handled by [VoiceCapability.cs](../src/OpenClaw.Shared/Capabilities/VoiceCapability.cs).
`voice.settings.get` / `voice.settings.set` are the configuration API.
`voice.start` / `voice.stop` / `voice.pause` / `voice.resume` / `voice.response.skip` are the runtime control API.
### Status Surface
`VoiceStatusInfo` now carries the basic state needed by control surfaces:
- mode
- runtime state
- session key
- input/output device ids
- last wake / last utterance timestamps
- pending reply count
- whether a reply can currently be skipped
- current reply preview
- last error
### In-Process Tray Interfaces
The tray app also exposes in-process interfaces so its own windows do not need to bind directly to the concrete `VoiceService` implementation:
- `IVoiceConfigurationApi`
- get voice settings
- update voice settings
- list devices
- get provider catalog
- get/set provider configuration
- `IVoiceRuntimeControlApi`
- get runtime status
- start / stop
- pause / resume
- skip current reply
- `IVoiceRuntime`
- transcript draft and conversation events for chat integration
This now powers multiple tray-local voice surfaces, including the compact voice repeater window.
### Can the Settings Form Use This API?
Yes. The Settings form can use the configuration API cleanly.
The current tray implementation now uses the voice configuration interface for:
- provider catalog loading
- device enumeration
- applying updated voice settings / provider configuration on save
That means the settings UI is no longer hard-wired only to concrete `VoiceService` internals for its voice-specific behavior.
## Speech Output Implementation
In order to reduce output latency as much as possible, the current Windows implementation has made the following implementation decisions:
- the Windows `SpeechSynthesizer` is created once per `TalkMode` runtime and reused for subsequent replies
- Frankly, no one will probably use it, but everyone has it, so...
- cloud TTS uses a shared static `HttpClient`, so HTTP/TLS connections can be reused across replies
- cloud requests use `ResponseHeadersRead`, which lets the client observe response-header arrival without waiting for full buffering first
- the tray app now logs per-reply synthesis timings for both Windows and cloud TTS paths so latency can be measured directly during testing
The main remaining gap is streaming playback from the first audio chunk. Best practice recommends chunked playback as soon as the first audio arrives, but the current implementation still waits for a complete playable stream before starting output (but not for long...):
- Windows `SpeechSynthesizer` is used through `SynthesizeTextToStreamAsync`, which returns a complete stream for playback
- MiniMax now uses the provider catalog's WebSocket TTS contract, but the current player still waits for a complete playable stream before output starts
- ElevenLabs now uses the provider catalog's `stream-input` WebSocket contract, but the current player still waits for a complete playable stream before output starts
So the current design minimizes avoidable setup and connection latency, but does not yet implement first-chunk playback streaming. This is, however, planned for an early release (I'm working on it next).
## Tray Chat Integration Decision
Ideally, Voice mode and typed chat should remain part of the same user-visible conversation in the web chat UI. However, this proved difficult to achieve, as the gateway treated a message stream from the tray app separately from that of the WebUI, even with the same session key.
The only way of achieving this vaguely reliably seemed to be to locally insert messages into the DOM, but as this was a brittle, hacky solution, it was discarded.
### Chosen Approach
It was therefore decided to create a separate *voice repeater form* to serve as a message window for voice, as well as making the messages available via toasts.
The tray app keeps a tray-local interim transcript buffer for the current utterance, independent of whether any chat window or voice repeater form is open.
## Provider Selection
Voice settings now carry explicit provider ids for both STT and TTS:
- `Voice.SpeechToTextProviderId`
- `Voice.TextToSpeechProviderId`
The built-in default for both is `windows`.
Runtime behavior in the current phase:
- `windows` is implemented for both STT and TTS
- the `windows` STT route is a pure `Windows.Media.SpeechRecognition.SpeechRecognizer` path with no `AudioGraph` dependency
- `windows` STT is currently treated as `half-duplex, non-streamed`
- `http/ws` is now catalogued as a visible "coming soon" STT slot for generic streaming HTTP/WebSocket adapters
- built-in catalog entries exist for both `minimax` and `elevenlabs` TTS
- `minimax` defaults to `speech-2.8-turbo` and `English_MatureBoss` at present
- `minimax` now uses a catalog-driven WebSocket contract for synchronous TTS
- `elevenlabs` defaults to `eleven_multilingual_v2` and voice id `6aDn1KB0hjpdcocrUkmq (Tiffany)` for now
- only currently usable providers are selectable in Settings
- `sherpa-onnx` is visible but greyed out as a coming-soon local embedded route
- unsupported providers fall back to Windows at runtime with a status warning
### Settings Surface Notes
The Settings panel now shows short inline descriptions for:
- the selected voice mode
- the selected speech-to-text provider
- the selected text-to-speech provider
Those provider descriptions are drawn directly from the provider catalog.
When `Windows Speech Recognition` is selected for STT, the Settings panel now forces both audio device pickers back to the system defaults and greys them out. That matches the current Windows route limitation and avoids advertising per-device microphone routing that does not exist on this route yet.
### Provider Catalog
The provider catalog now ships with the tray app as a bundled asset:
- `Assets\\voice-providers.json`
Example:
```json
{
"speechToTextProviders": [
{
"id": "windows",
"name": "Windows Speech Recognition",
"runtime": "windows",
"enabled": true,
"description": "Built-in Windows.Media speech recognition, half-duplex, non-streamed."
},
{
"id": "http-ws",
"name": "http/ws",
"runtime": "streaming",
"enabled": false,
"visibleInSettings": true,
"selectable": false,
"description": "Will support most cloud and local stand-alone models full or half-duplex, streaming."
}
],
"textToSpeechProviders": [
{
"id": "windows",
"name": "Windows Speech Synthesis",
"runtime": "windows",
"enabled": true,
"description": "Built-in Windows text-to-speech playback."
},
{
"id": "minimax",
"name": "MiniMax",
"runtime": "cloud",
"enabled": true,
"description": "Cloud TTS using the MiniMax WebSocket text-to-speech API.",
"settings": [
{ "key": "apiKey", "label": "API key", "secret": true },
{
"key": "model",
"label": "Model",
"defaultValue": "speech-2.8-turbo",
"options": [
"speech-2.5-turbo-preview",
"speech-02-turbo",
"speech-02-hd",
"speech-2.6-turbo",
"speech-2.6-hd",
"speech-2.8-turbo",
"speech-2.8-hd"
]
},
{ "key": "voiceId", "label": "Voice ID", "defaultValue": "English_MatureBoss" },
{
"key": "voiceSettingsJson",
"label": "Voice settings JSON",
"defaultValue": "\"voice_setting\": { \"voice_id\": {{voiceId}}, \"speed\": 1, \"vol\": 1, \"pitch\": 0 }",
"placeholder": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }"
}
],
"textToSpeechWebSocket": {
"endpointTemplate": "wss://api.minimax.io/ws/v1/t2a_v2",
"authenticationHeaderName": "Authorization",
"authenticationScheme": "Bearer",
"apiKeySettingKey": "apiKey",
"connectSuccessEventName": "connected_success",
"startMessageTemplate": "{ \"event\": \"task_start\", \"model\": {{model}}, \"language_boost\": \"English\", {{voiceSettingsJson}}, \"audio_setting\": { \"sample_rate\": 32000, \"bitrate\": 128000, \"format\": \"mp3\", \"channel\": 1 } }",
"startSuccessEventName": "task_started",
"continueMessageTemplate": "{ \"event\": \"task_continue\", \"text\": {{text}} }",
"finishMessageTemplate": "{ \"event\": \"task_finish\" }",
"responseAudioMode": "hexJsonString",
"responseAudioJsonPath": "data.audio",
"responseStatusCodeJsonPath": "base_resp.status_code",
"responseStatusMessageJsonPath": "base_resp.status_msg",
"finalFlagJsonPath": "is_final",
"taskFailedEventName": "task_failed",
"successStatusValue": "0",
"outputContentType": "audio/mpeg"
}
},
{
"id": "elevenlabs",
"name": "ElevenLabs",
"runtime": "cloud",
"enabled": true,
"description": "Cloud TTS using the ElevenLabs WebSocket stream-input API.",
"settings": [
{ "key": "apiKey", "label": "API key", "secret": true },
{
"key": "model",
"label": "Model",
"defaultValue": "eleven_multilingual_v2",
"options": [
"eleven_flash_v2_5",
"eleven_turbo_v2_5",
"eleven_multilingual_v2",
"eleven_monolingual_v1"
]
},
{ "key": "voiceId", "label": "Voice ID", "defaultValue": "6aDn1KB0hjpdcocrUkmq", "placeholder": "Enter an ElevenLabs voice ID" },
{
"key": "voiceSettingsJson",
"label": "Voice settings JSON",
"defaultValue": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }",
"placeholder": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }"
}
],
"textToSpeechWebSocket": {
"endpointTemplate": "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true",
"authenticationHeaderName": "xi-api-key",
"authenticationScheme": "",
"apiKeySettingKey": "apiKey",
"connectSuccessEventName": "",
"startMessageTemplate": "{ \"text\": \" \", {{voiceSettingsJson}}, \"xi_api_key\": {{apiKey}} }",
"startSuccessEventName": "",
"continueMessageTemplate": "{ \"text\": {{textWithTrailingSpace}}, \"try_trigger_generation\": true }",
"finishMessageTemplate": "{ \"text\": \"\" }",
"responseAudioMode": "base64JsonString",
"responseAudioJsonPath": "audio",
"finalFlagJsonPath": "isFinal",
"taskFailedEventName": "error",
"outputContentType": "audio/mpeg"
}
}
]
}
```
For cloud-backed TTS providers, the catalog carries either an HTTP or WebSocket request/response contract. That allows a new provider to be added by shipping an updated catalog file with the app, as long as it follows the same general templated transport approach.
This file defines provider metadata and transport contracts. It does not carry API keys; these are stored with the standard config.
### Local Provider Configuration
That means the current design is:
- local tray settings choose the preferred STT/TTS provider ids
- provider API keys and editable values are stored in `%APPDATA%\OpenClawTray\settings.json` under `VoiceProviderConfiguration`
- OpenClaw remains the conversation endpoint for `chat.send`
- the shipped provider catalog remains metadata-only and must not contain secrets
This is an intentional short-term design choice so the Windows tray app can use cloud TTS providers without inventing a second catalog file for secrets. It can be revisited later if provider ownership is split differently.
Current configuration values are keyed by provider id. The built-in providers use:
- `apiKey`
- `model`
- `voiceId`
- `voiceSettingsJson`
When the selected TTS provider in Settings is not `windows`, the tray app shows provider-specific fields in the configuration form so the user can enter or edit:
- API key
- model
- voice id
- voice settings JSON
If a provider setting definition includes an `options` list, the settings UI renders that setting as a drop-down instead of a free-text field. That is how built-in cloud providers expose a provider-level choice plus a separate model choice without recompilation.
If a provider setting definition is marked as JSON, the value is inserted into the provider request template as a raw JSON fragment rather than a quoted string. That allows the provider catalog to define whether the user is entering:
- a bare object
- or a full keyed fragment such as `"voice_setting": { ... }`
without hard-coding provider-specific wrapper keys into the runtime.
The current cloud TTS transports are:
- `MiniMax`: catalog-driven WebSocket synthesis
- `ElevenLabs`: catalog-driven WebSocket synthesis (`stream-input`)
For `VoiceWake`, trigger words are gateway-owned global state. The Windows node should eventually consume the same shared trigger list and keep only a local enabled/disabled toggle plus device/runtime settings.
In-flight voice controls are supported when the chosen provider supports them and they are supplied in that provider's own format. An abstraction/translation layer is being considered to accompany support for OpenClaw voice directives in reply records.
Pronunciation dictionaries are currently supported only directly on the voice provider; however, a centralised dictionary is possible, and a proposal is being considered.
## Command Surface
The voice subsystem is introduced as a new node capability category: `voice`.
### Commands
| Command | Purpose | Request Payload | Response Payload |
|---|---|---|---|
| `voice.devices.list` | Enumerate input/output audio devices | none | `VoiceAudioDeviceInfo[]` |
| `voice.settings.get` | Return the effective voice configuration | none | `VoiceSettings` |
| `voice.settings.set` | Update the voice configuration | `VoiceSettingsUpdateArgs` | `VoiceSettings` |
| `voice.status.get` | Return runtime voice status | none | `VoiceStatusInfo` |
| `voice.start` | Start the voice runtime with the supplied or persisted mode | `VoiceStartArgs` | `VoiceStatusInfo` |
| `voice.stop` | Stop the voice runtime | `VoiceStopArgs` | `VoiceStatusInfo` |
| `voice.pause` | Pause the active voice runtime | `VoicePauseArgs` | `VoiceStatusInfo` |
| `voice.resume` | Resume a paused voice runtime | `VoiceResumeArgs` | `VoiceStatusInfo` |
| `voice.response.skip` | Skip the currently spoken reply and advance the queue if another reply is pending | `VoiceSkipArgs` | `VoiceStatusInfo` |
### Payload Types
- `VoiceSettings`
- `VoiceWakeSettings`
- `TalkModeSettings`
- `VoiceAudioDeviceInfo`
- `VoiceStatusInfo`
- `VoiceStartArgs`
- `VoiceStopArgs`
- `VoicePauseArgs`
- `VoiceResumeArgs`
- `VoiceSkipArgs`
- `VoiceSettingsUpdateArgs`
These contracts are defined in [VoiceModeSchema.cs](../src/OpenClaw.Shared/VoiceModeSchema.cs).
## Settings Schema
Voice settings are persisted as `SettingsData.Voice` in [SettingsData.cs](../src/OpenClaw.Shared/SettingsData.cs).
Provider configuration is persisted as `SettingsData.VoiceProviderConfiguration` in the same local settings file.
The compact repeater window state is persisted as `SettingsData.VoiceRepeaterWindow` in the same settings file.
The editable voice configuration now lives in the main Settings window.
The tray `Voice Mode` window is a read-only runtime status/detail surface with a shortcut back into Settings.
### Voice Repeater Window Settings
The compact repeater persists its own local UI state in `SettingsData.VoiceRepeaterWindow`:
| Setting | Type | Default | Meaning |
|---|---|---|---|
| `VoiceRepeaterWindow.AutoScroll` | bool | `true` | Automatically scroll the transcript surface to the latest draft/reply |
| `VoiceRepeaterWindow.FloatingEnabled` | bool | `true` | Keep the repeater floating above other windows |
| `VoiceRepeaterWindow.TextSize` | double | `13` | Repeater transcript font size |
| `VoiceRepeaterWindow.HasSavedPlacement` | bool | `false` | Whether a user placement has been persisted yet |
| `VoiceRepeaterWindow.Width` | int? | `null` | Saved repeater width |
| `VoiceRepeaterWindow.Height` | int? | `null` | Saved repeater height |
| `VoiceRepeaterWindow.X` | int? | `null` | Saved repeater screen X coordinate |
| `VoiceRepeaterWindow.Y` | int? | `null` | Saved repeater screen Y coordinate |
### Effective Schema
```json
{
"Voice": {
"Mode": "VoiceWake",
"Enabled": true,
"ShowRepeaterAtStartup": true,
"SpeechToTextProviderId": "windows",
"TextToSpeechProviderId": "windows",
"InputDeviceId": "default-mic",
"OutputDeviceId": "default-speaker",
"SampleRateHz": 16000,
"CaptureChunkMs": 80,
"BargeInEnabled": true,
"VoiceWake": {
"Engine": "NanoWakeWord",
"ModelId": "hey_openclaw",
"TriggerThreshold": 0.65,
"TriggerCooldownMs": 2000,
"PreRollMs": 1200,
"EndSilenceMs": 900
},
"TalkMode": {
"MinSpeechMs": 250,
"EndSilenceMs": 900,
"MaxUtteranceMs": 15000
}
},
"VoiceProviderConfiguration": {
"Providers": [
{
"ProviderId": "minimax",
"Values": {
"apiKey": "<local secret>",
"model": "speech-2.8-turbo",
"voiceId": "English_MatureBoss",
"voiceSettingsJson": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }"
}
},
{
"ProviderId": "elevenlabs",
"Values": {
"apiKey": "<local secret>",
"model": "eleven_multilingual_v2",
"voiceId": "voice-id",
"voiceSettingsJson": "\"voice_settings\": { \"stability\": 0.5, \"similarity_boost\": 0.8 }"
}
}
]
}
}
```
### Field Rationale
| Field | Purpose |
|---|---|
| `Mode` | Top-level activation mode: `Off`, `VoiceWake`, `TalkMode` |
| `Enabled` | Global feature kill-switch independent of mode |
| `ShowRepeaterAtStartup` | Opens the compact Voice Mode repeater automatically when the app starts with voice mode active |
| `SpeechToTextProviderId` | Selected STT provider id from the local provider catalog |
| `TextToSpeechProviderId` | Selected TTS provider id from the local provider catalog |
| `InputDeviceId` / `OutputDeviceId` | Preferred audio device binding, with selected-speaker support implemented first |
| `SampleRateHz` | Shared capture sample rate, fixed to a speech-friendly default |
| `CaptureChunkMs` | Frame size for capture, VAD, and wakeword processing |
| `BargeInEnabled` | Allows microphone capture while audio playback is active |
| `VoiceWake.*` | NanoWakeWord and post-trigger utterance capture tuning |
| `TalkMode.*` | Continuous-listening segmentation tuning |
### Complete Settings Definition
| Setting | Type | Default | Applies To | Meaning |
|---|---|---|---|---|
| `Voice.Mode` | enum | `Off` | all | Activation mode: `Off`, `VoiceWake`, `TalkMode` |
| `Voice.Enabled` | bool | `false` | all | Master enable/disable flag for voice mode |
| `Voice.ShowRepeaterAtStartup` | bool | `true` | all | If `true`, the compact Voice Mode repeater opens automatically when the app starts with voice mode active |
| `Voice.SpeechToTextProviderId` | string | `windows` | all | Preferred speech-to-text provider id |
| `Voice.TextToSpeechProviderId` | string | `windows` | all | Preferred text-to-speech provider id |
| `Voice.InputDeviceId` | string? | `null` | all | Preferred microphone device id; `null` means system default |
| `Voice.OutputDeviceId` | string? | `null` | all | Preferred speaker device id; `null` means system default |
| `Voice.SampleRateHz` | int | `16000` | all | Internal capture rate used for wakeword, VAD, and utterance assembly |
| `Voice.CaptureChunkMs` | int | `80` | all | Audio frame duration used by the capture loop |
| `Voice.BargeInEnabled` | bool | `true` | all | If `true`, microphone capture may continue while response audio is playing |
| `Voice.VoiceWake.Engine` | string | `NanoWakeWord` | voice wake | Voice Wake engine identifier |
| `Voice.VoiceWake.ModelId` | string | `hey_openclaw` | voice wake | Voice Wake model/profile identifier |
| `Voice.VoiceWake.TriggerThreshold` | float | `0.65` | voice wake | Minimum score required to trigger Voice Wake activation |
| `Voice.VoiceWake.TriggerCooldownMs` | int | `2000` | voice wake | Minimum delay before another Voice Wake trigger is accepted |
| `Voice.VoiceWake.PreRollMs` | int | `1200` | voice wake | Buffered audio retained before the trigger point |
| `Voice.VoiceWake.EndSilenceMs` | int | `900` | voice wake | Silence timeout used to finalize the post-trigger utterance |
| `Voice.TalkMode.MinSpeechMs` | int | `250` | talk mode | Minimum detected speech duration before an utterance is treated as real input |
| `Voice.TalkMode.EndSilenceMs` | int | `900` | talk mode | Silence timeout used to finalize an utterance |
| `Voice.TalkMode.MaxUtteranceMs` | int | `15000` | talk mode | Hard cap on utterance length before forced submission/finalization |
| `VoiceProviderConfiguration.Providers[].ProviderId` | string | none | cloud providers | Provider id matching an `Assets\voice-providers.json` entry |
| `VoiceProviderConfiguration.Providers[].Values["apiKey"]` | string? | `null` | cloud providers | API key sent using the provider contract's configured auth header |
| `VoiceProviderConfiguration.Providers[].Values["model"]` | string? | provider default | cloud providers | Model identifier inserted into the configured request template |
| `VoiceProviderConfiguration.Providers[].Values["voiceId"]` | string? | provider default | cloud providers | Voice id inserted into the configured request template or URL |
| `VoiceProviderConfiguration.Providers[].Values["voiceSettingsJson"]` | string? | provider default | cloud providers | Raw JSON fragment inserted into the configured request template; may be a keyed fragment like `"voice_setting": { ... }` |
At runtime today:
- `Voice.OutputDeviceId` is applied to Talk Mode playback through `MediaPlayer.AudioDevice`
- `VoiceCaptureService` now runs an `AudioGraph` capture pipeline in parallel with Talk Mode and binds it to the selected or default microphone device
- `Voice.InputDeviceId` is now used by that `AudioGraph` capture path, but transcript generation still uses the Windows default speech input path until the STT adapter migration is complete
- Talk Mode only advertises `ListeningContinuously` after the capture graph has produced live frames and the recognizer warm-up window has elapsed, so the status acts as a real “you can start talking now” signal instead of a timer-only guess
- recognizer recovery is now speech-triggered rather than silence-triggered: the Windows recognizer is only recycled when sustained capture speech is present but no recognition activity follows
- when a recognizer session ends after real hypothesis activity but before a final result arrives, Talk Mode now promotes the last recent hypothesis and submits it instead of dropping the utterance
- the speech-mismatch recovery watchdog is single-owner and only armed from capture speech, so a new recognition session does not spawn overlapping recovery loops
- when the system default capture device changes and Talk Mode is using the default mic, the recognizer is rebuilt so device switches such as AirPods are picked up without a full app restart
- explicit non-default microphone transcript generation is still pending the planned STT adapter migration
## Current Runtime Architecture
The current Windows implementation is still centred on `VoiceService`, with a few supporting seams around it:
- `VoiceCapability`
exposes shared `voice.*` commands to the node/gateway surface
- `VoiceCaptureService`
owns the new `AudioGraph` capture backbone, selected/default microphone binding, and live signal detection
- `VoiceService`
owns Talk Mode runtime state, recognizer/TTS integration, reply queuing, timeouts, gateway reply handling, and the transition layer between `AudioGraph` capture and the current recognizer-owned STT path
- `VoiceChatCoordinator`
mirrors interim transcript drafts and conversation turns into attached tray windows without making any window part of the transport path
- `OpenClawGatewayClient`
carries direct `chat.send`, final chat events, and the `sessions.preview` fallback path for bare final markers
- `WebChatWindow`
mirrors live transcript drafts into the WebChat compose box
- `VoiceRepeaterWindow`
is the compact local transcript/reply/control surface for Talk Mode
### Current End-to-End Talk Mode
```mermaid
flowchart LR
A["User speech"] --> B["VoiceCaptureService<br/>AudioGraph on selected/default mic"]
A --> C["Windows SpeechRecognizer<br/>continuous dictation on current default mic"]
B --> D["FrameCaptured / SignalDetected"]
D --> E["VoiceService<br/>capture-backed health + device state"]
C --> F["HypothesisGenerated<br/>interim text"]
F --> G["VoiceService<br/>draft event"]
G --> H["VoiceChatCoordinator"]
H --> I["WebChatWindow<br/>compose-box mirror only"]
H --> I2["VoiceRepeaterWindow<br/>compact local draft surface"]
C --> J["ResultGenerated<br/>final Medium/High text"]
J --> K["VoiceService<br/>duplicate guard + late hypothesis promotion"]
K --> L["Stop recognition session"]
L --> M["OpenClawGatewayClient.SendChatMessageAsync<br/>direct chat.send(agent:main:main, transcript)"]
M --> N["OpenClaw / session pipeline"]
K --> H2["VoiceChatCoordinator<br/>outgoing turn event"]
H2 --> I2
N --> O["Chat final event"]
O --> P{"assistant text present?"}
P -- "yes" --> Q["assistant text"]
P -- "no" --> R["sessions.preview fallback<br/>with stale-preview retry guard"]
R --> Q
Q --> H3["VoiceChatCoordinator<br/>incoming turn event"]
H3 --> I2
Q --> S["VoiceService reply queue"]
S --> T{"TTS provider"}
T -- "windows" --> U["SpeechSynthesizer"]
T -- "cloud" --> V["VoiceCloudTextToSpeechClient<br/>MiniMax websocket or other provider"]
U --> W["Complete playable stream"]
V --> W
W --> X["MediaPlayer<br/>selected OutputDeviceId if set"]
X --> Y["Speaker / headset output"]
Y --> Z["Resume recognition when queue drains"]
```
### Current Processing Stages
| Stage | Component | Input | Output |
|---|---|---|---|
| 1 | `VoiceCaptureService` | selected/default microphone device | continuous frame and signal events from `AudioGraph` |
| 2 | `SpeechRecognizer` | Windows default speech-input path | interim/final transcript text |
| 3 | `VoiceService` | capture signal + final transcript text | health/restart decisions, de-duplicated transcript, runtime state changes |
| 4 | `VoiceChatCoordinator` | draft and conversation-turn events | mirrored draft for WebChat plus compact local transcript/reply updates |
| 5 | `OpenClawGatewayClient` | transcript text + session key | `chat.send` request + assistant reply events |
| 6 | `OpenClawGatewayClient` preview fallback | bare final chat marker | assistant preview text, guarded against stale replay |
| 7 | `VoiceService` reply queue | assistant reply text | ordered reply playback work |
| 8 | `VoiceCloudTextToSpeechClient` / `SpeechSynthesizer` | assistant reply text | complete playable audio stream |
| 9 | `MediaPlayer` | complete playable audio stream | rendered audio on default or selected speaker |
## Planned AudioGraph Input Architecture
The next input-phase refactor will move microphone ownership away from `SpeechRecognizer` and into an explicit capture pipeline built around `AudioGraph`.
The purpose of that change is to unlock:
- true selected non-default microphone support
- streaming rather than utterance-owned capture
- a proper ring buffer and VAD pipeline
- future non-Windows and streaming STT providers
- future barge-in / full-duplex work
### Target Input Stack
```mermaid
flowchart TD
A["Selected microphone device id<br/>or system default mic"] --> B["VoiceCaptureService<br/>AudioGraph input node"]
B --> C["PCM frame stream<br/>fixed chunk duration"]
C --> D["Ring buffer<br/>bounded pre-roll"]
C --> E["VoiceActivityDetector"]
C --> F["VoiceWake engine<br/>later"]
C --> G["SpeechToText adapter"]
E --> H["UtteranceAssembler<br/>for non-streaming STT adapters"]
D --> H
H --> G
G --> I["Transcript events<br/>interim + final"]
I --> J["VoiceService / runtime controller"]
J --> K["OpenClawGatewayClient<br/>chat.send + reply events"]
```
### Proposed Seams
The target split should look like this:
- `VoiceCaptureService`
- owns `AudioGraph`
- binds to an explicit input device id when one is selected
- emits continuous PCM frames
- `IVoiceActivityDetector`
- emits speech / silence transitions from frame data
- `IUtteranceAssembler`
- builds bounded utterances from frames for non-streaming STT backends
- `ISpeechToTextAdapter`
- consumes either live frames or completed utterances
- emits interim and final transcript events
- `VoiceService`
- remains the runtime orchestrator rather than the owner of low-level capture
## Selected-Device Roadmap
The current selected-device position is now:
- selected non-default speaker: implemented
- selected/default microphone binding for `AudioGraph` capture: implemented
- selected non-default microphone for actual transcript generation: not implemented yet (requires `AudioGraph` support)
## Control Flow
```mermaid
sequenceDiagram
participant Gateway as Gateway / Operator
participant VoiceCap as VoiceCapability
participant Coord as VoiceService
participant Store as SettingsData.Voice
Gateway->>VoiceCap: voice.settings.get
VoiceCap-->>Gateway: VoiceSettings
Gateway->>VoiceCap: voice.settings.set(settings, persist=true)
VoiceCap->>Store: save VoiceSettings
VoiceCap-->>Gateway: VoiceSettings
Gateway->>VoiceCap: voice.start(mode=TalkMode, sessionKey=...)
VoiceCap->>Coord: Start(VoiceStartArgs)
Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningContinuously)
VoiceCap-->>Gateway: VoiceStatusInfo
Gateway->>VoiceCap: voice.status.get
VoiceCap-->>Gateway: VoiceStatusInfo
Gateway->>VoiceCap: voice.pause(reason=...)
VoiceCap->>Coord: Pause()
Coord-->>VoiceCap: VoiceStatusInfo(state=Paused)
VoiceCap-->>Gateway: VoiceStatusInfo
Gateway->>VoiceCap: voice.resume(reason=...)
VoiceCap->>Coord: Resume()
Coord-->>VoiceCap: VoiceStatusInfo(state=ListeningContinuously)
VoiceCap-->>Gateway: VoiceStatusInfo
Gateway->>VoiceCap: voice.response.skip(reason=...)
VoiceCap->>Coord: SkipCurrentReply()
Coord-->>VoiceCap: VoiceStatusInfo
VoiceCap-->>Gateway: VoiceStatusInfo
Gateway->>VoiceCap: voice.stop(reason=...)
VoiceCap->>Coord: Stop()
Coord-->>VoiceCap: VoiceStatusInfo(state=Stopped)
VoiceCap-->>Gateway: VoiceStatusInfo
```
## Integration Boundaries
### Existing Components Reused
- `NodeService` remains the capability registration and lifecycle owner
- `SettingsData` remains the persisted JSON settings model
- `WindowsNodeClient` remains the gateway/node transport
- existing node capability registration remains the integration pattern
- current request/response transport remains the v1 control plane
### Supporting Components In Current Use
- `VoiceCapability` in `OpenClaw.Shared.Capabilities`
- `VoiceCaptureService` in `OpenClaw.Tray.WinUI.Services`
- `VoiceChatCoordinator` in `OpenClaw.Tray.WinUI.Services`
- `VoiceRepeaterWindow` in `OpenClaw.Tray.WinUI.Windows`
- `WebChatWindow` in `OpenClaw.Tray.WinUI.Windows`
### Components Still Expected Later
- `VoiceWakeService` in `OpenClaw.Tray.WinUI.Services`
- a dedicated `VoicePlaybackService` seam when playback is split out of `VoiceService`
## Parity with macOS Node
Status values used below:
- `Supported`
- `Partial`
- `NotSupported (planned)`
- `Exceeded*`
| macOS feature | Current Windows state | Notes |
|---|---|---|
| Talk Mode continuous loop (`listen -> chat.send(main) -> wait -> speak`) | `Supported` | Windows Talk Mode uses direct `chat.send` on the tray voice target session (`agent:main:main` today, while still accepting the `main` alias on replies) and loops back to listening after reply playback. |
| Talk Mode sends after a short silence window | `Supported` | The current runtime finalizes on recognition pause and uses configurable Talk Mode silence settings. |
| Talk Mode visible phase transitions (`Listening -> Thinking -> Speaking`) | `Partial` | Runtime states, tray icon changes, and the compact voice repeater window exist, but there is no always-visible overlay yet. |
| Talk Mode always-on overlay with click-to-stop / click-X controls | `NotSupported (planned)` | Windows currently has a tray icon, a manually-opened compact repeater window, and WebChat draft mirroring, but no always-on overlay surface. |
| Talk Mode writes replies into WebChat the same way typed chat does | `Partial` | Replies appear in WebChat through normal session updates, but Talk Mode uses direct send rather than a same-as-typing transport path. |
| Talk Mode interrupt-on-speech / barge-in | `NotSupported (planned)` | Windows is still half-duplex during reply playback. |
| Talk Mode voice directives in replies | `NotSupported (planned)` | Windows does not yet parse or apply the JSON voice directive line described in the Talk Mode docs. |
| Talk Mode true streaming TTS playback | `NotSupported (planned)` | MiniMax uses WebSocket transport, but playback still waits for a complete playable stream. |
| Talk Mode cloud TTS provider flexibility | `Exceeded` | Windows already supports Windows built-in TTS plus catalog-driven cloud providers rather than being limited to a single provider path. This exceeds the documented macOS baseline on provider flexibility, but not yet on true streaming playback latency because incremental playback is still pending. |
| Voice Wake wake-word runtime | `NotSupported (planned)` | `VoiceWake` remains a documented target mode, but there is no active wake-word runtime yet. |
| Voice Wake push-to-talk capture | `NotSupported (planned)` | There is no Windows push-to-talk path yet. |
| Voice Wake overlay with committed / volatile transcript states | `NotSupported (planned)` | No Voice Wake overlay exists on Windows yet. |
| Voice Wake restart invariants when UI is dismissed | `NotSupported (planned)` | The macOS overlay-dismiss resilience behavior has no Windows equivalent yet because the overlay/runtime does not exist. |
| Voice Wake forwarding to the active gateway / agent | `NotSupported (planned)` | Forwarding semantics are only implemented for Talk Mode today. |
| Voice Wake machine-hint transcript prefixing | `NotSupported (planned)` | Windows does not currently prepend a machine hint on forwarded wake transcripts. |
| Voice Wake mic picker, live level meter, trigger-word table, and tester | `NotSupported (planned)` | Windows has general voice settings and device lists, but not the Voice Wake-specific settings surface from macOS. |
| Voice mic device selection | `Partial` | When `Windows Speech Recognition` is selected, Settings now locks both audio device pickers to the system defaults. Explicit per-device transcription routing remains a future AudioGraph/streaming-route feature. |
| Voice Wake send / trigger chimes | `NotSupported (planned)` | Windows currently has no configurable trigger/send sounds. |
## Feature List - Backlog - Not in Order, except maybe the first two ;)
### Story: Streaming STT Capture Pipeline
Implement `AudioGraph` to create an extensible streaming speech input pipeline, rather than the current self-contained `Windows.Media.SpeechRecognition.SpeechRecognizer` pipeline.
This will allow us to mix/match components, and reduce latency.
- Will support cloud or local HTTP/WebSocket providers (including Microsoft Foundry Local, OpenAI Whisper, etc.)
- Will support an embedded sherpa-onnx engine for user-defined/downloaded models
- This will enable selection of the best-in-class model for the required use case/language
### Story: True streaming TTS playback
Start speaking assistant replies from the first usable audio chunk instead of waiting for a complete playable stream.
Notes:
- the current implementation uses WebSocket transport for MiniMax, but still buffers the entire audio response before playback begins
- `firstChunk=...ms` in the log is currently provider-chunk arrival time, not actual speech-start time
- implement a playback path that can consume incremental audio data as it arrives from the provider
- the provider catalog contract should remain transport-driven and provider-agnostic, so streaming behavior should be expressed through the existing TTS contract model rather than hard-coded for MiniMax
- preserve the existing queued reply behavior, skip support, and late-reply handling while switching playback to progressive output
- add timing logs that separate `firstChunk`, `playbackStart`, and `playbackEnd` so latency improvements are measurable
### Story: True selected-microphone transcription support
Make actual STT transcription follow the selected microphone device, not just the default device.
- depends on `AudioGraph` support
### Story: Talk Mode overlay and visible phase parity
Add a Talk Mode overlay that makes `Listening`, `Thinking`, and `Speaking` visible to the user in the same way the macOS experience does. This will probably be done via the current Voice Mode form. The macOS version has not been reviewed first-hand yet, so its exact approach still needs to be confirmed.
### Story: Talk Mode overlay controls
Add explicit Talk Mode overlay controls for stopping speech playback and exiting Talk Mode.
Notes:
- macOS exposes click-to-stop and click-to-exit controls directly on the overlay
- Windows currently requires tray or settings interaction instead
- this should plug into the shared runtime control API rather than directly manipulating `VoiceService`
### Story: Voice directives in replies
Support the Talk Mode reply-prefix JSON directive described in the OpenClaw docs.
Notes:
- parse only the first non-empty reply line
- strip the directive before playback
- support per-reply `once: true` and persistent default updates
- supported keys should at least include voice, model, and the documented voice-shaping parameters
- provider-specific validation should happen through the provider contract layer where possible
### Story: Foundry Local STT provider
Implement the AudioGraph-fed streaming STT adapter for Foundry Local.
Notes:
- provider metadata now lives in the provider catalog, but it should stay disabled in settings until the runtime adapter exists
- this route should use the shared streaming STT path rather than the Windows.Media recognizer path
- endpoint and model selection should come from the provider catalog settings contract
### Story: OpenAI Whisper STT provider
Implement the AudioGraph-fed streaming STT adapter for OpenAI Whisper transcription.
Notes:
- this should be catalog-driven and disabled in settings until the adapter is production-ready
- the initial implementation only needs the basic transcription path, not translation or diarization
- API key and model configuration should come from the provider catalog
### Story: ElevenLabs Speech to Text provider
Implement the AudioGraph-fed streaming STT adapter for ElevenLabs speech-to-text.
Notes:
- keep it catalog-driven and disabled in settings until the runtime path is implemented
- match the same route abstraction used by the other non-Windows STT providers
- any provider-specific partial/final transcript semantics should be normalized in the adapter layer
### Story: Azure AI Speech STT provider
Implement the AudioGraph-fed streaming STT adapter for Azure AI Speech.
Notes:
- use the official Azure AI Speech naming in settings and docs rather than an internal "Foundry Azure STT" label
- keep the provider catalog entry disabled until the adapter is functional end to end
- endpoint and credential handling should come from the provider settings contract
### Story: sherpa-onnx embedded STT provider
Implement the local embedded sherpa-onnx STT route for user-supplied model bundles.
Notes:
- keep this visible but greyed out in settings until the embedded runtime is implemented
- the user should be able to choose their own downloaded model bundle and language-appropriate package
- model lifecycle, validation, and error reporting should be handled in the embedded adapter rather than in the Windows.Media route
### Story: Full-duplex / barge-in Talk Mode
Allow the node to keep listening while it is speaking, so the user can interrupt or interleave speech without waiting for reply playback to finish.
Notes:
- the current Windows implementation is half-duplex: recognition is stopped or ignored while a reply is being spoken
- practical requirements are likely to include:
- microphone capture that can remain active during playback
- acoustic echo cancellation / echo suppression
- barge-in detection and playback interruption rules
- a policy for whether interrupt speech cancels the current reply or queues behind it
- additional runtime control/status so the UI can show when barge-in is armed
- this should be treated as a separate engineering phase, not a small extension of the current Talk Mode runtime
### Story: Voice Wake wake-word runtime
Implement the actual Windows Voice Wake runtime.
Notes:
- this should cover wake-word listening, trigger detection, post-trigger capture, silence finalization, hard-stop protection, and debounce between sessions
- the runtime should restart cleanly after send and should remain armed whenever Voice Wake is enabled and permissions are available
- the implementation should be based on the planned `AudioGraph` capture pipeline rather than a second unrelated microphone stack
### Story: Voice Wake push-to-talk
Implement a Windows push-to-talk capture path alongside wake-word activation.
Notes:
- this should support press-to-capture, release-to-finalize semantics
- it should pause the wake runtime while push-to-talk capture is active, then resume it cleanly afterward
- Windows-specific hotkey and permissions behavior should be documented explicitly once chosen
### Story: Voice Wake settings parity
Add the user-facing Voice Wake settings surface that exists on macOS.
Notes:
- include language and mic pickers
- include a live level meter
- include trigger-word editing or table management
- include a local-only tester that does not forward
- preserve the chosen mic if it disconnects, surface a disconnected hint, and fall back to the system default until it returns
### Story: Voice Wake sounds and chimes
Add configurable trigger and send sounds for Voice Wake.
Notes:
- trigger and send events should be independently configurable
- support `No Sound`
- keep the sound implementation distinct from assistant reply playback
### Story: Voice Wake forwarding semantics
Implement the documented Voice Wake forwarding behavior.
Notes:
- forwarded transcripts should go to the active gateway / agent path
- reply delivery and logging behavior should match the rest of the node session model
- the forwarding path should be resilient even when UI surfaces are closed
### Story: Voice Wake machine-hint prefixing
Implement the documented transcript prefixing / machine-hint behavior for forwarded Voice Wake utterances.
Notes:
- the prefixing rule should be explicit and testable
- both wake-word and push-to-talk paths should share the same forwarding helper
### Story: Voice Wake trigger tuning and pause semantics
Implement the documented Voice Wake trigger-gap, silence-window, hard-stop, and debounce semantics.
Notes:
- include the wake-word gap behavior before command capture begins
- support distinct silence windows for trigger-only vs flowing speech cases
- include a hard maximum capture duration
- expose the tuning through voice settings rather than hard-coded constants alone

View File

@ -5,6 +5,7 @@
<Platform Project="x64" />
</Project>
<Project Path="src/OpenClaw.Shared/OpenClaw.Shared.csproj" />
<Project Path="src/OpenClaw.Tray.Shared/OpenClaw.Tray.Shared.csproj" />
<Project Path="src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj" />
</Folder>
<Folder Name="/tests/">

View File

@ -269,13 +269,22 @@ public class SystemCapability : NodeCapabilityBase
request.Args.TryGetProperty("env", out var envEl) &&
envEl.ValueKind == System.Text.Json.JsonValueKind.Object)
{
env = new Dictionary<string, string>();
env = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
foreach (var prop in envEl.EnumerateObject())
{
if (prop.Value.ValueKind == System.Text.Json.JsonValueKind.String)
env[prop.Name] = prop.Value.GetString() ?? "";
}
}
var envResult = ExecEnvSanitizer.Sanitize(env);
if (envResult.Blocked.Length > 0)
{
var blockedList = string.Join(", ", envResult.Blocked.OrderBy(n => n, StringComparer.OrdinalIgnoreCase));
Logger.Warn($"system.run DENIED: blocked environment overrides [{blockedList}]");
return Error($"Unsafe environment variable override blocked: {blockedList}");
}
env = envResult.Allowed;
// Build the full command string for policy evaluation and logging.
// When command arrives as an argv array, we must evaluate the entire
@ -296,6 +305,23 @@ public class SystemCapability : NodeCapabilityBase
Logger.Warn($"system.run DENIED: {fullCommand} ({approval.Reason})");
return Error($"Command denied by exec policy: {approval.Reason}");
}
var parseResult = ExecShellWrapperParser.Expand(fullCommand, shell);
if (!string.IsNullOrWhiteSpace(parseResult.Error))
{
Logger.Warn($"system.run DENIED: {fullCommand} ({parseResult.Error})");
return Error($"Command denied by exec policy: {parseResult.Error}");
}
foreach (var target in parseResult.Targets)
{
var innerApproval = _approvalPolicy.Evaluate(target.Command, target.Shell);
if (!innerApproval.Allowed)
{
Logger.Warn($"system.run DENIED: {target.Command} ({innerApproval.Reason})");
return Error($"Command denied by exec policy: {innerApproval.Reason}");
}
}
}
try

View File

@ -0,0 +1,248 @@
using System;
using System.Collections.Generic;
using System.Text.Json;
using System.Threading.Tasks;
namespace OpenClaw.Shared.Capabilities;
/// <summary>
/// Node capability for the "voice" command category. Every supported command is
/// forwarded to the matching event; a command whose event has no subscriber is
/// answered with an error response instead of being executed.
/// </summary>
public class VoiceCapability : NodeCapabilityBase
{
    // Older command name that is still routed to the skip handler.
    private const string LegacySkipCommand = "voice.skip";

    private static readonly JsonSerializerOptions s_jsonOptions = new()
    {
        PropertyNameCaseInsensitive = true
    };

    public override string Category => "voice";

    public override IReadOnlyList<string> Commands => VoiceCommands.All;

    public event Func<Task<VoiceAudioDeviceInfo[]>>? ListDevicesRequested;
    public event Func<Task<VoiceSettings>>? SettingsRequested;
    public event Func<VoiceSettingsUpdateArgs, Task<VoiceSettings>>? SettingsUpdateRequested;
    public event Func<Task<VoiceStatusInfo>>? StatusRequested;
    public event Func<VoiceStartArgs, Task<VoiceStatusInfo>>? StartRequested;
    public event Func<VoiceStopArgs, Task<VoiceStatusInfo>>? StopRequested;
    public event Func<VoicePauseArgs, Task<VoiceStatusInfo>>? PauseRequested;
    public event Func<VoiceResumeArgs, Task<VoiceStatusInfo>>? ResumeRequested;
    public event Func<VoiceSkipArgs, Task<VoiceStatusInfo>>? SkipRequested;

    public VoiceCapability(IOpenClawLogger logger) : base(logger)
    {
    }

    /// <summary>
    /// Dispatches <paramref name="request"/> to the handler for its command name.
    /// </summary>
    /// <returns>The handler's response, or an error response for an unknown command.</returns>
    public override async Task<NodeInvokeResponse> ExecuteAsync(NodeInvokeRequest request) =>
        request.Command switch
        {
            VoiceCommands.ListDevices => await HandleListDevicesAsync(),
            VoiceCommands.GetSettings => await HandleGetSettingsAsync(),
            VoiceCommands.SetSettings => await HandleSetSettingsAsync(request),
            VoiceCommands.GetStatus => await HandleGetStatusAsync(),
            VoiceCommands.Start => await HandleStartAsync(request),
            VoiceCommands.Stop => await HandleStopAsync(request),
            VoiceCommands.Pause => await HandlePauseAsync(request),
            VoiceCommands.Resume => await HandleResumeAsync(request),
            VoiceCommands.Skip or LegacySkipCommand => await HandleSkipAsync(request),
            _ => Error($"Unknown command: {request.Command}")
        };

    /// <summary>
    /// Deserializes the request arguments into <typeparamref name="T"/>. Missing or
    /// JSON-null arguments are read as an empty JSON object.
    /// </summary>
    private static T? ParseArgs<T>(JsonElement args) where T : class
    {
        var json = args.ValueKind is JsonValueKind.Undefined or JsonValueKind.Null
            ? "{}"
            : args.GetRawText();

        return JsonSerializer.Deserialize<T>(json, s_jsonOptions);
    }

    private async Task<NodeInvokeResponse> HandleListDevicesAsync()
    {
        Logger.Info(VoiceCommands.ListDevices);

        var handler = ListDevicesRequested;
        if (handler is null)
        {
            return Error("Voice device enumeration not available");
        }

        try
        {
            var devices = await handler();
            return Success(devices);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice device enumeration failed", failure);
            return Error($"Device enumeration failed: {failure.Message}");
        }
    }

    private async Task<NodeInvokeResponse> HandleGetSettingsAsync()
    {
        Logger.Info(VoiceCommands.GetSettings);

        var handler = SettingsRequested;
        if (handler is null)
        {
            return Error("Voice settings not available");
        }

        try
        {
            var settings = await handler();
            return Success(settings);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice settings get failed", failure);
            return Error($"Get settings failed: {failure.Message}");
        }
    }

    private async Task<NodeInvokeResponse> HandleSetSettingsAsync(NodeInvokeRequest request)
    {
        Logger.Info(VoiceCommands.SetSettings);

        var handler = SettingsUpdateRequested;
        if (handler is null)
        {
            return Error("Voice settings update not available");
        }

        try
        {
            // The update may be nested under an "update" property or be the arguments object itself.
            VoiceSettingsUpdateArgs? update = null;
            if (request.Args.ValueKind == JsonValueKind.Object &&
                request.Args.TryGetProperty("update", out var nested))
            {
                update = JsonSerializer.Deserialize<VoiceSettingsUpdateArgs>(nested.GetRawText(), s_jsonOptions);
            }

            update ??= ParseArgs<VoiceSettingsUpdateArgs>(request.Args);
            if (update is null)
            {
                return Error("Missing update payload");
            }

            var settings = await handler(update);
            return Success(settings);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice settings update failed", failure);
            return Error($"Set settings failed: {failure.Message}");
        }
    }

    private async Task<NodeInvokeResponse> HandleGetStatusAsync()
    {
        Logger.Info(VoiceCommands.GetStatus);

        var handler = StatusRequested;
        if (handler is null)
        {
            return Error("Voice status not available");
        }

        try
        {
            var status = await handler();
            return Success(status);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice status get failed", failure);
            return Error($"Get status failed: {failure.Message}");
        }
    }

    private async Task<NodeInvokeResponse> HandleStartAsync(NodeInvokeRequest request)
    {
        Logger.Info(VoiceCommands.Start);

        var handler = StartRequested;
        if (handler is null)
        {
            return Error("Voice start not available");
        }

        try
        {
            var args = ParseArgs<VoiceStartArgs>(request.Args) ?? new VoiceStartArgs();
            var status = await handler(args);
            return Success(status);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice start failed", failure);
            return Error($"Start failed: {failure.Message}");
        }
    }

    private async Task<NodeInvokeResponse> HandleStopAsync(NodeInvokeRequest request)
    {
        Logger.Info(VoiceCommands.Stop);

        var handler = StopRequested;
        if (handler is null)
        {
            return Error("Voice stop not available");
        }

        try
        {
            var args = ParseArgs<VoiceStopArgs>(request.Args) ?? new VoiceStopArgs();
            var status = await handler(args);
            return Success(status);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice stop failed", failure);
            return Error($"Stop failed: {failure.Message}");
        }
    }

    private async Task<NodeInvokeResponse> HandlePauseAsync(NodeInvokeRequest request)
    {
        Logger.Info(VoiceCommands.Pause);

        var handler = PauseRequested;
        if (handler is null)
        {
            return Error("Voice pause not available");
        }

        try
        {
            var args = ParseArgs<VoicePauseArgs>(request.Args) ?? new VoicePauseArgs();
            var status = await handler(args);
            return Success(status);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice pause failed", failure);
            return Error($"Pause failed: {failure.Message}");
        }
    }

    private async Task<NodeInvokeResponse> HandleResumeAsync(NodeInvokeRequest request)
    {
        Logger.Info(VoiceCommands.Resume);

        var handler = ResumeRequested;
        if (handler is null)
        {
            return Error("Voice resume not available");
        }

        try
        {
            var args = ParseArgs<VoiceResumeArgs>(request.Args) ?? new VoiceResumeArgs();
            var status = await handler(args);
            return Success(status);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice resume failed", failure);
            return Error($"Resume failed: {failure.Message}");
        }
    }

    private async Task<NodeInvokeResponse> HandleSkipAsync(NodeInvokeRequest request)
    {
        Logger.Info(VoiceCommands.Skip);

        var handler = SkipRequested;
        if (handler is null)
        {
            return Error("Voice skip not available");
        }

        try
        {
            var args = ParseArgs<VoiceSkipArgs>(request.Args) ?? new VoiceSkipArgs();
            var status = await handler(args);
            return Success(status);
        }
        catch (Exception failure)
        {
            Logger.Error("Voice skip failed", failure);
            return Error($"Skip failed: {failure.Message}");
        }
    }
}

View File

@ -0,0 +1,93 @@
using System;
using System.Collections.Generic;
namespace OpenClaw.Shared;
/// <summary>
/// Outcome of <see cref="ExecEnvSanitizer.Sanitize"/>: the overrides that may be applied
/// and the names that were rejected.
/// </summary>
internal sealed class ExecEnvSanitizeResult
{
    /// <summary>Accepted overrides, or <c>null</c> when nothing was accepted.</summary>
    public Dictionary<string, string>? Allowed { get; init; }

    /// <summary>Names of the rejected overrides; empty when nothing was rejected.</summary>
    public string[] Blocked { get; init; } = Array.Empty<string>();
}

/// <summary>
/// Filters caller-supplied environment variable overrides against a list of blocked names.
/// </summary>
internal static class ExecEnvSanitizer
{
    // Matched case-insensitively.
    private static readonly HashSet<string> s_blockedNames = new(StringComparer.OrdinalIgnoreCase)
    {
        "PATH",
        "PATHEXT",
        "ComSpec",
        "PSModulePath",
        "NODE_OPTIONS",
        "NODE_PATH",
        "PYTHONPATH",
        "PYTHONSTARTUP",
        "PYTHONUSERBASE",
        "RUBYOPT",
        "RUBYLIB",
        "PERL5OPT",
        "PERL5LIB",
        "PERLIO",
        "GIT_SSH",
        "GIT_SSH_COMMAND",
        "GIT_EXEC_PATH",
        "GIT_PROXY_COMMAND",
        "GIT_ASKPASS",
        "BASH_ENV",
        "ENV",
        "CDPATH",
        "PROMPT_COMMAND",
        "ZDOTDIR",
        "LD_PRELOAD",
        "LD_LIBRARY_PATH",
        "LD_AUDIT",
        "DYLD_INSERT_LIBRARIES",
        "DYLD_LIBRARY_PATH"
    };

    /// <summary>
    /// Splits <paramref name="env"/> into accepted and rejected overrides.
    /// </summary>
    /// <param name="env">Requested overrides; may be <c>null</c> or empty.</param>
    /// <returns>
    /// For a <c>null</c> or empty input, a result whose <c>Allowed</c> is the input itself.
    /// Otherwise the accepted entries (or <c>null</c> when there are none) and the rejected names.
    /// </returns>
    internal static ExecEnvSanitizeResult Sanitize(Dictionary<string, string>? env)
    {
        if (env is null || env.Count == 0)
        {
            return new ExecEnvSanitizeResult { Allowed = env };
        }

        var accepted = new Dictionary<string, string>(StringComparer.OrdinalIgnoreCase);
        var rejected = new List<string>();

        foreach (var entry in env)
        {
            if (IsBlocked(entry.Key))
            {
                rejected.Add(entry.Key);
            }
            else
            {
                accepted[entry.Key] = entry.Value;
            }
        }

        return new ExecEnvSanitizeResult
        {
            Allowed = accepted.Count == 0 ? null : accepted,
            Blocked = rejected.ToArray()
        };
    }

    /// <summary>
    /// Returns <c>true</c> when <paramref name="name"/> must not be overridden: it is
    /// null/blank, contains '=', a control character or whitespace, starts with
    /// "LD_" or "DYLD_", or is on the blocked-name list.
    /// </summary>
    internal static bool IsBlocked(string? name)
    {
        if (string.IsNullOrWhiteSpace(name))
        {
            return true;
        }

        // NUL, CR and LF are control characters, so a single pass covers every malformed-name case.
        foreach (var ch in name)
        {
            if (ch == '=' || char.IsControl(ch) || char.IsWhiteSpace(ch))
            {
                return true;
            }
        }

        if (name.StartsWith("LD_", StringComparison.OrdinalIgnoreCase) ||
            name.StartsWith("DYLD_", StringComparison.OrdinalIgnoreCase))
        {
            return true;
        }

        return s_blockedNames.Contains(name);
    }
}

View File

@ -0,0 +1,294 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
namespace OpenClaw.Shared;
/// <summary>
/// A command string, together with the shell it would run under, that must be
/// evaluated against the exec approval policy.
/// </summary>
internal sealed class ExecShellEvaluationTarget
{
    public string Command { get; init; } = "";
    public string? Shell { get; init; }
}

/// <summary>
/// Result of <see cref="ExecShellWrapperParser.Expand"/>: either the extra commands
/// to evaluate, or an error that means the command could not be analysed.
/// </summary>
internal sealed class ExecShellParseResult
{
    public List<ExecShellEvaluationTarget> Targets { get; } = new();
    public string? Error { get; init; }
}

/// <summary>
/// Expands a command line into the individual commands that would actually run:
/// the segments of a chained command and the payloads handed to shell wrappers
/// (<c>cmd /c</c>, <c>powershell -Command</c>/<c>-EncodedCommand</c>, <c>pwsh</c>, <c>bash</c>/<c>sh -c</c>).
/// Anything that cannot be analysed is reported through
/// <see cref="ExecShellParseResult.Error"/> rather than silently skipped.
/// </summary>
internal static class ExecShellWrapperParser
{
    // Maximum number of wrapper levels that are unwrapped; deeper nesting is rejected.
    private const int MaxDepth = 4;

    /// <summary>
    /// Collects every nested or chained command contained in <paramref name="command"/>.
    /// </summary>
    /// <param name="command">The full command line; a blank value yields an empty result.</param>
    /// <param name="shell">Shell the command runs under; blank is treated as "powershell".</param>
    /// <returns>
    /// The additional evaluation targets. The unmodified top-level command is not included
    /// when it is a single segment. <c>Error</c> is set when a wrapper payload is empty,
    /// cannot be decoded, or is nested deeper than the supported depth.
    /// </returns>
    internal static ExecShellParseResult Expand(string command, string? shell = null)
    {
        var result = new ExecShellParseResult();
        if (string.IsNullOrWhiteSpace(command))
            return result;

        var pending = new Queue<(string Command, string? Shell, int Depth)>();
        var seen = new HashSet<string>(StringComparer.OrdinalIgnoreCase);
        pending.Enqueue((command, NormalizeShell(shell), 0));

        while (pending.Count > 0)
        {
            var (current, currentShell, depth) = pending.Dequeue();
            if (string.IsNullOrWhiteSpace(current))
                continue;

            var segments = SplitTopLevelCommands(current);
            var hasMultipleSegments = segments.Count > 1;

            foreach (var rawSegment in segments)
            {
                var segment = TrimMatchingQuotes(rawSegment.Trim());
                if (string.IsNullOrWhiteSpace(segment))
                    continue;

                if ((depth > 0 || hasMultipleSegments) && seen.Add($"{currentShell}|{segment}"))
                {
                    result.Targets.Add(new ExecShellEvaluationTarget
                    {
                        Command = segment,
                        Shell = currentShell
                    });
                }

                var wrapped = TryExtractWrappedPayload(segment);
                if (wrapped.Error != null)
                    return new ExecShellParseResult { Error = wrapped.Error };

                if (string.IsNullOrWhiteSpace(wrapped.Payload))
                    continue;

                // Fail closed: a payload that is too deeply nested to inspect must not
                // be allowed to run unevaluated.
                if (depth >= MaxDepth)
                    return new ExecShellParseResult { Error = "Shell wrapper nesting is too deep to evaluate" };

                pending.Enqueue((wrapped.Payload!, wrapped.Shell ?? currentShell, depth + 1));
            }
        }

        return result;
    }

    /// <summary>
    /// Returns the payload passed to a recognised shell wrapper, the shell that will run it,
    /// or an error when the wrapper is recognised but its payload is unusable.
    /// All elements are <c>null</c> when the command is not a recognised wrapper.
    /// </summary>
    private static (string? Payload, string? Shell, string? Error) TryExtractWrappedPayload(string command)
    {
        var tokens = Tokenize(command);
        if (tokens.Count < 2)
            return default;

        var executable = GetExecutableName(tokens[0]);
        if (string.IsNullOrWhiteSpace(executable))
            return default;

        if (IsExecutable(executable, "cmd"))
        {
            for (var i = 1; i < tokens.Count; i++)
            {
                var token = tokens[i];

                // cmd also accepts the command glued to the switch ("/cdel x").
                if (token.StartsWith("/c", StringComparison.OrdinalIgnoreCase) ||
                    token.StartsWith("/k", StringComparison.OrdinalIgnoreCase))
                {
                    var payload = string.Join(" ", tokens.Skip(i + 1).Prepend(token[2..])).Trim();
                    return string.IsNullOrWhiteSpace(payload)
                        ? ("", "cmd", "Shell wrapper payload was empty")
                        : (payload, "cmd", null);
                }
            }
        }

        if (IsExecutable(executable, "powershell"))
            return ParsePowerShellPayload(tokens, "powershell");

        if (IsExecutable(executable, "pwsh"))
            return ParsePowerShellPayload(tokens, "pwsh");

        if (IsExecutable(executable, "bash") || IsExecutable(executable, "sh"))
        {
            for (var i = 1; i < tokens.Count; i++)
            {
                if (IsPosixCommandFlag(tokens[i]))
                {
                    var payload = string.Join(" ", tokens.Skip(i + 1)).Trim();
                    return string.IsNullOrWhiteSpace(payload)
                        ? ("", "sh", "Shell wrapper payload was empty")
                        : (payload, "sh", null);
                }
            }
        }

        return default;
    }

    /// <summary>
    /// Finds the -Command or -EncodedCommand payload on a PowerShell command line.
    /// Abbreviated switch names and the '/' prefix are recognised as well.
    /// </summary>
    private static (string? Payload, string? Shell, string? Error) ParsePowerShellPayload(IReadOnlyList<string> tokens, string shell)
    {
        for (var i = 1; i < tokens.Count; i++)
        {
            if (!TryGetPowerShellSwitch(tokens[i], out var name))
                continue;

            if ("command".StartsWith(name, StringComparison.OrdinalIgnoreCase))
            {
                var payload = string.Join(" ", tokens.Skip(i + 1)).Trim();
                return string.IsNullOrWhiteSpace(payload)
                    ? ("", shell, "Shell wrapper payload was empty")
                    : (payload, shell, null);
            }

            if ("encodedcommand".StartsWith(name, StringComparison.OrdinalIgnoreCase) ||
                name.Equals("ec", StringComparison.OrdinalIgnoreCase))
            {
                var encoded = i + 1 < tokens.Count ? tokens[i + 1] : null;
                if (string.IsNullOrWhiteSpace(encoded))
                    return ("", shell, "Shell wrapper payload was empty");

                try
                {
                    // EncodedCommand is Base64 over UTF-16LE text.
                    var bytes = Convert.FromBase64String(encoded);
                    var payload = Encoding.Unicode.GetString(bytes).Trim();
                    return string.IsNullOrWhiteSpace(payload)
                        ? ("", shell, "EncodedCommand decoded to an empty payload")
                        : (payload, shell, null);
                }
                catch (FormatException)
                {
                    return ("", shell, "EncodedCommand could not be decoded");
                }
            }
        }

        return default;
    }

    /// <summary>
    /// Extracts the switch name from a PowerShell argument such as "-enc", "--command" or "/c".
    /// </summary>
    private static bool TryGetPowerShellSwitch(string token, out string name)
    {
        name = "";
        if (token.Length < 2)
            return false;

        var marker = token[0];
        var isDash = marker is '-' or '\u2013' or '\u2014' or '\u2015';
        if (!isDash && marker != '/')
            return false;

        // A doubled dash ("--command") names the same switch.
        var start = isDash && token[1] == marker ? 2 : 1;
        name = token[start..];
        return name.Length > 0;
    }

    /// <summary>
    /// True for "-c" and for clustered short options that include c (for example "-lc").
    /// </summary>
    private static bool IsPosixCommandFlag(string token) =>
        token.Equals("-c", StringComparison.OrdinalIgnoreCase) ||
        (token.Length >= 2 && token[0] == '-' && token[1] != '-' && token.Contains('c'));

    /// <summary>True when <paramref name="executable"/> is <paramref name="baseName"/> with or without ".exe".</summary>
    private static bool IsExecutable(string executable, string baseName) =>
        executable.Equals(baseName, StringComparison.OrdinalIgnoreCase) ||
        executable.Equals(baseName + ".exe", StringComparison.OrdinalIgnoreCase);

    /// <summary>
    /// Returns the file-name part of an executable token, treating '\', '/' and ':' as
    /// separators regardless of the platform this code runs on.
    /// </summary>
    private static string GetExecutableName(string token)
    {
        var separator = token.LastIndexOfAny(['\\', '/', ':']);
        return separator >= 0 ? token[(separator + 1)..] : token;
    }

    /// <summary>
    /// Splits a command line at unquoted command separators: ';', '&amp;', '&amp;&amp;', '||'
    /// and line breaks. A single '|' is left in place.
    /// </summary>
    private static List<string> SplitTopLevelCommands(string command)
    {
        var parts = new List<string>();
        var current = new StringBuilder();
        var inSingleQuotes = false;
        var inDoubleQuotes = false;

        for (var i = 0; i < command.Length; i++)
        {
            var c = command[i];

            if (c == '"' && !inSingleQuotes)
            {
                inDoubleQuotes = !inDoubleQuotes;
                current.Append(c);
                continue;
            }

            if (c == '\'' && !inDoubleQuotes)
            {
                inSingleQuotes = !inSingleQuotes;
                current.Append(c);
                continue;
            }

            if (!inSingleQuotes && !inDoubleQuotes)
            {
                if (c == ';' || c == '&' || c == '\n' || c == '\r')
                {
                    FlushCurrent(parts, current);
                    if (c == '&' && i + 1 < command.Length && command[i + 1] == '&')
                        i++;
                    continue;
                }

                if (c == '|' && i + 1 < command.Length && command[i + 1] == '|')
                {
                    FlushCurrent(parts, current);
                    i++;
                    continue;
                }
            }

            current.Append(c);
        }

        FlushCurrent(parts, current);
        return parts;
    }

    /// <summary>
    /// Splits a command into whitespace-separated tokens, removing the quotes around quoted
    /// tokens. Inside double quotes a backslash is only an escape when it precedes a double
    /// quote, so Windows paths such as "C:\Windows\System32\cmd.exe" keep their separators.
    /// </summary>
    private static List<string> Tokenize(string command)
    {
        var tokens = new List<string>();
        var current = new StringBuilder();
        var inSingleQuotes = false;
        var inDoubleQuotes = false;

        for (var i = 0; i < command.Length; i++)
        {
            var c = command[i];

            if (c == '\\' && inDoubleQuotes)
            {
                var runEnd = i;
                while (runEnd < command.Length && command[runEnd] == '\\')
                    runEnd++;

                var runLength = runEnd - i;
                if (runEnd < command.Length && command[runEnd] == '"')
                {
                    // 2n backslashes + quote => n backslashes, quote ends the string;
                    // 2n+1 backslashes + quote => n backslashes and a literal quote.
                    current.Append('\\', runLength / 2);
                    if (runLength % 2 == 1)
                    {
                        current.Append('"');
                        runEnd++;
                    }
                }
                else
                {
                    current.Append('\\', runLength);
                }

                i = runEnd - 1;
                continue;
            }

            if (c == '"' && !inSingleQuotes)
            {
                inDoubleQuotes = !inDoubleQuotes;
                continue;
            }

            if (c == '\'' && !inDoubleQuotes)
            {
                inSingleQuotes = !inSingleQuotes;
                continue;
            }

            if (!inSingleQuotes && !inDoubleQuotes && char.IsWhiteSpace(c))
            {
                FlushCurrent(tokens, current);
                continue;
            }

            current.Append(c);
        }

        FlushCurrent(tokens, current);
        return tokens;
    }

    /// <summary>
    /// Removes one pair of surrounding quotes, but only when the first and last characters
    /// really enclose the whole value. <c>"a" /c "b"</c> is returned unchanged because its
    /// outer quotes belong to two different quoted tokens.
    /// </summary>
    private static string TrimMatchingQuotes(string value)
    {
        if (value.Length < 2)
            return value;

        var quote = value[0];
        if ((quote != '"' && quote != '\'') || value[^1] != quote)
            return value;

        for (var i = 1; i < value.Length - 1; i++)
        {
            if (value[i] != quote)
                continue;

            // A backslash-escaped double quote does not end the quoted region.
            var escaped = quote == '"' && value[i - 1] == '\\';
            if (!escaped)
                return value;
        }

        return value[1..^1];
    }

    private static string? NormalizeShell(string? shell) =>
        string.IsNullOrWhiteSpace(shell) ? "powershell" : shell.ToLowerInvariant();

    /// <summary>Moves the accumulated text into <paramref name="parts"/> unless it is empty.</summary>
    private static void FlushCurrent(List<string> parts, StringBuilder current)
    {
        if (current.Length == 0)
            return;

        parts.Add(current.ToString());
        current.Clear();
    }
}

View File

@ -88,6 +88,14 @@ public class OpenClawNotification
public string[]? Tags { get; set; } // free-form routing tags
}
/// <summary>
/// Event data for a chat message, as raised through <c>ChatMessageReceived</c>.
/// </summary>
public class ChatMessageEventArgs : EventArgs
{
/// <summary>Key of the session the message belongs to; defaults to "main".</summary>
public string SessionKey { get; set; } = "main";
/// <summary>Role of the message author (the gateway client emits "assistant").</summary>
public string Role { get; set; } = "";
/// <summary>The message text.</summary>
public string Message { get; set; } = "";
/// <summary>Whether the message was emitted as a final one.</summary>
public bool IsFinal { get; set; }
}
/// <summary>
/// A user-defined notification categorization rule.
/// </summary>

View File

@ -41,8 +41,11 @@ public class OpenClawGatewayClient : WebSocketClientBase
private GatewayUsageStatusInfo? _usageStatus;
private GatewayCostUsageInfo? _usageCost;
private readonly Dictionary<string, string> _pendingRequestMethods = new();
private readonly Dictionary<string, PendingChatPreviewState> _pendingChatPreviewSessionKeys = new();
private readonly Dictionary<string, string> _lastAssistantMessagesBySession = new();
private readonly Dictionary<string, TaskCompletionSource<bool>> _pendingChatSendRequests = new();
private readonly object _pendingRequestLock = new();
private readonly object _pendingChatPreviewLock = new();
private readonly object _pendingChatSendLock = new();
private readonly object _sessionsLock = new();
private readonly object _nodesLock = new();
@ -58,11 +61,19 @@ public class OpenClawGatewayClient : WebSocketClientBase
private bool _usageCostUnsupported;
private bool _sessionPreviewUnsupported;
private bool _nodeListUnsupported;
private string _defaultChatSessionKey = DefaultChatSessionKey;
private bool _operatorReadScopeUnavailable;
private bool _pairingRequiredAwaitingApproval;
private IReadOnlyList<UserNotificationRule>? _userRules;
private bool _preferStructuredCategories = true;
private const string DefaultChatSessionKey = "main";
/// <summary>
/// Bookkeeping for a session whose final assistant reply is being fetched through sessions.preview.
/// </summary>
private sealed class PendingChatPreviewState
{
/// <summary>The last assistant text known for the session when the preview was requested, if any.</summary>
public string? LastKnownAssistantText { get; init; }
/// <summary>Number of preview retries issued so far for this session.</summary>
public int AttemptCount { get; set; }
}
/// <summary>
/// Controls whether structured notification metadata (Intent, Channel) takes priority
/// over keyword-based classification. Call after construction and whenever settings change.
@ -111,15 +122,18 @@ public class OpenClawGatewayClient : WebSocketClientBase
protected override void OnDisconnected()
{
ClearPendingRequests();
ClearPendingChatPreviewSessions();
}
protected override void OnDisposing()
{
ClearPendingRequests();
ClearPendingChatPreviewSessions();
}
// Events
public event EventHandler<OpenClawNotification>? NotificationReceived;
public event EventHandler<ChatMessageEventArgs>? ChatMessageReceived;
public event EventHandler<AgentActivity>? ActivityChanged;
public event EventHandler<ChannelHealth[]>? ChannelHealthUpdated;
public event EventHandler<SessionInfo[]>? SessionsUpdated;
@ -191,35 +205,32 @@ public class OpenClawGatewayClient : WebSocketClientBase
}
}
public async Task SendChatMessageAsync(string message, string? sessionKey = null)
public async Task SendChatMessageAsync(string message, string? sessionKey = null, string? idempotencyKey = null)
{
if (!IsConnected)
throw new InvalidOperationException("Gateway connection is not open");
if (string.IsNullOrWhiteSpace(message))
throw new ArgumentException("Message is required", nameof(message));
var effectiveSessionKey = string.IsNullOrWhiteSpace(sessionKey)
? _mainSessionKey
: sessionKey.Trim();
var requestId = Guid.NewGuid().ToString();
var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
TrackPendingChatSend(requestId, completion);
var resolvedSessionKey = ResolveChatSessionKey(sessionKey);
var resolvedIdempotencyKey = string.IsNullOrWhiteSpace(idempotencyKey)
? Guid.NewGuid().ToString()
: idempotencyKey;
var parameters = BuildChatSendParameters(message, resolvedSessionKey, resolvedIdempotencyKey);
var req = new
TrackPendingRequest(requestId, "chat.send");
try
{
type = "req",
id = requestId,
method = "chat.send",
@params = new
{
sessionKey = effectiveSessionKey,
message,
idempotencyKey = Guid.NewGuid().ToString()
}
};
await SendRawAsync(JsonSerializer.Serialize(req));
await SendRawAsync(SerializeRequest(requestId, "chat.send", parameters));
}
catch
{
RemovePendingRequest(requestId);
throw;
}
var completedTask = await Task.WhenAny(completion.Task, Task.Delay(5000, CancellationToken));
if (completedTask != completion.Task)
@ -459,6 +470,31 @@ public class OpenClawGatewayClient : WebSocketClientBase
}
}
/// <summary>
/// Builds the parameter object for the connect request: protocol range, client
/// identity, requested operator scopes and the auth token.
/// </summary>
private object BuildConnectParameters()
{
    var client = new
    {
        id = "cli",
        version = "1.0.0",
        platform = "windows",
        mode = "cli",
        displayName = "OpenClaw Windows Tray"
    };

    var scopes = new[]
    {
        "operator.read",
        "operator.write",
        "operator.admin",
        "operator.approvals",
        "operator.pairing"
    };

    var auth = new { token = _token };

    // Property order is kept as-is because it determines the serialized member order.
    return new
    {
        minProtocol = 3,
        maxProtocol = 3,
        client,
        role = "operator",
        scopes,
        caps = Array.Empty<string>(),
        commands = Array.Empty<string>(),
        permissions = new { },
        auth,
        locale = "en-US",
        userAgent = "openclaw-windows-tray/1.0.0"
    };
}
private async Task SendTrackedRequestAsync(string method, object? parameters = null)
{
if (!IsConnected) return;
@ -666,6 +702,7 @@ public class OpenClawGatewayClient : WebSocketClientBase
// Handle handshake acknowledgement payload.
if (payload.TryGetProperty("type", out var t) && t.GetString() == "hello-ok")
{
UpdateDefaultChatSessionKeyFromHello(payload);
_pairingRequiredAwaitingApproval = false;
_operatorDeviceId = TryGetHandshakeDeviceId(payload);
_grantedOperatorScopes = TryGetHandshakeScopes(payload);
@ -677,7 +714,6 @@ public class OpenClawGatewayClient : WebSocketClientBase
_connectAuthToken = newDeviceToken;
_logger.Info("Operator device token stored for reconnect");
}
_logger.Info("Handshake complete (hello-ok)");
if (!string.IsNullOrWhiteSpace(_operatorDeviceId))
{
@ -1257,13 +1293,17 @@ public class OpenClawGatewayClient : WebSocketClientBase
{
var rawText = root.GetRawText();
_logger.Debug($"Chat event received: {rawText.Substring(0, Math.Min(200, rawText.Length))}");
if (!root.TryGetProperty("payload", out var payload)) return;
var sessionKey = NormalizeChatSessionKey(TryGetSessionKey(root, payload));
var isFinal = !payload.TryGetProperty("state", out var state) ||
string.Equals(state.GetString(), "final", StringComparison.OrdinalIgnoreCase);
var emittedAssistantText = false;
// Try new format: payload.message.role + payload.message.content[].text
if (payload.TryGetProperty("message", out var message))
{
if (message.TryGetProperty("role", out var role) && role.GetString() == "assistant")
var roleName = GetString(message, "role");
if (roleName == "assistant")
{
// Extract text from content array
if (message.TryGetProperty("content", out var content) && content.ValueKind == JsonValueKind.Array)
@ -1274,11 +1314,11 @@ public class OpenClawGatewayClient : WebSocketClientBase
item.TryGetProperty("text", out var textProp))
{
var text = textProp.GetString() ?? "";
if (!string.IsNullOrEmpty(text) &&
payload.TryGetProperty("state", out var state) &&
state.GetString() == "final")
if (!string.IsNullOrEmpty(text) && isFinal)
{
emittedAssistantText = true;
_logger.Info($"Assistant response: {text.Substring(0, Math.Min(100, text.Length))}...");
EmitChatMessage(sessionKey, roleName ?? "assistant", text, isFinal);
EmitChatNotification(text);
}
}
@ -1291,14 +1331,40 @@ public class OpenClawGatewayClient : WebSocketClientBase
else if (payload.TryGetProperty("text", out var textProp))
{
var text = textProp.GetString() ?? "";
if (payload.TryGetProperty("role", out var role) &&
role.GetString() == "assistant" &&
var roleName = GetString(payload, "role");
if (roleName == "assistant" &&
!string.IsNullOrEmpty(text))
{
emittedAssistantText = true;
_logger.Info($"Assistant response (legacy): {text.Substring(0, Math.Min(100, text.Length))}");
EmitChatMessage(sessionKey, roleName, text, isFinal: true);
EmitChatNotification(text);
}
}
if (isFinal && !emittedAssistantText)
{
RequestChatPreviewForFinalState(sessionKey);
}
}
/// <summary>
/// Raises <c>ChatMessageReceived</c> for a chat message. A final assistant message is
/// first recorded as the session's last assistant text.
/// </summary>
private void EmitChatMessage(string sessionKey, string role, string text, bool isFinal)
{
    var isFinalAssistantReply =
        isFinal && string.Equals(role, "assistant", StringComparison.OrdinalIgnoreCase);

    if (isFinalAssistantReply)
    {
        var cacheKey = NormalizeChatSessionKey(sessionKey);
        lock (_pendingChatPreviewLock)
        {
            _lastAssistantMessagesBySession[cacheKey] = text;
        }
    }

    var args = new ChatMessageEventArgs
    {
        SessionKey = sessionKey,
        Role = role,
        Message = text,
        IsFinal = isFinal
    };
    ChatMessageReceived?.Invoke(this, args);
}
private void EmitChatNotification(string text)
@ -1512,6 +1578,7 @@ public class OpenClawGatewayClient : WebSocketClientBase
}
snapshot = GetSessionListInternal();
UpdateDefaultChatSessionKeyFromSessions();
}
SessionsUpdated?.Invoke(this, snapshot);
@ -1540,6 +1607,205 @@ public class OpenClawGatewayClient : WebSocketClientBase
PopulateSessionFromObject(session, item);
_sessions[session.Key] = session;
if (session.IsMain)
{
UpdateDefaultChatSessionKey(session.Key);
}
}
/// <summary>
/// Builds the parameter object for a chat.send request from the message, the
/// target session key and the idempotency key.
/// </summary>
private object BuildChatSendParameters(string message, string sessionKey, string idempotencyKey) =>
    new { message, sessionKey, idempotencyKey };
/// <summary>
/// Chooses the session key for an outgoing chat message: the normalized explicit key
/// when one is supplied, otherwise the current default key, otherwise the built-in default.
/// </summary>
private string ResolveChatSessionKey(string? sessionKey)
{
    if (string.IsNullOrWhiteSpace(sessionKey))
    {
        var fallback = _defaultChatSessionKey;
        return string.IsNullOrWhiteSpace(fallback) ? DefaultChatSessionKey : fallback;
    }

    return NormalizeChatSessionKey(sessionKey);
}
/// <summary>
/// Reads the main session key from <c>snapshot.sessionDefaults</c> of a hello payload
/// ("mainKey", falling back to "mainSessionKey") and stores it as the default chat session key.
/// Does nothing when the payload does not carry that structure.
/// </summary>
private void UpdateDefaultChatSessionKeyFromHello(JsonElement payload)
{
    var hasSessionDefaults =
        payload.TryGetProperty("snapshot", out var snapshot) &&
        snapshot.ValueKind == JsonValueKind.Object &&
        snapshot.TryGetProperty("sessionDefaults", out var sessionDefaults) &&
        sessionDefaults.ValueKind == JsonValueKind.Object;

    if (!hasSessionDefaults)
    {
        return;
    }

    var announcedKey = GetString(sessionDefaults, "mainKey");
    announcedKey ??= GetString(sessionDefaults, "mainSessionKey");

    if (string.IsNullOrWhiteSpace(announcedKey))
    {
        return;
    }

    UpdateDefaultChatSessionKey(announcedKey);
}
/// <summary>
/// Adopts the key of the first known session flagged as main (with a non-blank key)
/// as the default chat session key.
/// </summary>
private void UpdateDefaultChatSessionKeyFromSessions()
{
    foreach (var candidate in _sessions.Values)
    {
        if (!candidate.IsMain || string.IsNullOrWhiteSpace(candidate.Key))
        {
            continue;
        }

        UpdateDefaultChatSessionKey(candidate.Key);
        return;
    }
}
/// <summary>
/// Stores the normalized form of <paramref name="sessionKey"/> as the default chat
/// session key; blank input leaves the current default untouched.
/// </summary>
private void UpdateDefaultChatSessionKey(string? sessionKey)
{
    if (string.IsNullOrWhiteSpace(sessionKey))
    {
        return;
    }

    _defaultChatSessionKey = NormalizeChatSessionKey(sessionKey);
}
/// <summary>
/// Starts a sessions.preview lookup for a session whose final chat event carried no
/// assistant text. Skipped when the key is blank, previews are unsupported, or a
/// lookup for the same session is already pending.
/// </summary>
private void RequestChatPreviewForFinalState(string sessionKey)
{
    if (_sessionPreviewUnsupported || string.IsNullOrWhiteSpace(sessionKey))
    {
        return;
    }

    var previewKey = NormalizeChatSessionKey(sessionKey);

    lock (_pendingChatPreviewLock)
    {
        // Remember what was last shown so a preview that still returns it can be told apart.
        _lastAssistantMessagesBySession.TryGetValue(previewKey, out var previousReply);

        var state = new PendingChatPreviewState
        {
            LastKnownAssistantText = previousReply,
            AttemptCount = 0
        };

        if (!_pendingChatPreviewSessionKeys.TryAdd(previewKey, state))
        {
            return;
        }
    }

    RequestChatPreviewForFinalStateAsync(previewKey, delayMs: 0);
}
/// <summary>
/// Fires a background sessions.preview request for the session, optionally after a delay.
/// The call returns immediately; if the request throws, the failure is logged and the
/// session's pending-preview entry is removed.
/// </summary>
private void RequestChatPreviewForFinalStateAsync(string normalizedSessionKey, int delayMs)
{
    _ = Task.Run(() => SendPreviewRequestAsync());

    async Task SendPreviewRequestAsync()
    {
        try
        {
            if (delayMs > 0)
            {
                await Task.Delay(delayMs);
            }

            await RequestSessionPreviewAsync([normalizedSessionKey], limit: 2, maxChars: 4000);
        }
        catch (Exception failure)
        {
            _logger.Warn($"sessions.preview request failed for {normalizedSessionKey}: {failure.Message}");

            lock (_pendingChatPreviewLock)
            {
                _pendingChatPreviewSessionKeys.Remove(normalizedSessionKey);
            }
        }
    }
}
/// <summary>
/// Resolves pending preview lookups from a sessions.preview response. For every preview
/// whose session has a pending lookup, the latest assistant text is emitted as a final
/// chat message and notification.
/// </summary>
/// <remarks>
/// A preview that has no assistant text, or still shows the reply that was already known
/// when the lookup started, is retried up to three times with a growing delay. After the
/// retries are used up the pending entry is always removed, so a preview without
/// assistant text can no longer block later lookups for the same session.
/// </remarks>
/// <param name="payload">The parsed sessions.preview response.</param>
private void EmitPendingChatPreviewMessages(SessionsPreviewPayloadInfo payload)
{
    const int MaxPreviewRetries = 3;

    foreach (var preview in payload.Previews)
    {
        var normalizedSessionKey = NormalizeChatSessionKey(preview.Key);

        var assistantText = preview.Items
            .LastOrDefault(item => string.Equals(item.Role, "assistant", StringComparison.OrdinalIgnoreCase))?
            .Text?
            .Trim();
        var hasAssistantText = !string.IsNullOrWhiteSpace(assistantText);

        var retryAttempt = 0;

        // Read and update the pending state under the lock that guards the dictionary.
        lock (_pendingChatPreviewLock)
        {
            if (!_pendingChatPreviewSessionKeys.TryGetValue(normalizedSessionKey, out var pendingState))
            {
                continue;
            }

            var notYetUpdated =
                !hasAssistantText ||
                string.Equals(assistantText, pendingState.LastKnownAssistantText, StringComparison.Ordinal);

            if (notYetUpdated && pendingState.AttemptCount < MaxPreviewRetries)
            {
                retryAttempt = ++pendingState.AttemptCount;
            }
            else
            {
                _pendingChatPreviewSessionKeys.Remove(normalizedSessionKey);
            }
        }

        if (retryAttempt > 0)
        {
            _logger.Warn(hasAssistantText
                ? $"sessions.preview returned the previous assistant reply for {normalizedSessionKey}; retrying preview ({retryAttempt}/3)"
                : $"sessions.preview returned no assistant reply for {normalizedSessionKey}; retrying preview ({retryAttempt}/3)");
            RequestChatPreviewForFinalStateAsync(normalizedSessionKey, delayMs: 400 * retryAttempt);
            continue;
        }

        if (!hasAssistantText || assistantText is null)
        {
            // Nothing to show even after retrying; the pending entry has already been released.
            continue;
        }

        _logger.Info($"Assistant response (preview): {assistantText.Substring(0, Math.Min(100, assistantText.Length))}...");
        EmitChatMessage(normalizedSessionKey, "assistant", assistantText, isFinal: true);
        EmitChatNotification(assistantText);
    }
}
/// <summary>
/// Forgets all pending preview lookups and all remembered assistant replies.
/// </summary>
private void ClearPendingChatPreviewSessions()
{
    lock (_pendingChatPreviewLock)
    {
        _lastAssistantMessagesBySession.Clear();
        _pendingChatPreviewSessionKeys.Clear();
    }
}
/// <summary>
/// Maps a session key to its canonical form: blank keys, "main", and any key that
/// contains ":main:" become the default chat session key; other keys are returned unchanged.
/// </summary>
private static string NormalizeChatSessionKey(string? sessionKey)
{
    if (string.IsNullOrWhiteSpace(sessionKey))
    {
        return DefaultChatSessionKey;
    }

    // NOTE(review): any key with a ":main:" segment is collapsed to the default key -
    // confirm this is intended for keys that carry further segments after ":main:".
    var refersToMainSession =
        string.Equals(sessionKey, "main", StringComparison.Ordinal) ||
        sessionKey.Contains(":main:", StringComparison.Ordinal);

    if (refersToMainSession)
    {
        return DefaultChatSessionKey;
    }

    return sessionKey;
}
/// <summary>
/// Reads the session key of a chat event, preferring the event root over its payload.
/// </summary>
/// <param name="root">The event object.</param>
/// <param name="payload">The event's payload element.</param>
/// <returns>
/// The first "sessionKey" that is a JSON string, or <c>null</c> when neither element has one.
/// A "sessionKey" of any other JSON kind is ignored rather than read, because
/// <c>JsonElement.GetString()</c> throws for non-string values.
/// </returns>
private static string? TryGetSessionKey(JsonElement root, JsonElement payload)
{
    if (root.ValueKind == JsonValueKind.Object &&
        root.TryGetProperty("sessionKey", out var rootSessionKey) &&
        rootSessionKey.ValueKind == JsonValueKind.String)
    {
        return rootSessionKey.GetString();
    }

    if (payload.ValueKind == JsonValueKind.Object &&
        payload.TryGetProperty("sessionKey", out var payloadSessionKey) &&
        payloadSessionKey.ValueKind == JsonValueKind.String)
    {
        return payloadSessionKey.GetString();
    }

    return null;
}
private void PopulateSessionFromObject(SessionInfo session, JsonElement item)
@ -1853,6 +2119,7 @@ public class OpenClawGatewayClient : WebSocketClientBase
}
SessionPreviewUpdated?.Invoke(this, previewPayload);
EmitPendingChatPreviewMessages(previewPayload);
}
catch (Exception ex)
{

View File

@ -1,3 +1,5 @@
using System;
using System.Text.Json.Serialization;
using System.Text.Json;
namespace OpenClaw.Shared;
@ -32,6 +34,11 @@ public class SettingsData
public bool NotifyChatResponses { get; set; } = true;
public bool PreferStructuredCategories { get; set; } = true;
public List<UserNotificationRule>? UserRules { get; set; }
public VoiceSettings Voice { get; set; } = new();
public VoiceRepeaterWindowSettings VoiceRepeaterWindow { get; set; } = new();
public VoiceProviderConfigurationStore VoiceProviderConfiguration { get; set; } = new();
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public VoiceProviderCredentials? VoiceProviderCredentials { get; set; }
private static readonly JsonSerializerOptions s_options = new() { WriteIndented = true };
@ -43,11 +50,39 @@ public class SettingsData
return null;
try
{
return JsonSerializer.Deserialize<SettingsData>(json);
return JsonSerializer.Deserialize<SettingsData>(MigrateLegacyVoiceJson(json));
}
catch (JsonException)
{
return null;
}
}
/// <summary>
/// Rewrites legacy voice property names and enum values in raw settings JSON to the
/// current names before deserialization.
/// </summary>
/// <param name="json">Raw settings JSON text.</param>
/// <returns>The text with every legacy token replaced (ordinal, case-sensitive).</returns>
/// <remarks>
/// This is plain text substitution, not JSON-aware: property names only match when the
/// colon directly follows the closing quote, and enum values only match with zero or
/// one space after the colon.
/// </remarks>
private static string MigrateLegacyVoiceJson(string json)
{
    // Applied top to bottom, in the same order as the rename list has always been.
    (string Legacy, string Current)[] renames =
    [
        ("\"WakeWord\":", "\"VoiceWake\":"),
        ("\"AlwaysOn\":", "\"TalkMode\":"),
        ("\"WakeWordModelId\":", "\"VoiceWakeModelId\":"),
        ("\"WakeWordLoaded\":", "\"VoiceWakeLoaded\":"),
        ("\"LastWakeWordUtc\":", "\"LastVoiceWakeUtc\":"),
        ("\"Mode\":\"WakeWord\"", "\"Mode\":\"VoiceWake\""),
        ("\"Mode\": \"WakeWord\"", "\"Mode\": \"VoiceWake\""),
        ("\"Mode\":\"AlwaysOn\"", "\"Mode\":\"TalkMode\""),
        ("\"Mode\": \"AlwaysOn\"", "\"Mode\": \"TalkMode\""),
        ("\"State\":\"ListeningForWakeWord\"", "\"State\":\"ListeningForVoiceWake\""),
        ("\"State\": \"ListeningForWakeWord\"", "\"State\": \"ListeningForVoiceWake\""),
    ];

    var migrated = json;
    foreach (var (legacy, current) in renames)
    {
        migrated = migrated.Replace(legacy, current, StringComparison.Ordinal);
    }

    return migrated;
}
}
/// <summary>
/// Settings object exposed as <c>SettingsData.VoiceRepeaterWindow</c>.
/// NOTE(review): the window code that consumes these values is not part of this file;
/// per-member meaning is inferred from the names — confirm against the consumer.
/// </summary>
public sealed class VoiceRepeaterWindowSettings
{
// Defaults to true.
public bool AutoScroll { get; set; } = true;
// Defaults to true.
public bool FloatingEnabled { get; set; } = true;
// Defaults to false; presumably indicates whether Width/Height/X/Y hold a saved placement — TODO confirm.
public bool HasSavedPlacement { get; set; }
// Defaults to 13; unit is not stated here.
public double TextSize { get; set; } = 13;
// Placement values are nullable and default to null (nothing stored).
public int? Width { get; set; }
public int? Height { get; set; }
public int? X { get; set; }
public int? Y { get; set; }
}

View File

@ -0,0 +1,354 @@
using System;
using System.Collections.ObjectModel;
using System.Text.Json;
using System.Text.Json.Serialization;
namespace OpenClaw.Shared;
/// <summary>
/// String identifiers of the voice commands, plus a read-only list of all of them.
/// </summary>
public static class VoiceCommands
{
    public const string ListDevices = "voice.devices.list";
    public const string GetSettings = "voice.settings.get";
    public const string SetSettings = "voice.settings.set";
    public const string GetStatus = "voice.status.get";
    public const string Start = "voice.start";
    public const string Stop = "voice.stop";
    public const string Pause = "voice.pause";
    public const string Resume = "voice.resume";
    public const string Skip = "voice.response.skip";

    /// <summary>
    /// Every command identifier, in declaration order. The same read-only wrapper
    /// instance is returned on each access.
    /// </summary>
    public static IReadOnlyList<string> All { get; } = new ReadOnlyCollection<string>(new[]
    {
        ListDevices,
        GetSettings,
        SetSettings,
        GetStatus,
        Start,
        Stop,
        Pause,
        Resume,
        Skip,
    });
}
/// <summary>
/// Voice activation mode. JSON (de)serialization goes through
/// <see cref="VoiceActivationModeJsonConverter"/>, which writes the member name as a
/// string and also reads the legacy names <c>WakeWord</c> (as <see cref="VoiceWake"/>)
/// and <c>AlwaysOn</c> (as <see cref="TalkMode"/>).
/// </summary>
[JsonConverter(typeof(VoiceActivationModeJsonConverter))]
public enum VoiceActivationMode
{
// Default value of VoiceSettings.Mode and VoiceStatusInfo.Mode.
Off,
VoiceWake,
TalkMode
}
/// <summary>
/// Voice runtime state reported through <see cref="VoiceStatusInfo.State"/>. JSON
/// (de)serialization goes through <see cref="VoiceRuntimeStateJsonConverter"/>, which
/// writes the member name as a string and also reads the legacy name
/// <c>ListeningForWakeWord</c> as <see cref="ListeningForVoiceWake"/>.
/// </summary>
[JsonConverter(typeof(VoiceRuntimeStateJsonConverter))]
public enum VoiceRuntimeState
{
// Default of VoiceStatusInfo.State; also the converter's fallback for unrecognized names.
Stopped,
Paused,
Idle,
Arming,
ListeningForVoiceWake,
ListeningContinuously,
RecordingUtterance,
SubmittingAudio,
AwaitingResponse,
PlayingResponse,
Error
}
/// <summary>
/// Voice settings. Exposed as <c>SettingsData.Voice</c>, returned by
/// <c>IVoiceConfigurationApi.GetSettingsAsync</c> and carried by
/// <see cref="VoiceSettingsUpdateArgs.Settings"/>.
/// </summary>
public sealed class VoiceSettings
{
// Defaults to Off.
public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off;
// Defaults to false.
public bool Enabled { get; set; }
public bool ShowRepeaterAtStartup { get; set; } = true;
public bool ShowConversationToasts { get; set; }
// Both provider ids default to VoiceProviderIds.Windows ("windows").
public string SpeechToTextProviderId { get; set; } = VoiceProviderIds.Windows;
public string TextToSpeechProviderId { get; set; } = VoiceProviderIds.Windows;
// Null by default; presumably null means "use the system default device" — TODO confirm.
public string? InputDeviceId { get; set; }
public string? OutputDeviceId { get; set; }
// 16000 and 80 match the fallbacks VoiceCaptureMath.ResolveDesiredSamplesPerQuantum applies to non-positive inputs.
public int SampleRateHz { get; set; } = 16000;
public int CaptureChunkMs { get; set; } = 80;
public bool BargeInEnabled { get; set; } = true;
// Nested settings objects are never null by default.
public VoiceWakeSettings VoiceWake { get; set; } = new();
public TalkModeSettings TalkMode { get; set; } = new();
}
/// <summary>
/// Settings nested under <see cref="VoiceSettings.VoiceWake"/>.
/// NOTE(review): the wake-word engine that reads these values is not visible here;
/// units are taken from the <c>Ms</c> name suffix — confirm.
/// </summary>
public sealed class VoiceWakeSettings
{
// Defaults: engine "NanoWakeWord", model "hey_openclaw".
public string Engine { get; set; } = "NanoWakeWord";
public string ModelId { get; set; } = "hey_openclaw";
// Defaults to 0.65.
public float TriggerThreshold { get; set; } = 0.65f;
public int TriggerCooldownMs { get; set; } = 2000;
public int PreRollMs { get; set; } = 1200;
public int EndSilenceMs { get; set; } = 900;
}
/// <summary>
/// Settings nested under <see cref="VoiceSettings.TalkMode"/>.
/// NOTE(review): units are taken from the <c>Ms</c> name suffix; the consumer is not visible here.
/// </summary>
public sealed class TalkModeSettings
{
public int MinSpeechMs { get; set; } = 250;
// Same default as VoiceWakeSettings.EndSilenceMs.
public int EndSilenceMs { get; set; } = 900;
public int MaxUtteranceMs { get; set; } = 15000;
}
/// <summary>
/// Description of one audio device; the element type returned by
/// <c>IVoiceConfigurationApi.ListDevicesAsync</c>.
/// </summary>
public sealed class VoiceAudioDeviceInfo
{
// String members default to the empty string, never null.
public string DeviceId { get; set; } = "";
public string Name { get; set; } = "";
public bool IsDefault { get; set; }
// IsInput and IsOutput are independent flags; nothing here makes them mutually exclusive.
public bool IsInput { get; set; }
public bool IsOutput { get; set; }
}
/// <summary>
/// Snapshot of voice status; the result type of every operation on
/// <c>IVoiceRuntimeControlApi</c>.
/// </summary>
public sealed class VoiceStatusInfo
{
public bool Available { get; set; }
public bool Running { get; set; }
// Defaults: Mode = Off, State = Stopped.
public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off;
public VoiceRuntimeState State { get; set; } = VoiceRuntimeState.Stopped;
public string? SessionKey { get; set; }
public string? InputDeviceId { get; set; }
public string? OutputDeviceId { get; set; }
// The three VoiceWake members below are the renamed forms of the legacy
// WakeWordModelId / WakeWordLoaded / LastWakeWordUtc keys rewritten by
// SettingsData.MigrateLegacyVoiceJson.
public string? VoiceWakeModelId { get; set; }
public bool VoiceWakeLoaded { get; set; }
// NOTE(review): the Utc suffix suggests UTC timestamps; DateTime.Kind is not enforced here.
public DateTime? LastVoiceWakeUtc { get; set; }
public DateTime? LastUtteranceUtc { get; set; }
public int PendingReplyCount { get; set; }
public bool CanSkipReply { get; set; }
public string? CurrentReplyPreview { get; set; }
public string? LastError { get; set; }
}
/// <summary>Arguments for <c>IVoiceRuntimeControlApi.StartAsync</c>. Both members are optional.</summary>
public sealed class VoiceStartArgs
{
// Null by default; presumably null means "use the configured mode" — TODO confirm against the runtime.
public VoiceActivationMode? Mode { get; set; }
public string? SessionKey { get; set; }
}
/// <summary>Arguments for <c>IVoiceRuntimeControlApi.StopAsync</c>.</summary>
public sealed class VoiceStopArgs
{
// Optional free-form reason; null by default.
public string? Reason { get; set; }
}
/// <summary>Arguments for <c>IVoiceRuntimeControlApi.PauseAsync</c> (the whole argument object is optional there).</summary>
public sealed class VoicePauseArgs
{
// Optional free-form reason; null by default.
public string? Reason { get; set; }
}
/// <summary>Arguments for <c>IVoiceRuntimeControlApi.ResumeAsync</c> (the whole argument object is optional there).</summary>
public sealed class VoiceResumeArgs
{
// Optional free-form reason; null by default.
public string? Reason { get; set; }
}
/// <summary>Arguments for <c>IVoiceRuntimeControlApi.SkipCurrentReplyAsync</c> (the whole argument object is optional there).</summary>
public sealed class VoiceSkipArgs
{
// Optional free-form reason; null by default.
public string? Reason { get; set; }
}
/// <summary>Arguments for <c>IVoiceConfigurationApi.UpdateSettingsAsync</c>.</summary>
public sealed class VoiceSettingsUpdateArgs
{
// Never null by default: starts as a VoiceSettings with all of its defaults.
public VoiceSettings Settings { get; set; } = new();
// Defaults to true; presumably controls whether the update is saved to disk — TODO confirm.
public bool Persist { get; set; } = true;
}
/// <summary>
/// Provider identifier strings. <see cref="Windows"/> is the default for both
/// <c>VoiceSettings.SpeechToTextProviderId</c> and <c>VoiceSettings.TextToSpeechProviderId</c>;
/// <see cref="MiniMax"/> and <see cref="ElevenLabs"/> are the targets of
/// <c>MigrateLegacyCredentials</c>.
/// </summary>
public static class VoiceProviderIds
{
public const string Windows = "windows";
public const string HttpWs = "http-ws";
public const string FoundryLocal = "foundry-local";
public const string OpenAiWhisper = "openai-whisper";
public const string ElevenLabsSpeechToText = "elevenlabs-stt";
public const string AzureAiSpeech = "azure-ai-speech";
public const string SherpaOnnx = "sherpa-onnx";
public const string MiniMax = "minimax";
public const string ElevenLabs = "elevenlabs";
}
/// <summary>
/// Values for <c>VoiceProviderOption.Runtime</c>; <see cref="Windows"/> is that property's default.
/// </summary>
public static class VoiceProviderRuntimeIds
{
public const string Windows = "windows";
public const string Streaming = "streaming";
public const string Embedded = "embedded";
public const string Cloud = "cloud";
}
/// <summary>
/// Setting-key names for entries in <c>VoiceProviderConfiguration.Values</c>.
/// <see cref="ApiKey"/> is the default <c>ApiKeySettingKey</c> of both text-to-speech
/// contracts; <see cref="ApiKey"/>, <see cref="Model"/> and <see cref="VoiceId"/> are
/// the keys written by <c>MigrateLegacyCredentials</c>.
/// </summary>
public static class VoiceProviderSettingKeys
{
public const string ApiKey = "apiKey";
public const string Endpoint = "endpoint";
public const string Model = "model";
public const string ModelPath = "modelPath";
public const string VoiceId = "voiceId";
public const string VoiceSettingsJson = "voiceSettingsJson";
}
/// <summary>
/// Values for the <c>ResponseAudioMode</c> property of the text-to-speech contracts.
/// </summary>
public static class VoiceTextToSpeechResponseModes
{
// In VoiceCloudTextToSpeechClient.SynthesizeAsync this mode reads the HTTP response body as the audio stream.
public const string Binary = "binary";
// For any non-binary mode the HTTP path parses the body as JSON and decodes an audio string;
// NOTE(review): the decoder itself is not visible here — hex vs. base64 handling is inferred from the names.
public const string HexJsonString = "hexJsonString";
public const string Base64JsonString = "base64JsonString";
}
/// <summary>
/// Legacy per-provider credential block. Exposed as the nullable
/// <c>SettingsData.VoiceProviderCredentials</c> (omitted from JSON when null) and copied
/// into a <see cref="VoiceProviderConfigurationStore"/> by <c>MigrateLegacyCredentials</c>.
/// </summary>
public sealed class VoiceProviderCredentials
{
public string? MiniMaxApiKey { get; set; }
// Unlike the other members, the MiniMax model and voice id have non-null defaults,
// so a freshly constructed instance always carries MiniMax values.
public string MiniMaxModel { get; set; } = "speech-2.8-turbo";
public string MiniMaxVoiceId { get; set; } = "English_MatureBoss";
public string? ElevenLabsApiKey { get; set; }
public string? ElevenLabsModel { get; set; }
public string? ElevenLabsVoiceId { get; set; }
}
/// <summary>
/// Container for per-provider configuration; exposed as
/// <c>SettingsData.VoiceProviderConfiguration</c>. Lookup and mutation helpers live in
/// <c>VoiceProviderConfigurationStoreExtensions</c>.
/// </summary>
public sealed class VoiceProviderConfigurationStore
{
// Starts empty; the extension helpers match ProviderId case-insensitively (OrdinalIgnoreCase).
public List<VoiceProviderConfiguration> Providers { get; set; } = [];
}
/// <summary>Key/value settings for one provider inside a <see cref="VoiceProviderConfigurationStore"/>.</summary>
public sealed class VoiceProviderConfiguration
{
public string ProviderId { get; set; } = "";
// Created with the default (case-sensitive) comparer; the extension helpers compensate by
// scanning keys with OrdinalIgnoreCase instead of using the indexer for lookups.
public Dictionary<string, string> Values { get; set; } = [];
}
/// <summary>
/// Describes one configurable setting of a provider; element type of
/// <c>VoiceProviderOption.Settings</c>.
/// NOTE(review): the settings UI that consumes these flags is not visible here; member
/// meaning is inferred from the names — confirm.
/// </summary>
public sealed class VoiceProviderSettingDefinition
{
public string Key { get; set; } = "";
public string Label { get; set; } = "";
// Defaults to false.
public bool Secret { get; set; }
// Defaults to true: a definition is required unless explicitly marked otherwise.
public bool Required { get; set; } = true;
public bool JsonValue { get; set; }
public string? DefaultValue { get; set; }
public string? Placeholder { get; set; }
public string? Description { get; set; }
// Starts empty, never null by default.
public List<string> Options { get; set; } = [];
}
/// <summary>
/// Declarative description of an HTTP text-to-speech endpoint, consumed by
/// <c>VoiceCloudTextToSpeechClient.SynthesizeAsync</c> when the provider has no
/// WebSocket contract.
/// </summary>
public sealed class VoiceTextToSpeechHttpContract
{
// Expanded with the request's template values to form the request URL.
public string EndpointTemplate { get; set; } = "";
public string HttpMethod { get; set; } = "POST";
public string AuthenticationHeaderName { get; set; } = "Authorization";
public string? AuthenticationScheme { get; set; } = "Bearer";
public string ApiKeySettingKey { get; set; } = VoiceProviderSettingKeys.ApiKey;
// The client falls back to "application/json" when this is blank.
public string RequestContentType { get; set; } = "application/json";
// When blank, the client sends the request without a body.
public string RequestBodyTemplate { get; set; } = "";
// "binary" (compared case-insensitively) streams the response body as audio;
// any other value makes the client parse the body as JSON.
public string ResponseAudioMode { get; set; } = VoiceTextToSpeechResponseModes.Binary;
// JSON path of the encoded audio string; used only on the non-binary path.
public string? ResponseAudioJsonPath { get; set; }
// NOTE(review): the three status members below are read by ValidateResponseStatus, which is not visible here.
public string? ResponseStatusCodeJsonPath { get; set; }
public string? ResponseStatusMessageJsonPath { get; set; }
public string? SuccessStatusValue { get; set; }
// Passed through as the content type of the synthesized result.
public string OutputContentType { get; set; } = "audio/mpeg";
}
/// <summary>
/// Declarative description of a WebSocket text-to-speech exchange. When a provider
/// exposes one, <c>VoiceCloudTextToSpeechClient.SynthesizeAsync</c> uses it in
/// preference to the HTTP contract.
/// </summary>
public sealed class VoiceTextToSpeechWebSocketContract
{
// Expanded with the request's template values to form the socket URI.
public string EndpointTemplate { get; set; } = "";
public string AuthenticationHeaderName { get; set; } = "Authorization";
public string? AuthenticationScheme { get; set; } = "Bearer";
public string ApiKeySettingKey { get; set; } = VoiceProviderSettingKeys.ApiKey;
// When non-blank, the client waits for one message after connecting and validates it against this event name.
public string ConnectSuccessEventName { get; set; } = "connected_success";
public string StartMessageTemplate { get; set; } = "";
// When non-blank, the client waits for one message after the start message and validates it against this event name.
public string StartSuccessEventName { get; set; } = "task_started";
public string ContinueMessageTemplate { get; set; } = "";
// Sent after the continue message unless blank.
public string FinishMessageTemplate { get; set; } = "{ \"event\": \"task_finish\" }";
public string ResponseAudioMode { get; set; } = VoiceTextToSpeechResponseModes.Binary;
// JSON path probed on each received message for an audio chunk.
public string? ResponseAudioJsonPath { get; set; } = "data.audio";
// NOTE(review): the remaining members are consumed by helpers that are not visible here — confirm semantics there.
public string? ResponseStatusCodeJsonPath { get; set; } = "base_resp.status_code";
public string? ResponseStatusMessageJsonPath { get; set; } = "base_resp.status_msg";
public string? FinalFlagJsonPath { get; set; } = "is_final";
public string TaskFailedEventName { get; set; } = "task_failed";
public string? SuccessStatusValue { get; set; } = "0";
public string OutputContentType { get; set; } = "audio/mpeg";
}
/// <summary>
/// One provider entry in a <see cref="VoiceProviderCatalog"/>: identity, availability
/// flags, its setting definitions and optional text-to-speech contracts.
/// </summary>
public sealed class VoiceProviderOption
{
    public string Id { get; set; } = string.Empty;
    public string Name { get; set; } = string.Empty;

    /// <summary>Defaults to <see cref="VoiceProviderRuntimeIds.Windows"/>.</summary>
    public string Runtime { get; set; } = VoiceProviderRuntimeIds.Windows;

    public bool Enabled { get; set; } = true;
    public bool VisibleInSettings { get; set; } = true;

    /// <summary>Drives <see cref="DisplayName"/> and <see cref="DisplayOpacity"/>; defaults to <c>true</c>.</summary>
    public bool Selectable { get; set; } = true;

    public string? Description { get; set; }
    public List<VoiceProviderSettingDefinition> Settings { get; set; } = new();
    public VoiceTextToSpeechHttpContract? TextToSpeechHttp { get; set; }
    public VoiceTextToSpeechWebSocketContract? TextToSpeechWebSocket { get; set; }

    /// <summary>
    /// <see cref="Name"/> for selectable providers, otherwise the name followed by
    /// " (coming soon)". Not serialized.
    /// </summary>
    [JsonIgnore]
    public string DisplayName
    {
        get
        {
            if (Selectable)
            {
                return Name;
            }

            return $"{Name} (coming soon)";
        }
    }

    /// <summary>1.0 for selectable providers, 0.55 otherwise. Not serialized.</summary>
    [JsonIgnore]
    public double DisplayOpacity
    {
        get
        {
            if (Selectable)
            {
                return 1.0;
            }

            return 0.55;
        }
    }
}
/// <summary>
/// The available providers, split by capability; returned by
/// <c>IVoiceConfigurationApi.GetProviderCatalog</c>.
/// </summary>
public sealed class VoiceProviderCatalog
{
// Both lists start empty, never null by default.
public List<VoiceProviderOption> SpeechToTextProviders { get; set; } = [];
public List<VoiceProviderOption> TextToSpeechProviders { get; set; } = [];
}
/// <summary>
/// Reads and writes <see cref="VoiceActivationMode"/> as a JSON string, accepting the
/// legacy names <c>WakeWord</c> and <c>AlwaysOn</c> on input.
/// </summary>
public sealed class VoiceActivationModeJsonConverter : JsonConverter<VoiceActivationMode>
{
    /// <summary>
    /// Converts the current token into a <see cref="VoiceActivationMode"/>.
    /// </summary>
    /// <returns>
    /// The mode named by a string token (current or legacy name, case-sensitive), the
    /// mode whose underlying value equals a numeric token, or
    /// <see cref="VoiceActivationMode.Off"/> for anything unrecognized.
    /// </returns>
    /// <remarks>
    /// Unknown strings always fell back to <c>Off</c>; tokens of the wrong kind now
    /// take the same fallback instead of making <c>GetString</c> throw and failing
    /// deserialization of the whole enclosing document.
    /// </remarks>
    public override VoiceActivationMode Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        switch (reader.TokenType)
        {
            case JsonTokenType.String:
                return reader.GetString() switch
                {
                    "VoiceWake" or "WakeWord" => VoiceActivationMode.VoiceWake,
                    "TalkMode" or "AlwaysOn" => VoiceActivationMode.TalkMode,
                    _ => VoiceActivationMode.Off
                };

            case JsonTokenType.Number:
                // Numeric form is what System.Text.Json emits for enums without a converter.
                return reader.TryGetInt32(out var numeric) && Enum.IsDefined((VoiceActivationMode)numeric)
                    ? (VoiceActivationMode)numeric
                    : VoiceActivationMode.Off;

            default:
                // Step over a nested object/array so the reader ends on the value's last token.
                reader.TrySkip();
                return VoiceActivationMode.Off;
        }
    }

    /// <summary>Writes the member name; undefined values are written as <c>"Off"</c>.</summary>
    public override void Write(Utf8JsonWriter writer, VoiceActivationMode value, JsonSerializerOptions options)
    {
        writer.WriteStringValue(value switch
        {
            VoiceActivationMode.VoiceWake => "VoiceWake",
            VoiceActivationMode.TalkMode => "TalkMode",
            _ => "Off"
        });
    }
}
/// <summary>
/// Reads and writes <see cref="VoiceRuntimeState"/> as a JSON string, accepting the
/// legacy name <c>ListeningForWakeWord</c> on input.
/// </summary>
public sealed class VoiceRuntimeStateJsonConverter : JsonConverter<VoiceRuntimeState>
{
    /// <summary>
    /// Converts the current token into a <see cref="VoiceRuntimeState"/>.
    /// </summary>
    /// <returns>
    /// The state named by a string token (current or legacy name, case-sensitive), the
    /// state whose underlying value equals a numeric token, or
    /// <see cref="VoiceRuntimeState.Stopped"/> for anything unrecognized.
    /// </returns>
    /// <remarks>
    /// Unknown strings always fell back to <c>Stopped</c>; tokens of the wrong kind now
    /// take the same fallback instead of making <c>GetString</c> throw and failing
    /// deserialization of the whole enclosing document.
    /// </remarks>
    public override VoiceRuntimeState Read(ref Utf8JsonReader reader, Type typeToConvert, JsonSerializerOptions options)
    {
        switch (reader.TokenType)
        {
            case JsonTokenType.String:
                return reader.GetString() switch
                {
                    "ListeningForVoiceWake" or "ListeningForWakeWord" => VoiceRuntimeState.ListeningForVoiceWake,
                    "Stopped" => VoiceRuntimeState.Stopped,
                    "Paused" => VoiceRuntimeState.Paused,
                    "Idle" => VoiceRuntimeState.Idle,
                    "Arming" => VoiceRuntimeState.Arming,
                    "ListeningContinuously" => VoiceRuntimeState.ListeningContinuously,
                    "RecordingUtterance" => VoiceRuntimeState.RecordingUtterance,
                    "SubmittingAudio" => VoiceRuntimeState.SubmittingAudio,
                    "AwaitingResponse" => VoiceRuntimeState.AwaitingResponse,
                    "PlayingResponse" => VoiceRuntimeState.PlayingResponse,
                    "Error" => VoiceRuntimeState.Error,
                    _ => VoiceRuntimeState.Stopped
                };

            case JsonTokenType.Number:
                // Numeric form is what System.Text.Json emits for enums without a converter.
                return reader.TryGetInt32(out var numeric) && Enum.IsDefined((VoiceRuntimeState)numeric)
                    ? (VoiceRuntimeState)numeric
                    : VoiceRuntimeState.Stopped;

            default:
                // Step over a nested object/array so the reader ends on the value's last token.
                reader.TrySkip();
                return VoiceRuntimeState.Stopped;
        }
    }

    /// <summary>Writes the member name; undefined values are written as <c>"Stopped"</c>.</summary>
    public override void Write(Utf8JsonWriter writer, VoiceRuntimeState value, JsonSerializerOptions options)
    {
        writer.WriteStringValue(value switch
        {
            VoiceRuntimeState.ListeningForVoiceWake => "ListeningForVoiceWake",
            VoiceRuntimeState.Stopped => "Stopped",
            VoiceRuntimeState.Paused => "Paused",
            VoiceRuntimeState.Idle => "Idle",
            VoiceRuntimeState.Arming => "Arming",
            VoiceRuntimeState.ListeningContinuously => "ListeningContinuously",
            VoiceRuntimeState.RecordingUtterance => "RecordingUtterance",
            VoiceRuntimeState.SubmittingAudio => "SubmittingAudio",
            VoiceRuntimeState.AwaitingResponse => "AwaitingResponse",
            VoiceRuntimeState.PlayingResponse => "PlayingResponse",
            VoiceRuntimeState.Error => "Error",
            _ => "Stopped"
        });
    }
}

View File

@ -0,0 +1,161 @@
using System;
using System.Collections.Generic;
using System.Linq;
namespace OpenClaw.Shared;
/// <summary>
/// Lookup, mutation, migration and copy helpers for
/// <see cref="VoiceProviderConfigurationStore"/> and <see cref="VoiceProviderConfiguration"/>.
/// Provider ids and setting keys are matched with <see cref="StringComparison.OrdinalIgnoreCase"/>.
/// </summary>
public static class VoiceProviderConfigurationStoreExtensions
{
    /// <summary>
    /// Returns the configuration for <paramref name="providerId"/>, creating and
    /// appending an empty one when the store has none.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="store"/> is null.</exception>
    public static VoiceProviderConfiguration GetOrAddProvider(
        this VoiceProviderConfigurationStore store,
        string providerId)
    {
        ArgumentNullException.ThrowIfNull(store);

        var existing = store.Providers.FirstOrDefault(p =>
            string.Equals(p.ProviderId, providerId, StringComparison.OrdinalIgnoreCase));
        if (existing != null)
        {
            return existing;
        }

        var created = new VoiceProviderConfiguration
        {
            ProviderId = providerId
        };
        store.Providers.Add(created);
        return created;
    }

    /// <summary>
    /// Finds the configuration for <paramref name="providerId"/> without creating one.
    /// </summary>
    /// <returns>The matching configuration, or null when the id is blank or unknown.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="store"/> is null.</exception>
    public static VoiceProviderConfiguration? FindProvider(
        this VoiceProviderConfigurationStore store,
        string? providerId)
    {
        ArgumentNullException.ThrowIfNull(store);

        if (string.IsNullOrWhiteSpace(providerId))
        {
            return null;
        }

        return store.Providers.FirstOrDefault(p =>
            string.Equals(p.ProviderId, providerId, StringComparison.OrdinalIgnoreCase));
    }

    /// <summary>
    /// Reads one setting of one provider.
    /// </summary>
    /// <returns>The stored value, or null when the provider or the key is missing.</returns>
    public static string? GetValue(
        this VoiceProviderConfigurationStore store,
        string? providerId,
        string settingKey)
    {
        return store.FindProvider(providerId)?.GetValue(settingKey);
    }

    /// <summary>
    /// Reads one setting from a provider configuration.
    /// </summary>
    /// <returns>The stored value, or null when the key is blank or not present.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="configuration"/> is null.</exception>
    public static string? GetValue(this VoiceProviderConfiguration configuration, string settingKey)
    {
        ArgumentNullException.ThrowIfNull(configuration);

        if (string.IsNullOrWhiteSpace(settingKey))
        {
            return null;
        }

        // Linear scan rather than the indexer: the dictionary may have been created with a
        // case-sensitive comparer. A miss yields default(KeyValuePair), whose Value is null.
        return configuration.Values.FirstOrDefault(entry =>
            string.Equals(entry.Key, settingKey, StringComparison.OrdinalIgnoreCase)).Value;
    }

    /// <summary>
    /// Sets one setting of one provider, creating the provider entry if needed.
    /// A blank <paramref name="value"/> removes the setting.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="store"/> is null.</exception>
    public static void SetValue(
        this VoiceProviderConfigurationStore store,
        string providerId,
        string settingKey,
        string? value)
    {
        ArgumentNullException.ThrowIfNull(store);

        var provider = store.GetOrAddProvider(providerId);
        provider.SetValue(settingKey, value);
    }

    /// <summary>
    /// Sets one setting on a provider configuration. The value is trimmed; a blank value
    /// removes the setting; a blank key is ignored. An existing key that differs only by
    /// case is updated in place, keeping its original casing.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="configuration"/> is null.</exception>
    public static void SetValue(
        this VoiceProviderConfiguration configuration,
        string settingKey,
        string? value)
    {
        ArgumentNullException.ThrowIfNull(configuration);

        if (string.IsNullOrWhiteSpace(settingKey))
        {
            return;
        }

        var existingKey = configuration.Values.Keys.FirstOrDefault(key =>
            string.Equals(key, settingKey, StringComparison.OrdinalIgnoreCase));

        if (string.IsNullOrWhiteSpace(value))
        {
            if (existingKey != null)
            {
                configuration.Values.Remove(existingKey);
            }

            return;
        }

        configuration.Values[existingKey ?? settingKey] = value.Trim();
    }

    /// <summary>
    /// Copies the non-blank values of a legacy credential block into the store.
    /// </summary>
    /// <param name="store">Destination store.</param>
    /// <param name="legacy">Legacy credentials; null means there is nothing to migrate.</param>
    /// <remarks>
    /// Blank legacy fields are skipped. They must not be forwarded to
    /// <c>SetValue</c>, because a blank value there means "remove": since the legacy
    /// MiniMax model and voice id always carry defaults, forwarding a null
    /// <c>MiniMaxApiKey</c> would delete an API key already present in the store.
    /// A provider entry is only created once it has at least one value to hold.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="store"/> is null.</exception>
    public static void MigrateLegacyCredentials(
        this VoiceProviderConfigurationStore store,
        VoiceProviderCredentials? legacy)
    {
        ArgumentNullException.ThrowIfNull(store);

        if (legacy == null)
        {
            return;
        }

        CopyLegacyValue(store, VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey, legacy.MiniMaxApiKey);
        CopyLegacyValue(store, VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model, legacy.MiniMaxModel);
        CopyLegacyValue(store, VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId, legacy.MiniMaxVoiceId);

        CopyLegacyValue(store, VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey, legacy.ElevenLabsApiKey);
        CopyLegacyValue(store, VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model, legacy.ElevenLabsModel);
        CopyLegacyValue(store, VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId, legacy.ElevenLabsVoiceId);
    }

    /// <summary>
    /// Creates a copy of the store with new provider objects and new value dictionaries.
    /// The copied dictionaries use <see cref="StringComparer.OrdinalIgnoreCase"/>.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="source"/> is null.</exception>
    public static VoiceProviderConfigurationStore Clone(this VoiceProviderConfigurationStore source)
    {
        ArgumentNullException.ThrowIfNull(source);

        return new VoiceProviderConfigurationStore
        {
            Providers = source.Providers
                .Select(provider => new VoiceProviderConfiguration
                {
                    ProviderId = provider.ProviderId,
                    Values = new Dictionary<string, string>(provider.Values, StringComparer.OrdinalIgnoreCase)
                })
                .ToList()
        };
    }

    // Writes a legacy value into the store only when it actually holds something.
    private static void CopyLegacyValue(
        VoiceProviderConfigurationStore store,
        string providerId,
        string settingKey,
        string? legacyValue)
    {
        if (!string.IsNullOrWhiteSpace(legacyValue))
        {
            store.SetValue(providerId, settingKey, legacyValue);
        }
    }
}

View File

@ -0,0 +1,174 @@
using System;
using System.Drawing;
using System.IO;
using System.Runtime.InteropServices;
namespace OpenClawTray.Helpers;
/// <summary>
/// Visual states of the tray icon understood by <c>VoiceTrayIconHelper</c>.
/// </summary>
public enum VoiceTrayIconState
{
// Uses the unmodified base application icon.
Off,
// Base icon with a headphones overlay.
Armed,
// Base icon with headphones plus wave arcs.
Listening,
// Base icon with a microphone overlay.
Speaking
}
/// <summary>
/// Produces tray icon files for each <see cref="VoiceTrayIconState"/> by drawing an
/// overlay on top of the base application icon and caching the result on disk under
/// <c>%LocalAppData%\OpenClawTray\GeneratedIcons</c>.
/// </summary>
public static class VoiceTrayIconHelper
{
    private static readonly string GeneratedIconsPath = Path.Combine(
        Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
        "OpenClawTray",
        "GeneratedIcons");

    // Paths of icons already generated in this process, one per overlay state.
    private static string? _voiceArmedIconPath;
    private static string? _voiceListeningIconPath;
    private static string? _voiceSpeakingIconPath;

    /// <summary>Returns the path of <c>openclaw.ico</c> inside the resolved assets folder.</summary>
    public static string GetBaseAppIconPath()
    {
        return Path.Combine(ResolveAssetsPath(), "openclaw.ico");
    }

    /// <summary>
    /// Returns the path of the icon file for <paramref name="state"/>, generating it on
    /// first use. <see cref="VoiceTrayIconState.Off"/> (and any unknown value) maps to
    /// the base application icon.
    /// </summary>
    public static string GetVoiceTrayIconPath(VoiceTrayIconState state)
    {
        return state switch
        {
            VoiceTrayIconState.Armed => GetOrCreateVoiceIconPath(ref _voiceArmedIconPath, VoiceTrayIconState.Armed),
            VoiceTrayIconState.Listening => GetOrCreateVoiceIconPath(ref _voiceListeningIconPath, VoiceTrayIconState.Listening),
            VoiceTrayIconState.Speaking => GetOrCreateVoiceIconPath(ref _voiceSpeakingIconPath, VoiceTrayIconState.Speaking),
            _ => GetBaseAppIconPath()
        };
    }

    // Returns the cached path while the file still exists; otherwise (re)generates
    // "voice-<state>.ico" in the generated-icons folder and remembers its path.
    private static string GetOrCreateVoiceIconPath(ref string? cachedPath, VoiceTrayIconState state)
    {
        if (!string.IsNullOrWhiteSpace(cachedPath) && File.Exists(cachedPath))
        {
            return cachedPath;
        }

        Directory.CreateDirectory(GeneratedIconsPath);
        var outputPath = Path.Combine(GeneratedIconsPath, $"voice-{state.ToString().ToLowerInvariant()}.ico");

        using var bitmap = CreateVoiceTrayBitmap(state);
        using var icon = CreateIcon(bitmap);
        using var stream = File.Create(outputPath);
        icon.Save(stream);

        cachedPath = outputPath;
        return outputPath;
    }

    // Renders the 32x32 base icon and draws the overlay for the given state on top.
    // The caller owns the returned bitmap.
    private static Bitmap CreateVoiceTrayBitmap(VoiceTrayIconState state)
    {
        const int size = 32;
        var bitmap = new Bitmap(size, size);
        try
        {
            using var graphics = Graphics.FromImage(bitmap);
            graphics.Clear(Color.Transparent);
            graphics.SmoothingMode = System.Drawing.Drawing2D.SmoothingMode.AntiAlias;
            graphics.InterpolationMode = System.Drawing.Drawing2D.InterpolationMode.HighQualityBicubic;

            using (var baseIcon = new Icon(GetBaseAppIconPath(), size, size))
            using (var baseBitmap = baseIcon.ToBitmap())
            {
                graphics.DrawImage(baseBitmap, 0, 0, size, size);
            }

            switch (state)
            {
                case VoiceTrayIconState.Armed:
                    DrawHeadphones(graphics);
                    break;
                case VoiceTrayIconState.Listening:
                    DrawHeadphones(graphics);
                    DrawHeadphoneWaves(graphics);
                    break;
                case VoiceTrayIconState.Speaking:
                    DrawMicrophone(graphics);
                    break;
            }

            return bitmap;
        }
        catch
        {
            // Ownership never reached the caller (e.g. the base icon is missing), so release it here.
            bitmap.Dispose();
            throw;
        }
    }

    // Headband arc (with a translucent white halo behind it) plus two rounded ear cups.
    private static void DrawHeadphones(Graphics graphics)
    {
        using var shadowPen = new Pen(Color.FromArgb(96, 255, 255, 255), 4f);
        using var bandPen = new Pen(Color.FromArgb(42, 48, 58), 3f);
        using var earBrush = new SolidBrush(Color.FromArgb(42, 48, 58));
        // GraphicsPath wraps a native GDI+ object and must be disposed like the pens and brushes.
        using var leftEar = CreateRoundedRectanglePath(4, 12, 5, 10, 3);
        using var rightEar = CreateRoundedRectanglePath(23, 12, 5, 10, 3);

        graphics.DrawArc(shadowPen, 6, 3, 20, 16, 180, 180);
        graphics.DrawArc(bandPen, 6, 3, 20, 16, 180, 180);
        graphics.FillPath(earBrush, leftEar);
        graphics.FillPath(earBrush, rightEar);
    }

    // Blue microphone glyph in the lower-right corner.
    private static void DrawMicrophone(Graphics graphics)
    {
        using var brush = new SolidBrush(Color.FromArgb(33, 150, 243));
        using var pen = new Pen(Color.FromArgb(33, 150, 243), 2f);
        using var capsule = CreateRoundedRectanglePath(22, 17, 6, 9, 3);

        graphics.FillPath(brush, capsule);
        graphics.FillRectangle(brush, 24, 25, 2, 4);
        graphics.DrawArc(pen, 21, 27, 8, 5, 0, 180);
        graphics.DrawLine(pen, 20, 21, 15, 19);
    }

    // Green arcs on the left and right edges, drawn next to the ear cups.
    private static void DrawHeadphoneWaves(Graphics graphics)
    {
        using var wavePen = new Pen(Color.FromArgb(76, 175, 80), 2f);
        using var accentPen = new Pen(Color.FromArgb(76, 175, 80), 1.5f);

        graphics.DrawArc(wavePen, 0, 12, 8, 8, 270, 180);
        graphics.DrawArc(accentPen, 2, 14, 4, 4, 270, 180);
        graphics.DrawArc(wavePen, 24, 12, 8, 8, 90, 180);
        graphics.DrawArc(accentPen, 26, 14, 4, 4, 90, 180);
    }

    // Converts the bitmap into an Icon that owns its own handle. The HICON returned by
    // GetHicon is always destroyed, even when cloning fails.
    private static Icon CreateIcon(Bitmap bitmap)
    {
        var handle = bitmap.GetHicon();
        try
        {
            // FromHandle only wraps the handle; Clone produces the independent copy we return.
            using var wrapper = Icon.FromHandle(handle);
            return (Icon)wrapper.Clone();
        }
        finally
        {
            DestroyIcon(handle);
        }
    }

    // Builds a closed path of four corner arcs; the caller must dispose it.
    // Note: radius is passed as the arc's width/height, i.e. it acts as the corner diameter.
    private static System.Drawing.Drawing2D.GraphicsPath CreateRoundedRectanglePath(int x, int y, int width, int height, int radius)
    {
        var path = new System.Drawing.Drawing2D.GraphicsPath();
        path.AddArc(x, y, radius, radius, 180, 90);
        path.AddArc(x + width - radius, y, radius, radius, 270, 90);
        path.AddArc(x + width - radius, y + height - radius, radius, radius, 0, 90);
        path.AddArc(x, y + height - radius, radius, radius, 90, 90);
        path.CloseFigure();
        return path;
    }

    // Prefers the "Assets" folder next to the binaries when it contains openclaw.ico;
    // otherwise walks up the directory tree looking for src/OpenClaw.Tray.WinUI/Assets,
    // and finally falls back to the bundled path even though the icon is not there.
    private static string ResolveAssetsPath()
    {
        var bundledPath = Path.Combine(AppContext.BaseDirectory, "Assets");
        if (File.Exists(Path.Combine(bundledPath, "openclaw.ico")))
        {
            return bundledPath;
        }

        var current = new DirectoryInfo(AppContext.BaseDirectory);
        while (current != null)
        {
            var sourcePath = Path.Combine(current.FullName, "src", "OpenClaw.Tray.WinUI", "Assets");
            if (Directory.Exists(sourcePath))
            {
                return sourcePath;
            }

            current = current.Parent;
        }

        return bundledPath;
    }

    [DllImport("user32.dll", CharSet = CharSet.Auto)]
    private static extern bool DestroyIcon(IntPtr handle);
}

View File

@ -0,0 +1,19 @@
<Project Sdk="Microsoft.NET.Sdk">
<!-- Targets .NET 10 with the Windows 10 (10.0.19041.0) platform; sources default to the OpenClawTray root namespace. -->
<PropertyGroup>
<TargetFramework>net10.0-windows10.0.19041.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<RootNamespace>OpenClawTray</RootNamespace>
</PropertyGroup>
<!-- Depends on the OpenClaw.Shared project, which lives in a sibling directory. -->
<ItemGroup>
<ProjectReference Include="..\OpenClaw.Shared\OpenClaw.Shared.csproj" />
</ItemGroup>
<!-- NuGet dependencies, pinned to exact versions. -->
<ItemGroup>
<PackageReference Include="Microsoft.Windows.SDK.BuildTools" Version="10.0.26100.4654" />
<PackageReference Include="System.Drawing.Common" Version="10.0.2" />
</ItemGroup>
</Project>

View File

@ -0,0 +1,48 @@
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Pure arithmetic helpers for audio capture: quantum sizing and peak-level detection.
/// </summary>
public static class VoiceCaptureMath
{
    private const float DefaultSignalThreshold = 0.015f;
    private const int FallbackSampleRateHz = 16000;
    private const int FallbackChunkMs = 80;
    private const long MinimumSamplesPerQuantum = 128;

    /// <summary>
    /// Computes how many samples one capture chunk should contain.
    /// </summary>
    /// <param name="sampleRateHz">Sample rate; non-positive values fall back to 16000.</param>
    /// <param name="chunkMs">Chunk length in milliseconds; non-positive values fall back to 80.</param>
    /// <returns>
    /// <c>sampleRateHz * chunkMs / 1000</c> (integer division), never less than 128 and
    /// saturated at <see cref="uint.MaxValue"/>.
    /// </returns>
    public static uint ResolveDesiredSamplesPerQuantum(int sampleRateHz, int chunkMs)
    {
        if (sampleRateHz <= 0)
        {
            sampleRateHz = FallbackSampleRateHz;
        }

        if (chunkMs <= 0)
        {
            chunkMs = FallbackChunkMs;
        }

        // Multiply in 64 bits: the 32-bit product wraps for large inputs
        // (e.g. 48000 * 100000) and would silently yield a wrong size.
        var desired = (long)sampleRateHz * chunkMs / 1000;
        var bounded = Math.Min(Math.Max(desired, MinimumSamplesPerQuantum), (long)uint.MaxValue);
        return (uint)bounded;
    }

    /// <summary>
    /// Tells whether a peak level reaches the audibility threshold (inclusive).
    /// </summary>
    /// <param name="peakLevel">Peak level, e.g. from <see cref="ComputePeakLevel"/>.</param>
    /// <param name="threshold">Minimum level counted as audible; defaults to 0.015.</param>
    public static bool HasAudibleSignal(float peakLevel, float threshold = DefaultSignalThreshold)
    {
        return peakLevel >= threshold;
    }

    /// <summary>
    /// Returns the largest absolute sample value in a buffer of 32-bit floats stored in
    /// the platform's byte order.
    /// </summary>
    /// <param name="data">Raw sample bytes; trailing bytes that do not form a whole float are ignored.</param>
    /// <returns>
    /// The peak magnitude, or 0 when the buffer holds no complete sample or the peak is
    /// not finite. NaN samples never win the comparison and are effectively skipped; a
    /// single infinite sample makes the whole buffer report 0.
    /// </returns>
    public static float ComputePeakLevel(byte[] data)
    {
        if (data.Length < sizeof(float))
        {
            return 0f;
        }

        float peak = 0f;
        var alignedLength = data.Length - (data.Length % sizeof(float));
        for (var offset = 0; offset < alignedLength; offset += sizeof(float))
        {
            var sample = Math.Abs(BitConverter.ToSingle(data, offset));
            if (sample > peak)
            {
                peak = sample;
            }
        }

        return float.IsFinite(peak) ? peak : 0f;
    }
}

View File

@ -0,0 +1,43 @@
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Abstraction used by <c>VoiceChatCoordinator</c> to queue work.
/// NOTE(review): the name suggests callbacks run on the UI thread; the implementation is not visible here.
/// </summary>
public interface IUiDispatcher
{
// Queues the callback. NOTE(review): the bool presumably reports whether queuing succeeded;
// VoiceChatCoordinator ignores the result.
bool TryEnqueue(Action callback);
}
/// <summary>
/// Event surface of the voice runtime; <c>VoiceChatCoordinator</c> subscribes to both
/// events in its constructor and unsubscribes in <c>Dispose</c>.
/// </summary>
public interface IVoiceRuntime
{
// Raised with a conversation turn (direction, message, session key, mode).
event EventHandler<VoiceConversationTurnEventArgs>? ConversationTurnAvailable;
// Raised with draft transcript text, or with a request to clear it.
event EventHandler<VoiceTranscriptDraftEventArgs>? TranscriptDraftUpdated;
}
/// <summary>
/// Read/write access to voice settings, audio devices and provider configuration.
/// </summary>
public interface IVoiceConfigurationApi
{
Task<VoiceSettings> GetSettingsAsync();
// Returns a VoiceSettings; NOTE(review): presumably the settings in effect after the update — confirm in the implementation.
Task<VoiceSettings> UpdateSettingsAsync(VoiceSettingsUpdateArgs update);
Task<VoiceAudioDeviceInfo[]> ListDevicesAsync();
// The provider members below are synchronous, unlike the settings and device calls.
VoiceProviderCatalog GetProviderCatalog();
VoiceProviderConfigurationStore GetProviderConfiguration();
void SetProviderConfiguration(VoiceProviderConfigurationStore configurationStore);
}
/// <summary>
/// Control surface of the voice runtime. Every operation returns a
/// <see cref="VoiceStatusInfo"/>.
/// </summary>
public interface IVoiceRuntimeControlApi
{
// Synchronous access to the current status.
VoiceStatusInfo CurrentStatus { get; }
Task<VoiceStatusInfo> GetStatusAsync();
// Start and Stop require an argument object; Pause, Resume and Skip accept null.
Task<VoiceStatusInfo> StartAsync(VoiceStartArgs args);
Task<VoiceStatusInfo> StopAsync(VoiceStopArgs args);
Task<VoiceStatusInfo> PauseAsync(VoicePauseArgs? args = null);
Task<VoiceStatusInfo> ResumeAsync(VoiceResumeArgs? args = null);
Task<VoiceStatusInfo> SkipCurrentReplyAsync(VoiceSkipArgs? args = null);
// NOTE(review): presumably toggles between paused and running — confirm in the implementation.
Task<VoiceStatusInfo> ToggleQuickPauseAsync();
}
/// <summary>
/// A chat window that <c>VoiceChatCoordinator</c> can push voice transcript drafts and
/// conversation turns into.
/// </summary>
public interface IVoiceChatWindow
{
// The coordinator skips windows that report true here when forwarding live events.
bool IsClosed { get; }
// The coordinator passes the current draft text, with clear set when the draft was cleared or is blank.
Task UpdateVoiceTranscriptDraftAsync(string text, bool clear);
Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args);
}

View File

@ -0,0 +1,154 @@
using System;
using System.Collections.Generic;
using System.Threading.Tasks;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Fans voice runtime events out to the attached chat windows. Keeps the latest
/// transcript draft and the most recent conversation turns so that a window attached
/// later can be brought up to date.
/// </summary>
public sealed class VoiceChatCoordinator : IDisposable
{
    private const int MaxBufferedConversationTurns = 8;

    private readonly IVoiceRuntime _voiceService;
    private readonly IUiDispatcher _dispatcher;

    // Guards _windows, _bufferedConversationTurns and _voiceTranscriptDraftText.
    private readonly object _gate = new();
    private readonly List<IVoiceChatWindow> _windows = [];
    private readonly List<VoiceConversationTurnEventArgs> _bufferedConversationTurns = [];
    private string _voiceTranscriptDraftText = string.Empty;

    // Volatile: written by Dispose and read inside dispatcher callbacks.
    private volatile bool _disposed;

    /// <summary>
    /// Raised from the dispatcher callback after a turn has been buffered and forwarded
    /// to the attached windows. Not raised once the coordinator is disposed.
    /// </summary>
    public event EventHandler<VoiceConversationTurnEventArgs>? ConversationTurnAvailable;

    /// <summary>
    /// Creates the coordinator and subscribes to both runtime events.
    /// </summary>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public VoiceChatCoordinator(
        IVoiceRuntime voiceService,
        IUiDispatcher dispatcher)
    {
        ArgumentNullException.ThrowIfNull(voiceService);
        ArgumentNullException.ThrowIfNull(dispatcher);

        _voiceService = voiceService;
        _dispatcher = dispatcher;
        _voiceService.ConversationTurnAvailable += OnVoiceConversationTurnAvailable;
        _voiceService.TranscriptDraftUpdated += OnVoiceTranscriptDraftUpdated;
    }

    /// <summary>
    /// Registers a window and replays the current draft and the buffered turns into it.
    /// Attaching an already attached window is a no-op.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="window"/> is null.</exception>
    public void AttachWindow(IVoiceChatWindow window)
    {
        ArgumentNullException.ThrowIfNull(window);

        string draftText;
        List<VoiceConversationTurnEventArgs> bufferedTurns;

        // Register and snapshot in one critical section so the replay matches exactly
        // the state at the moment the window started receiving live events.
        lock (_gate)
        {
            if (_windows.Contains(window))
            {
                return;
            }

            _windows.Add(window);
            draftText = _voiceTranscriptDraftText;
            bufferedTurns = [.. _bufferedConversationTurns];
        }

        // Window calls are fire-and-forget; the returned tasks are intentionally not awaited.
        _ = window.UpdateVoiceTranscriptDraftAsync(
            draftText,
            clear: string.IsNullOrWhiteSpace(draftText));

        foreach (var turn in bufferedTurns)
        {
            _ = window.AppendVoiceConversationTurnAsync(turn);
        }
    }

    /// <summary>
    /// Unregisters one window, or every window when <paramref name="window"/> is null.
    /// </summary>
    public void DetachWindow(IVoiceChatWindow? window)
    {
        lock (_gate)
        {
            if (_windows.Count == 0)
            {
                return;
            }

            if (window == null)
            {
                _windows.Clear();
                return;
            }

            _windows.Remove(window);
        }
    }

    /// <summary>
    /// Detaches all windows and unsubscribes from the runtime. Safe to call repeatedly.
    /// </summary>
    public void Dispose()
    {
        if (_disposed)
        {
            return;
        }

        _disposed = true;
        DetachWindow(null);
        _voiceService.ConversationTurnAvailable -= OnVoiceConversationTurnAvailable;
        _voiceService.TranscriptDraftUpdated -= OnVoiceTranscriptDraftUpdated;
    }

    // Buffers a copy of the turn (bounded to the newest MaxBufferedConversationTurns),
    // forwards the turn to every open window, then raises ConversationTurnAvailable.
    private void OnVoiceConversationTurnAvailable(object? sender, VoiceConversationTurnEventArgs args)
    {
        _dispatcher.TryEnqueue(() =>
        {
            // A callback queued before Dispose can still run afterwards; it must not
            // touch state or notify subscribers of a disposed coordinator.
            if (_disposed)
            {
                return;
            }

            List<IVoiceChatWindow> windows;
            lock (_gate)
            {
                _bufferedConversationTurns.Add(CloneTurn(args));
                if (_bufferedConversationTurns.Count > MaxBufferedConversationTurns)
                {
                    _bufferedConversationTurns.RemoveAt(0);
                }

                windows = [.. _windows];
            }

            foreach (var window in windows)
            {
                if (!window.IsClosed)
                {
                    _ = window.AppendVoiceConversationTurnAsync(args);
                }
            }

            ConversationTurnAvailable?.Invoke(this, args);
        });
    }

    // Stores the new draft text (empty when cleared) and pushes it to every open window.
    private void OnVoiceTranscriptDraftUpdated(object? sender, VoiceTranscriptDraftEventArgs args)
    {
        _dispatcher.TryEnqueue(() =>
        {
            if (_disposed)
            {
                return;
            }

            var draftText = args.Clear ? string.Empty : (args.Text ?? string.Empty);

            List<IVoiceChatWindow> windows;
            lock (_gate)
            {
                _voiceTranscriptDraftText = draftText;
                windows = [.. _windows];
            }

            foreach (var window in windows)
            {
                if (!window.IsClosed)
                {
                    _ = window.UpdateVoiceTranscriptDraftAsync(draftText, args.Clear);
                }
            }
        });
    }

    // Copies the turn so later changes to the original event args do not affect the buffer.
    private static VoiceConversationTurnEventArgs CloneTurn(VoiceConversationTurnEventArgs args)
    {
        return new VoiceConversationTurnEventArgs
        {
            Direction = args.Direction,
            Message = args.Message,
            SessionKey = args.SessionKey,
            Mode = args.Mode
        };
    }
}

View File

@ -0,0 +1,592 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
using System.Net.Http;
using System.Net.Http.Headers;
using System.Net.WebSockets;
using System.Runtime.InteropServices.WindowsRuntime;
using System.Text;
using System.Text.Json;
using System.Threading;
using System.Threading.Tasks;
using OpenClaw.Shared;
using Windows.Storage.Streams;
namespace OpenClawTray.Services.Voice;
public sealed class VoiceCloudTextToSpeechClient
{
private static readonly HttpClient s_httpClient = CreateHttpClient();
/// <summary>
/// Synthesizes speech for <paramref name="text"/> through a cloud provider. Providers
/// with a WebSocket contract are handled over WebSocket; all others must expose an HTTP
/// contract.
/// </summary>
/// <param name="text">Text to speak; must not be null or whitespace.</param>
/// <param name="provider">Provider entry describing the endpoint contract.</param>
/// <param name="configurationStore">Store holding the provider's configured values.</param>
/// <param name="logger">Optional logger that receives latency information.</param>
/// <param name="cancellationToken">Cancels the request and response reads.</param>
/// <returns>The synthesized audio result.</returns>
/// <exception cref="ArgumentException"><paramref name="text"/> is null or whitespace.</exception>
/// <exception cref="ArgumentNullException"><paramref name="provider"/> or <paramref name="configurationStore"/> is null.</exception>
/// <exception cref="InvalidOperationException">
/// The provider exposes no HTTP contract, or the HTTP response has a non-success status.
/// </exception>
public async Task<VoiceCloudTextToSpeechResult> SynthesizeAsync(
    string text,
    VoiceProviderOption provider,
    VoiceProviderConfigurationStore configurationStore,
    IOpenClawLogger? logger = null,
    CancellationToken cancellationToken = default)
{
    ArgumentException.ThrowIfNullOrWhiteSpace(text);
    ArgumentNullException.ThrowIfNull(provider);
    ArgumentNullException.ThrowIfNull(configurationStore);

    // A WebSocket contract takes precedence over an HTTP one.
    if (provider.TextToSpeechWebSocket != null)
    {
        return await SynthesizeViaWebSocketAsync(text, provider, configurationStore, logger, cancellationToken);
    }

    var httpContract = provider.TextToSpeechHttp;
    if (httpContract == null)
    {
        throw new InvalidOperationException($"TTS provider '{provider.Name}' does not expose an HTTP contract.");
    }

    var settings = configurationStore.FindProvider(provider.Id);
    var placeholders = BuildTemplateValues(text, provider, settings, httpContract);
    var requestUrl = ApplyUrlTemplate(httpContract.EndpointTemplate, placeholders);

    using var httpRequest = new HttpRequestMessage(ParseHttpMethod(httpContract.HttpMethod), requestUrl);
    ApplyAuthenticationHeader(httpRequest, httpContract, placeholders);

    // A blank body template means the request is sent without content.
    if (!string.IsNullOrWhiteSpace(httpContract.RequestBodyTemplate))
    {
        var body = ApplyJsonTemplate(httpContract.RequestBodyTemplate, placeholders);
        var mediaType = string.IsNullOrWhiteSpace(httpContract.RequestContentType)
            ? "application/json"
            : httpContract.RequestContentType;
        httpRequest.Content = new StringContent(body, Encoding.UTF8, mediaType);
    }

    var timer = Stopwatch.StartNew();
    using var httpResponse = await s_httpClient.SendAsync(httpRequest, HttpCompletionOption.ResponseHeadersRead, cancellationToken);
    var headersMs = timer.ElapsedMilliseconds;

    if (!httpResponse.IsSuccessStatusCode)
    {
        throw new InvalidOperationException(
            $"{provider.Name} TTS request failed: {(int)httpResponse.StatusCode} {httpResponse.ReasonPhrase}");
    }

    var isBinaryResponse = string.Equals(
        httpContract.ResponseAudioMode,
        VoiceTextToSpeechResponseModes.Binary,
        StringComparison.OrdinalIgnoreCase);

    if (isBinaryResponse)
    {
        // The response body is the audio itself; hand the stream over directly.
        await using var audioStream = await httpResponse.Content.ReadAsStreamAsync(cancellationToken);
        var binaryResult = await CreateResultAsync(audioStream, httpContract.OutputContentType);
        logger?.Info($"{provider.Name} TTS latency: headers={headersMs}ms total={timer.ElapsedMilliseconds}ms (binary)");
        return binaryResult;
    }

    // Otherwise the audio is an encoded string inside a JSON envelope.
    var payloadText = await httpResponse.Content.ReadAsStringAsync(cancellationToken);
    using var payload = JsonDocument.Parse(payloadText);
    ValidateResponseStatus(provider, httpContract, payload.RootElement);

    var encodedAudio = GetRequiredJsonString(payload.RootElement, httpContract.ResponseAudioJsonPath);
    var decodedAudio = DecodeAudioBytes(httpContract.ResponseAudioMode, encodedAudio, provider.Name);
    var decodedResult = await CreateResultAsync(decodedAudio, httpContract.OutputContentType);
    logger?.Info($"{provider.Name} TTS latency: headers={headersMs}ms total={timer.ElapsedMilliseconds}ms ({httpContract.ResponseAudioMode})");
    return decodedResult;
}
/// <summary>
/// Synthesizes <paramref name="text"/> through the provider's WebSocket contract and
/// returns the concatenated audio.
/// </summary>
/// <remarks>
/// Sequence: connect, optionally await the connect-success event, send the start message,
/// optionally await the start-success event, send the continue message (and the finish
/// message when the contract defines one), then read messages until one carries the
/// final flag. The whole exchange is bounded by a 30 second timeout linked to
/// <paramref name="cancellationToken"/>.
/// </remarks>
/// <exception cref="InvalidOperationException">
/// The provider has no WebSocket contract, reported a failure or unexpected event, or
/// returned no audio data.
/// </exception>
private static async Task<VoiceCloudTextToSpeechResult> SynthesizeViaWebSocketAsync(
    string text,
    VoiceProviderOption provider,
    VoiceProviderConfigurationStore configurationStore,
    IOpenClawLogger? logger,
    CancellationToken cancellationToken)
{
    var contract = provider.TextToSpeechWebSocket
        ?? throw new InvalidOperationException($"TTS provider '{provider.Name}' does not expose a WebSocket contract.");
    var providerConfiguration = configurationStore.FindProvider(provider.Id);
    var templateValues = BuildTemplateValues(text, provider, providerConfiguration, contract.ApiKeySettingKey);
    var endpoint = ApplyUrlTemplate(contract.EndpointTemplate, templateValues);

    using var socket = new ClientWebSocket();
    ApplyAuthenticationHeader(socket.Options, contract, templateValues);

    // One deadline for the whole exchange, still honoring the caller's token.
    using var cts = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
    cts.CancelAfter(TimeSpan.FromSeconds(30));
    var ct = cts.Token;

    var stopwatch = Stopwatch.StartNew();
    await socket.ConnectAsync(new Uri(endpoint), ct);
    if (!string.IsNullOrWhiteSpace(contract.ConnectSuccessEventName))
    {
        var connectedMessage = await ReceiveJsonMessageAsync(socket, ct);
        ValidateWebSocketEvent(provider.Name, contract.ConnectSuccessEventName, connectedMessage, contract);
    }

    var startMessage = ApplyJsonTemplate(contract.StartMessageTemplate, templateValues);
    await SendTextMessageAsync(socket, startMessage, ct);
    if (!string.IsNullOrWhiteSpace(contract.StartSuccessEventName))
    {
        var startedMessage = await ReceiveJsonMessageAsync(socket, ct);
        ValidateWebSocketEvent(provider.Name, contract.StartSuccessEventName, startedMessage, contract);
    }

    var continueMessage = ApplyJsonTemplate(contract.ContinueMessageTemplate, templateValues);
    await SendTextMessageAsync(socket, continueMessage, ct);
    if (!string.IsNullOrWhiteSpace(contract.FinishMessageTemplate))
    {
        await SendTextMessageAsync(socket, ApplyJsonTemplate(contract.FinishMessageTemplate, templateValues), ct);
    }

    var audioBytes = new List<byte>();
    long? firstChunkMs = null;
    while (true)
    {
        var message = await ReceiveJsonMessageAsync(socket, ct);
        EnsureWebSocketNotFailed(provider.Name, contract, message);
        if (TryGetJsonString(message, contract.ResponseAudioJsonPath, out var audioChunk) &&
            !string.IsNullOrWhiteSpace(audioChunk))
        {
            // Record time-to-first-audio once, for the latency log below.
            firstChunkMs ??= stopwatch.ElapsedMilliseconds;
            audioBytes.AddRange(DecodeAudioBytes(contract.ResponseAudioMode, audioChunk, provider.Name));
        }

        if (IsFinalWebSocketMessage(message, contract))
        {
            break;
        }
    }

    try
    {
        await socket.CloseAsync(WebSocketCloseStatus.NormalClosure, "done", ct);
    }
    catch (Exception ex)
    {
        // Best-effort close: the audio has already been received, so a failed close
        // handshake must not fail the request. Leave a trace instead of swallowing silently.
        logger?.Info($"{provider.Name} TTS WebSocket close failed (ignored): {ex.Message}");
    }

    if (audioBytes.Count == 0)
    {
        throw new InvalidOperationException($"{provider.Name} TTS did not return any audio data.");
    }

    var result = await CreateResultAsync(audioBytes.ToArray(), contract.OutputContentType);
    logger?.Info($"{provider.Name} TTS latency: firstChunk={(firstChunkMs?.ToString() ?? "n/a")}ms total={stopwatch.ElapsedMilliseconds}ms (websocket)");
    return result;
}
/// <summary>
/// Builds the template values for an HTTP contract by forwarding the contract's
/// API key setting key to the key-based overload.
/// </summary>
private static Dictionary<string, TemplateValue> BuildTemplateValues(
    string text,
    VoiceProviderOption provider,
    VoiceProviderConfiguration? providerConfiguration,
    VoiceTextToSpeechHttpContract contract)
    => BuildTemplateValues(text, provider, providerConfiguration, contract.ApiKeySettingKey);
/// <summary>
/// Collects the placeholder values available to a provider's templates: the spoken
/// text (plain, and with a guaranteed trailing space) plus one entry per provider
/// setting that has a configured or default value.
/// </summary>
/// <param name="text">Text to synthesize.</param>
/// <param name="provider">Provider whose setting definitions are enumerated.</param>
/// <param name="providerConfiguration">User configuration for the provider, if any.</param>
/// <param name="apiKeySettingKey">Setting key the contract uses for its API key.</param>
/// <returns>A case-insensitive dictionary keyed by placeholder name.</returns>
/// <exception cref="InvalidOperationException">
/// A secret/API-key setting or a required setting has no effective value.
/// </exception>
private static Dictionary<string, TemplateValue> BuildTemplateValues(
    string text,
    VoiceProviderOption provider,
    VoiceProviderConfiguration? providerConfiguration,
    string apiKeySettingKey)
{
    var templateValues = new Dictionary<string, TemplateValue>(StringComparer.OrdinalIgnoreCase);
    templateValues["text"] = TemplateValue.FromString(text);
    templateValues["textWithTrailingSpace"] = TemplateValue.FromString(
        text.EndsWith(' ') ? text : text + " ");

    foreach (var definition in provider.Settings)
    {
        // A configured value wins (trimmed); otherwise fall back to the catalog default.
        var configured = providerConfiguration?.GetValue(definition.Key);
        var resolved = string.IsNullOrWhiteSpace(configured)
            ? definition.DefaultValue
            : configured.Trim();

        if (!string.IsNullOrWhiteSpace(resolved))
        {
            // JSON-valued settings may reference the values collected so far.
            templateValues[definition.Key] = definition.JsonValue
                ? TemplateValue.FromJson(resolved, provider.Name, definition.Label, templateValues)
                : TemplateValue.FromString(resolved);
            continue;
        }

        // No effective value: secrets/API keys and required settings are fatal,
        // anything else is simply left out of the dictionary.
        if (definition.Secret || string.Equals(definition.Key, apiKeySettingKey, StringComparison.OrdinalIgnoreCase))
        {
            throw new InvalidOperationException(
                $"{provider.Name} API key is not configured. Open Settings and complete the {provider.Name} voice provider fields.");
        }

        if (definition.Required)
        {
            throw new InvalidOperationException(
                $"{provider.Name} setting '{definition.Label}' is required. Open Settings and complete the {provider.Name} voice provider fields.");
        }
    }

    return templateValues;
}
/// <summary>
/// Replaces every <c>{{key}}</c> placeholder in <paramref name="template"/> with the
/// URL-escaped value of the matching entry. Placeholder matching is ordinal
/// (case-sensitive); placeholders without a matching entry are left untouched.
/// </summary>
private static string ApplyUrlTemplate(string template, IReadOnlyDictionary<string, TemplateValue> values)
{
    var url = template;
    foreach (var (key, templateValue) in values)
    {
        var placeholder = "{{" + key + "}}";
        var escaped = Uri.EscapeDataString(templateValue.Value);
        url = url.Replace(placeholder, escaped, StringComparison.Ordinal);
    }

    return url;
}
/// <summary>
/// Replaces every <c>{{key}}</c> placeholder in a JSON template with the matching value:
/// JSON fragments are inserted verbatim, plain strings are inserted as JSON string
/// literals (quoted and escaped).
/// </summary>
/// <remarks>
/// The template is scanned once, left to right, and substituted text is never re-scanned.
/// Replacing key by key would let a value that happens to contain <c>{{otherKey}}</c>
/// (for example the text being spoken) pick up a later substitution and corrupt the
/// payload. Placeholder names are matched ordinally (case-sensitive); unknown
/// placeholders are left untouched.
/// </remarks>
/// <param name="template">JSON template containing zero or more placeholders.</param>
/// <param name="values">Placeholder values keyed by name.</param>
/// <returns>The template with all known placeholders substituted.</returns>
private static string ApplyJsonTemplate(string template, IReadOnlyDictionary<string, TemplateValue> values)
{
    const string open = "{{";
    const string close = "}}";

    var output = new StringBuilder(template.Length);
    var cursor = 0;
    while (cursor < template.Length)
    {
        var start = template.IndexOf(open, cursor, StringComparison.Ordinal);
        if (start < 0)
        {
            break;
        }

        var end = template.IndexOf(close, start + open.Length, StringComparison.Ordinal);
        if (end < 0)
        {
            break;
        }

        var name = template.Substring(start + open.Length, end - start - open.Length);

        // Ordinal key match, independent of the dictionary's own comparer.
        TemplateValue? match = null;
        foreach (var entry in values)
        {
            if (string.Equals(entry.Key, name, StringComparison.Ordinal))
            {
                match = entry.Value;
                break;
            }
        }

        if (match is { } replacement)
        {
            output.Append(template, cursor, start - cursor);
            output.Append(replacement.JsonFragment
                ? replacement.Value
                : JsonSerializer.Serialize(replacement.Value));
            cursor = end + close.Length;
        }
        else
        {
            // Not a known placeholder (e.g. nested JSON braces): emit one brace and
            // rescan from the next character so an inner "{{key}}" is still found.
            output.Append(template, cursor, start - cursor + 1);
            cursor = start + 1;
        }
    }

    output.Append(template, cursor, template.Length - cursor);
    return output.ToString();
}
/// <summary>
/// Adds the provider API key to an HTTP request. When the header is
/// <c>Authorization</c> and a scheme is configured, the typed Authorization header is
/// set; otherwise the raw header is added without validation, prefixed with the
/// scheme when one is configured.
/// </summary>
/// <exception cref="InvalidOperationException">No non-blank API key is available.</exception>
private static void ApplyAuthenticationHeader(
    HttpRequestMessage request,
    VoiceTextToSpeechHttpContract contract,
    IReadOnlyDictionary<string, TemplateValue> values)
{
    var hasKey = values.TryGetValue(contract.ApiKeySettingKey, out var apiKey);
    if (!hasKey || string.IsNullOrWhiteSpace(apiKey.Value))
    {
        throw new InvalidOperationException("Voice provider API key is not configured.");
    }

    var scheme = contract.AuthenticationScheme;
    if (!string.IsNullOrWhiteSpace(scheme) &&
        string.Equals(contract.AuthenticationHeaderName, "Authorization", StringComparison.OrdinalIgnoreCase))
    {
        request.Headers.Authorization = new AuthenticationHeaderValue(scheme, apiKey.Value);
        return;
    }

    var rawValue = string.IsNullOrWhiteSpace(scheme)
        ? apiKey.Value
        : $"{scheme} {apiKey.Value}";
    request.Headers.TryAddWithoutValidation(contract.AuthenticationHeaderName, rawValue);
}
/// <summary>
/// Adds the provider API key as a request header on the WebSocket handshake. The
/// header value is the key alone, or <c>"{scheme} {key}"</c> when a scheme is configured.
/// </summary>
/// <exception cref="InvalidOperationException">No non-blank API key is available.</exception>
private static void ApplyAuthenticationHeader(
    ClientWebSocketOptions options,
    VoiceTextToSpeechWebSocketContract contract,
    IReadOnlyDictionary<string, TemplateValue> values)
{
    var hasKey = values.TryGetValue(contract.ApiKeySettingKey, out var apiKey);
    if (!hasKey || string.IsNullOrWhiteSpace(apiKey.Value))
    {
        throw new InvalidOperationException("Voice provider API key is not configured.");
    }

    // The header name does not influence the value: a configured scheme always prefixes the key.
    string headerValue;
    if (string.IsNullOrWhiteSpace(contract.AuthenticationScheme))
    {
        headerValue = apiKey.Value;
    }
    else
    {
        headerValue = $"{contract.AuthenticationScheme} {apiKey.Value}";
    }

    options.SetRequestHeader(contract.AuthenticationHeaderName, headerValue);
}
/// <summary>
/// Converts a method name from the contract into an <see cref="HttpMethod"/>.
/// "POST" in any casing yields the shared <see cref="HttpMethod.Post"/> instance;
/// a blank name defaults to POST; any other name is wrapped as given.
/// </summary>
private static HttpMethod ParseHttpMethod(string? method)
{
    var postVerb = HttpMethod.Post.Method;
    if (string.IsNullOrWhiteSpace(method))
    {
        return new HttpMethod(postVerb);
    }

    return string.Equals(method, postVerb, StringComparison.OrdinalIgnoreCase)
        ? HttpMethod.Post
        : new HttpMethod(method);
}
/// <summary>
/// Checks the status value embedded in a JSON response body. Does nothing when the
/// contract defines no status path; otherwise the value at that path must equal the
/// contract's success value ("0" when unspecified), compared case-insensitively.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The status is missing or differs from the success value. The provider's status
/// message is included when the contract defines a path for it and it is present.
/// </exception>
private static void ValidateResponseStatus(
    VoiceProviderOption provider,
    VoiceTextToSpeechHttpContract contract,
    JsonElement root)
{
    if (string.IsNullOrWhiteSpace(contract.ResponseStatusCodeJsonPath))
    {
        return;
    }

    string? actualStatus = GetJsonValue(root, contract.ResponseStatusCodeJsonPath) is { } statusElement
        ? JsonElementToString(statusElement)
        : null;
    var expectedStatus = contract.SuccessStatusValue ?? "0";
    if (string.Equals(actualStatus, expectedStatus, StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    string? detail = null;
    if (!string.IsNullOrWhiteSpace(contract.ResponseStatusMessageJsonPath) &&
        GetJsonValue(root, contract.ResponseStatusMessageJsonPath) is { } messageElement)
    {
        detail = JsonElementToString(messageElement);
    }

    var error = string.IsNullOrWhiteSpace(detail)
        ? $"{provider.Name} TTS returned an error."
        : $"{provider.Name} TTS returned an error: {detail}";
    throw new InvalidOperationException(error);
}
/// <summary>
/// Verifies that a WebSocket message is the expected handshake event: it must not be
/// the contract's failure event, and its <c>event</c> field must equal
/// <paramref name="expectedEvent"/> (case-insensitive).
/// </summary>
/// <exception cref="InvalidOperationException">
/// The message is a failure event or carries a different (or no) event name.
/// </exception>
private static void ValidateWebSocketEvent(
    string providerName,
    string expectedEvent,
    JsonElement message,
    VoiceTextToSpeechWebSocketContract contract)
{
    EnsureWebSocketNotFailed(providerName, contract, message);

    var isExpected = TryGetJsonString(message, "event", out var actualEvent)
        && string.Equals(actualEvent, expectedEvent, StringComparison.OrdinalIgnoreCase);
    if (isExpected)
    {
        return;
    }

    throw new InvalidOperationException($"{providerName} TTS returned an unexpected WebSocket event.");
}
/// <summary>
/// Throws when a WebSocket message's <c>event</c> field equals the contract's
/// task-failed event name (case-insensitive); otherwise returns silently.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The message is the failure event. The provider's status message is appended when
/// the contract defines a path for it and the message carries a non-blank value there.
/// </exception>
private static void EnsureWebSocketNotFailed(
    string providerName,
    VoiceTextToSpeechWebSocketContract contract,
    JsonElement message)
{
    if (!TryGetJsonString(message, "event", out var eventName))
    {
        return;
    }

    if (!string.Equals(eventName, contract.TaskFailedEventName, StringComparison.OrdinalIgnoreCase))
    {
        return;
    }

    string? detail = null;
    if (!string.IsNullOrWhiteSpace(contract.ResponseStatusMessageJsonPath) &&
        TryGetJsonString(message, contract.ResponseStatusMessageJsonPath, out var reported))
    {
        detail = reported;
    }

    throw new InvalidOperationException(
        string.IsNullOrWhiteSpace(detail)
            ? $"{providerName} TTS returned an error."
            : $"{providerName} TTS returned an error: {detail}");
}
/// <summary>
/// Walks a dot-separated property path (segments trimmed, empty segments ignored)
/// starting at <paramref name="root"/>.
/// </summary>
/// <returns>
/// The element at the end of the path, or <c>null</c> when the path is blank, a
/// segment is missing, or an intermediate element is not a JSON object.
/// </returns>
private static JsonElement? GetJsonValue(JsonElement root, string? path)
{
    if (string.IsNullOrWhiteSpace(path))
    {
        return null;
    }

    var segments = path.Split('.', StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);
    var node = root;
    for (var index = 0; index < segments.Length; index++)
    {
        if (node.ValueKind != JsonValueKind.Object)
        {
            return null;
        }

        if (!node.TryGetProperty(segments[index], out var child))
        {
            return null;
        }

        node = child;
    }

    return node;
}
/// <summary>
/// Reads the non-blank JSON string at <paramref name="path"/>, which is expected to
/// hold the encoded audio.
/// </summary>
/// <returns>The string value at the path.</returns>
/// <exception cref="InvalidOperationException">
/// The path does not resolve, the element is not a JSON string, or the string is blank.
/// </exception>
private static string GetRequiredJsonString(JsonElement root, string? path)
{
    const string missingAudioMessage = "Voice provider response did not contain audio data.";

    // Check the kind up front: calling GetString() on a number/object/array would throw
    // a generic framework error instead of the provider-facing message.
    if (GetJsonValue(root, path) is not { ValueKind: JsonValueKind.String } element)
    {
        throw new InvalidOperationException(missingAudioMessage);
    }

    var text = element.GetString();
    if (string.IsNullOrWhiteSpace(text))
    {
        throw new InvalidOperationException(missingAudioMessage);
    }

    return text;
}
/// <summary>
/// Tries to read the value at <paramref name="path"/> as text (strings as-is, other
/// kinds via their string form).
/// </summary>
/// <param name="value">
/// The non-blank text when the method returns <c>true</c>; otherwise an empty string.
/// </param>
/// <returns><c>true</c> when the path resolves to non-blank text.</returns>
private static bool TryGetJsonString(JsonElement root, string? path, out string value)
{
    if (GetJsonValue(root, path) is { } element &&
        JsonElementToString(element) is { } text &&
        !string.IsNullOrWhiteSpace(text))
    {
        value = text;
        return true;
    }

    value = string.Empty;
    return false;
}
/// <summary>
/// Returns <c>true</c> only when the element at the contract's final-flag path is the
/// JSON literal <c>true</c>.
/// </summary>
private static bool IsFinalWebSocketMessage(JsonElement root, VoiceTextToSpeechWebSocketContract contract)
    => GetJsonValue(root, contract.FinalFlagJsonPath) is { ValueKind: JsonValueKind.True };
/// <summary>
/// Converts a JSON element to text: strings yield their value, booleans yield
/// <see cref="bool.TrueString"/>/<see cref="bool.FalseString"/>, and every other kind
/// (numbers included) yields <see cref="JsonElement.ToString"/>.
/// </summary>
private static string? JsonElementToString(JsonElement element)
{
    switch (element.ValueKind)
    {
        case JsonValueKind.String:
            return element.GetString();
        case JsonValueKind.True:
            return bool.TrueString;
        case JsonValueKind.False:
            return bool.FalseString;
        default:
            return element.ToString();
    }
}
/// <summary>
/// Decodes an audio string according to the contract's response mode: hexadecimal
/// for the hex JSON-string mode, Base64 for the Base64 JSON-string mode.
/// </summary>
/// <exception cref="InvalidOperationException">
/// The mode is neither of the two supported modes, or the string is not valid for
/// the selected encoding (the <see cref="FormatException"/> is kept as inner exception).
/// </exception>
private static byte[] DecodeAudioBytes(string responseAudioMode, string audioValue, string providerName)
{
    var isHex = string.Equals(responseAudioMode, VoiceTextToSpeechResponseModes.HexJsonString, StringComparison.OrdinalIgnoreCase);
    var isBase64 = string.Equals(responseAudioMode, VoiceTextToSpeechResponseModes.Base64JsonString, StringComparison.OrdinalIgnoreCase);
    if (!isHex && !isBase64)
    {
        throw new InvalidOperationException($"Unsupported TTS response mode '{responseAudioMode}'.");
    }

    try
    {
        return isHex
            ? Convert.FromHexString(audioValue)
            : Convert.FromBase64String(audioValue);
    }
    catch (FormatException ex)
    {
        throw new InvalidOperationException($"{providerName} TTS returned invalid audio data.", ex);
    }
}
/// <summary>
/// Copies <paramref name="audioBytes"/> into a new in-memory random-access stream,
/// rewinds it, and wraps it in a result. A blank <paramref name="contentType"/>
/// defaults to <c>audio/mpeg</c>.
/// </summary>
/// <returns>A result that owns the stream; disposing the result disposes the stream.</returns>
private static async Task<VoiceCloudTextToSpeechResult> CreateResultAsync(byte[] audioBytes, string contentType)
{
    var stream = new InMemoryRandomAccessStream();
    try
    {
        await stream.WriteAsync(audioBytes.AsBuffer());
        await stream.FlushAsync();
        stream.Seek(0);
        return new VoiceCloudTextToSpeechResult(stream, string.IsNullOrWhiteSpace(contentType) ? "audio/mpeg" : contentType);
    }
    catch
    {
        // Ownership only passes to the result on success; release the stream if anything fails first.
        stream.Dispose();
        throw;
    }
}
/// <summary>
/// Copies <paramref name="sourceStream"/> into a new in-memory random-access stream,
/// rewinds it, and wraps it in a result. A blank <paramref name="contentType"/>
/// defaults to <c>audio/mpeg</c>.
/// </summary>
/// <param name="sourceStream">Stream to read the audio from; not disposed here.</param>
/// <param name="contentType">Content type reported on the result.</param>
/// <param name="cancellationToken">Cancels the copy.</param>
/// <returns>A result that owns the stream; disposing the result disposes the stream.</returns>
private static async Task<VoiceCloudTextToSpeechResult> CreateResultAsync(Stream sourceStream, string contentType, CancellationToken cancellationToken = default)
{
    var stream = new InMemoryRandomAccessStream();
    try
    {
        // NOTE(review): confirm that disposing the AsStreamForWrite adapter leaves the
        // underlying InMemoryRandomAccessStream open for the Seek below.
        await using (var output = stream.AsStreamForWrite())
        {
            await sourceStream.CopyToAsync(output, cancellationToken);
            await output.FlushAsync(cancellationToken);
        }

        stream.Seek(0);
        return new VoiceCloudTextToSpeechResult(stream, string.IsNullOrWhiteSpace(contentType) ? "audio/mpeg" : contentType);
    }
    catch
    {
        // A failed or cancelled copy must not leak the stream: ownership only
        // transfers to the result on success.
        stream.Dispose();
        throw;
    }
}
/// <summary>
/// Sends <paramref name="message"/> as one complete UTF-8 text frame.
/// </summary>
private static async Task SendTextMessageAsync(ClientWebSocket socket, string message, CancellationToken cancellationToken)
{
    var payload = Encoding.UTF8.GetBytes(message);
    await socket.SendAsync(payload, WebSocketMessageType.Text, endOfMessage: true, cancellationToken);
}
/// <summary>
/// Receives one complete WebSocket message (reassembling fragments), decodes it as
/// UTF-8 and parses it as JSON.
/// </summary>
/// <returns>A detached clone of the parsed root element.</returns>
/// <exception cref="InvalidOperationException">
/// The peer sent a close frame instead of a message; the close status and
/// description, when present, are included in the error text.
/// </exception>
private static async Task<JsonElement> ReceiveJsonMessageAsync(ClientWebSocket socket, CancellationToken cancellationToken)
{
    var chunk = new byte[8192];
    using var assembled = new MemoryStream();

    WebSocketReceiveResult frame;
    do
    {
        frame = await socket.ReceiveAsync(new ArraySegment<byte>(chunk), cancellationToken);
        if (frame.MessageType == WebSocketMessageType.Close)
        {
            var status = socket.CloseStatus?.ToString() ?? "Unknown";
            var description = socket.CloseStatusDescription;
            throw new InvalidOperationException(
                string.IsNullOrWhiteSpace(description)
                    ? $"Voice provider closed the WebSocket unexpectedly ({status})."
                    : $"Voice provider closed the WebSocket unexpectedly ({status}: {description}).");
        }

        assembled.Write(chunk, 0, frame.Count);
    }
    while (!frame.EndOfMessage);

    var json = Encoding.UTF8.GetString(assembled.ToArray());
    using var parsed = JsonDocument.Parse(json);

    // Clone so the element stays valid after the document is disposed.
    return parsed.RootElement.Clone();
}
/// <summary>
/// Creates an <see cref="HttpClient"/> with a 30 second request timeout.
/// </summary>
private static HttpClient CreateHttpClient()
{
    var client = new HttpClient();
    client.Timeout = TimeSpan.FromSeconds(30);
    return client;
}
/// <summary>
/// A placeholder value for request templates. <c>JsonFragment</c> marks values that are
/// already JSON text and must be inserted verbatim instead of as a JSON string literal.
/// </summary>
private readonly record struct TemplateValue(string Value, bool JsonFragment)
{
    /// <summary>Wraps plain text (not a JSON fragment).</summary>
    public static TemplateValue FromString(string value) => new(value, false);

    /// <summary>
    /// Wraps a JSON-valued setting. Placeholders in <paramref name="json"/> are first
    /// substituted from <paramref name="templateValues"/> (when given). The result must
    /// parse either as a complete JSON value or, wrapped in braces, as the members of an
    /// object; in the latter case the braces are stripped again from the stored text.
    /// </summary>
    /// <exception cref="InvalidOperationException">
    /// Neither form parses; the first parse failure is the inner exception.
    /// </exception>
    public static TemplateValue FromJson(
        string json,
        string providerName,
        string label,
        IReadOnlyDictionary<string, TemplateValue>? templateValues = null)
    {
        var candidate = templateValues is null
            ? json
            : ApplyJsonTemplate(json, templateValues);

        JsonException standaloneFailure;
        try
        {
            using var standalone = JsonDocument.Parse(candidate);
            return new TemplateValue(standalone.RootElement.GetRawText(), true);
        }
        catch (JsonException ex)
        {
            standaloneFailure = ex;
        }

        // Second chance: treat the text as bare object members ("\"a\": 1, \"b\": 2").
        try
        {
            using var wrapped = JsonDocument.Parse("{ " + candidate + " }");
            var wrappedText = wrapped.RootElement.GetRawText();
            return new TemplateValue(wrappedText[1..^1], true);
        }
        catch (JsonException)
        {
            throw new InvalidOperationException(
                $"{providerName} setting '{label}' must be valid JSON.",
                standaloneFailure);
        }
    }

    /// <summary>Unwraps the raw value text.</summary>
    public static implicit operator string(TemplateValue value) => value.Value;
}
}
/// <summary>
/// Audio produced by a cloud text-to-speech request: the audio stream and its content
/// type. The result owns the stream; disposing the result disposes it.
/// </summary>
public sealed class VoiceCloudTextToSpeechResult : IDisposable
{
    /// <summary>Creates a result over the given stream and content type.</summary>
    public VoiceCloudTextToSpeechResult(IRandomAccessStream stream, string contentType)
        => (Stream, ContentType) = (stream, contentType);

    /// <summary>The audio stream.</summary>
    public IRandomAccessStream Stream { get; }

    /// <summary>Content type of the audio in <see cref="Stream"/>.</summary>
    public string ContentType { get; }

    /// <summary>Disposes the underlying stream.</summary>
    public void Dispose() => Stream.Dispose();
}

View File

@ -0,0 +1,25 @@
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Direction of a voice conversation turn, as carried by
/// <see cref="VoiceConversationTurnEventArgs.Direction"/>.
/// </summary>
public enum VoiceConversationDirection
{
// NOTE(review): presumably a turn sent by the user/tray — confirm against the code that raises the event.
Outgoing,
// NOTE(review): presumably a turn received from the assistant — confirm against the code that raises the event.
Incoming
}
/// <summary>
/// Event data describing a single voice conversation turn.
/// </summary>
public sealed class VoiceConversationTurnEventArgs : EventArgs
{
/// <summary>Direction of the turn.</summary>
public VoiceConversationDirection Direction { get; set; }
/// <summary>Key of the session the turn belongs to; defaults to "main".</summary>
public string SessionKey { get; set; } = "main";
/// <summary>Text of the turn; defaults to an empty string.</summary>
public string Message { get; set; } = "";
/// <summary>Voice activation mode associated with the turn; defaults to <see cref="VoiceActivationMode.Off"/>.</summary>
public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off;
}
/// <summary>
/// Event data describing an update to the in-progress transcript draft.
/// </summary>
public sealed class VoiceTranscriptDraftEventArgs : EventArgs
{
/// <summary>Key of the session the draft belongs to; defaults to "main".</summary>
public string SessionKey { get; set; } = "main";
/// <summary>Draft text; defaults to an empty string.</summary>
public string Text { get; set; } = "";
/// <summary>Clear flag. NOTE(review): presumably asks listeners to clear the displayed draft — confirm with the subscribers.</summary>
public bool Clear { get; set; }
/// <summary>Voice activation mode associated with the draft; defaults to <see cref="VoiceActivationMode.Off"/>.</summary>
public VoiceActivationMode Mode { get; set; } = VoiceActivationMode.Off;
}

View File

@ -0,0 +1,256 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text.Json;
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Loads the voice provider catalog asset and resolves speech-to-text and
/// text-to-speech providers from it. Every provider handed out is a copy, so callers
/// cannot mutate the loaded catalog.
/// </summary>
public static class VoiceProviderCatalogService
{
    // Upper bound on the catalog file size accepted by LoadCatalog.
    private const long MaxCatalogBytes = 256 * 1024;
    private const string CatalogRelativePath = "Assets\\voice-providers.json";

    private static readonly JsonSerializerOptions s_jsonOptions = new()
    {
        PropertyNameCaseInsensitive = true,
        WriteIndented = true
    };

    /// <summary>Path the catalog is (or would be) loaded from.</summary>
    public static string CatalogFilePath => ResolveCatalogFilePath();

    /// <summary>
    /// Reads, deserializes and normalizes the catalog file.
    /// </summary>
    /// <param name="logger">Accepted for API symmetry; this method does not currently log.</param>
    /// <returns>The normalized catalog.</returns>
    /// <exception cref="InvalidOperationException">
    /// The file is missing, larger than the size limit, or cannot be deserialized.
    /// The underlying failure is the inner exception.
    /// </exception>
    public static VoiceProviderCatalog LoadCatalog(IOpenClawLogger? logger = null)
    {
        var catalogFilePath = ResolveCatalogFilePath();

        try
        {
            if (!File.Exists(catalogFilePath))
            {
                throw new FileNotFoundException("Voice provider catalog asset not found.", catalogFilePath);
            }

            var fileInfo = new FileInfo(catalogFilePath);
            if (fileInfo.Length > MaxCatalogBytes)
            {
                throw new InvalidOperationException($"Voice provider catalog exceeds {MaxCatalogBytes} bytes.");
            }

            var json = File.ReadAllText(catalogFilePath);
            var catalog = JsonSerializer.Deserialize<VoiceProviderCatalog>(json, s_jsonOptions);
            if (catalog == null)
            {
                throw new InvalidOperationException("Voice provider catalog asset is empty or invalid.");
            }

            return NormalizeCatalog(catalog);
        }
        catch (Exception ex)
        {
            // Every failure surfaces as one exception type that names the file.
            throw new InvalidOperationException(
                $"Failed to load voice provider catalog from '{catalogFilePath}': {ex.Message}",
                ex);
        }
    }

    /// <summary>
    /// Loads the catalog and returns the speech-to-text provider with the given id,
    /// falling back to the Windows provider when the id is blank or unknown.
    /// </summary>
    public static VoiceProviderOption ResolveSpeechToTextProvider(string? providerId, IOpenClawLogger? logger = null)
    {
        var catalog = LoadCatalog(logger);
        return ResolveProvider(catalog.SpeechToTextProviders, providerId);
    }

    /// <summary>
    /// Loads the catalog and returns the text-to-speech provider with the given id,
    /// falling back to the Windows provider when the id is blank or unknown.
    /// </summary>
    public static VoiceProviderOption ResolveTextToSpeechProvider(string? providerId, IOpenClawLogger? logger = null)
    {
        var catalog = LoadCatalog(logger);
        return ResolveProvider(catalog.TextToSpeechProviders, providerId);
    }

    /// <summary>Returns <c>true</c> when the id is the Windows provider id (case-insensitive).</summary>
    public static bool SupportsWindowsRuntime(string? providerId)
    {
        return string.Equals(providerId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Returns <c>true</c> when the resolved speech-to-text provider routes to the
    /// Windows media recognizer. Any failure while resolving yields <c>false</c>.
    /// </summary>
    public static bool SupportsSpeechToTextRuntime(string? providerId)
    {
        try
        {
            var provider = ResolveSpeechToTextProvider(providerId);
            return VoiceSpeechToTextRouteResolver.ResolveRouteKind(provider) == VoiceSpeechToTextRouteKind.WindowsMedia;
        }
        catch
        {
            return false;
        }
    }

    /// <summary>
    /// Returns <c>true</c> for the Windows provider, or when the resolved text-to-speech
    /// provider exposes an HTTP or WebSocket contract. Any failure while resolving
    /// yields <c>false</c>.
    /// </summary>
    public static bool SupportsTextToSpeechRuntime(string? providerId)
    {
        if (SupportsWindowsRuntime(providerId))
        {
            return true;
        }

        try
        {
            var provider = ResolveTextToSpeechProvider(providerId);
            return provider.TextToSpeechHttp != null || provider.TextToSpeechWebSocket != null;
        }
        catch
        {
            return false;
        }
    }

    // Builds a fresh catalog whose provider lists are filtered, copied and ordered.
    private static VoiceProviderCatalog NormalizeCatalog(VoiceProviderCatalog catalog)
    {
        return new VoiceProviderCatalog
        {
            SpeechToTextProviders = NormalizeProviders(catalog.SpeechToTextProviders),
            TextToSpeechProviders = NormalizeProviders(catalog.TextToSpeechProviders)
        };
    }

    // Drops providers without an id or that are neither enabled nor visible in settings,
    // then orders the Windows provider first and the rest by name.
    private static List<VoiceProviderOption> NormalizeProviders(List<VoiceProviderOption>? providers)
    {
        return (providers ?? [])
            .Where(p => !string.IsNullOrWhiteSpace(p.Id))
            .Select(Clone)
            .Where(p => p.Enabled || p.VisibleInSettings)
            .OrderByDescending(p => string.Equals(p.Id, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase))
            .ThenBy(p => p.Name, StringComparer.OrdinalIgnoreCase)
            .ToList();
    }

    // Returns a copy of the first provider matching providerId; otherwise a copy of the
    // first Windows provider; otherwise a synthesized Windows provider. Only the
    // provider actually returned is cloned.
    private static VoiceProviderOption ResolveProvider(IEnumerable<VoiceProviderOption> providers, string? providerId)
    {
        var hasRequestedId = !string.IsNullOrWhiteSpace(providerId);
        VoiceProviderOption? windowsFallback = null;

        foreach (var candidate in providers)
        {
            if (hasRequestedId && string.Equals(candidate.Id, providerId, StringComparison.OrdinalIgnoreCase))
            {
                return Clone(candidate);
            }

            if (windowsFallback == null &&
                string.Equals(candidate.Id, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase))
            {
                windowsFallback = candidate;
            }
        }

        if (windowsFallback != null)
        {
            return Clone(windowsFallback);
        }

        return new VoiceProviderOption
        {
            Id = VoiceProviderIds.Windows,
            Name = "Windows Speech",
            Runtime = "windows"
        };
    }

    // Deep-copies a provider, including its settings and contracts.
    private static VoiceProviderOption Clone(VoiceProviderOption source)
    {
        return new VoiceProviderOption
        {
            Id = source.Id,
            Name = source.Name,
            Runtime = source.Runtime,
            Enabled = source.Enabled,
            VisibleInSettings = source.VisibleInSettings,
            Selectable = source.Selectable,
            Description = source.Description,
            // An explicit JSON null can leave the collection unset; treat it as empty.
            Settings = (source.Settings ?? []).Select(Clone).ToList(),
            TextToSpeechHttp = Clone(source.TextToSpeechHttp),
            TextToSpeechWebSocket = Clone(source.TextToSpeechWebSocket)
        };
    }

    // Copies a setting definition, including its option list.
    private static VoiceProviderSettingDefinition Clone(VoiceProviderSettingDefinition source)
    {
        return new VoiceProviderSettingDefinition
        {
            Key = source.Key,
            Label = source.Label,
            Secret = source.Secret,
            Required = source.Required,
            JsonValue = source.JsonValue,
            DefaultValue = source.DefaultValue,
            Placeholder = source.Placeholder,
            Description = source.Description,
            // An explicit JSON null can leave the collection unset; treat it as empty.
            Options = (source.Options ?? []).ToList()
        };
    }

    // Copies an HTTP contract; null stays null.
    private static VoiceTextToSpeechHttpContract? Clone(VoiceTextToSpeechHttpContract? source)
    {
        if (source == null)
        {
            return null;
        }

        return new VoiceTextToSpeechHttpContract
        {
            EndpointTemplate = source.EndpointTemplate,
            HttpMethod = source.HttpMethod,
            AuthenticationHeaderName = source.AuthenticationHeaderName,
            AuthenticationScheme = source.AuthenticationScheme,
            ApiKeySettingKey = source.ApiKeySettingKey,
            RequestContentType = source.RequestContentType,
            RequestBodyTemplate = source.RequestBodyTemplate,
            ResponseAudioMode = source.ResponseAudioMode,
            ResponseAudioJsonPath = source.ResponseAudioJsonPath,
            ResponseStatusCodeJsonPath = source.ResponseStatusCodeJsonPath,
            ResponseStatusMessageJsonPath = source.ResponseStatusMessageJsonPath,
            SuccessStatusValue = source.SuccessStatusValue,
            OutputContentType = source.OutputContentType
        };
    }

    // Copies a WebSocket contract; null stays null.
    private static VoiceTextToSpeechWebSocketContract? Clone(VoiceTextToSpeechWebSocketContract? source)
    {
        if (source == null)
        {
            return null;
        }

        return new VoiceTextToSpeechWebSocketContract
        {
            EndpointTemplate = source.EndpointTemplate,
            AuthenticationHeaderName = source.AuthenticationHeaderName,
            AuthenticationScheme = source.AuthenticationScheme,
            ApiKeySettingKey = source.ApiKeySettingKey,
            ConnectSuccessEventName = source.ConnectSuccessEventName,
            StartMessageTemplate = source.StartMessageTemplate,
            StartSuccessEventName = source.StartSuccessEventName,
            ContinueMessageTemplate = source.ContinueMessageTemplate,
            FinishMessageTemplate = source.FinishMessageTemplate,
            ResponseAudioMode = source.ResponseAudioMode,
            ResponseAudioJsonPath = source.ResponseAudioJsonPath,
            ResponseStatusCodeJsonPath = source.ResponseStatusCodeJsonPath,
            ResponseStatusMessageJsonPath = source.ResponseStatusMessageJsonPath,
            FinalFlagJsonPath = source.FinalFlagJsonPath,
            TaskFailedEventName = source.TaskFailedEventName,
            SuccessStatusValue = source.SuccessStatusValue,
            OutputContentType = source.OutputContentType
        };
    }

    // Prefers the catalog next to the executable; otherwise walks up from the base
    // directory looking for the asset inside the source tree. When nothing is found the
    // bundled path is returned so error messages name the expected location.
    private static string ResolveCatalogFilePath()
    {
        var bundledPath = Path.Combine(AppContext.BaseDirectory, CatalogRelativePath);
        if (File.Exists(bundledPath))
        {
            return bundledPath;
        }

        for (var current = new DirectoryInfo(AppContext.BaseDirectory); current != null; current = current.Parent)
        {
            var sourcePath = Path.Combine(current.FullName, "src", "OpenClaw.Tray.WinUI", CatalogRelativePath);
            if (File.Exists(sourcePath))
            {
                return sourcePath;
            }
        }

        return bundledPath;
    }
}

View File

@ -0,0 +1,255 @@
using OpenClaw.Shared;
using Windows.Media.Devices;
using Windows.Media.SpeechRecognition;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Pure decision helpers for the voice service: transport readiness, acceptance of
/// assistant replies, recognition restart/rebuild decisions and transcript selection.
/// The methods only combine their arguments; none of them touches shared state.
/// </summary>
public static class VoiceServiceTransportLogic
{
    // Maximum age of a hypothesis that may still replace or stand in for a final result.
    private static readonly TimeSpan HypothesisPromotionWindow = TimeSpan.FromSeconds(2);

    /// <summary>
    /// Returns the existing ready source while a connection attempt is in flight;
    /// otherwise creates a new one and asks the caller to start connecting.
    /// </summary>
    /// <param name="shouldStartConnection">
    /// <c>false</c> when the existing source is reused, <c>true</c> when a new one was created.
    /// </param>
    public static TaskCompletionSource<bool> GetOrCreateTransportReadySource(
        ConnectionStatus transportStatus,
        TaskCompletionSource<bool>? existingReadySource,
        out bool shouldStartConnection)
    {
        if (existingReadySource is { } pending && transportStatus == ConnectionStatus.Connecting)
        {
            shouldStartConnection = false;
            return pending;
        }

        shouldStartConnection = true;
        return new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
    }

    /// <summary>Returns <c>true</c> when the provider exposes an HTTP or WebSocket TTS contract.</summary>
    public static bool UsesCloudTextToSpeechRuntime(VoiceProviderOption provider)
        => !(provider.TextToSpeechHttp == null && provider.TextToSpeechWebSocket == null);

    /// <summary>
    /// An assistant reply is accepted while a reply is awaited, speech is playing,
    /// replies are queued, or the late-reply grace applied.
    /// </summary>
    public static bool ShouldAcceptAssistantReply(
        bool awaitingReply,
        bool isSpeaking,
        int queuedReplyCount,
        bool acceptedViaLateReplyGrace = false)
        => awaitingReply || isSpeaking || queuedReplyCount > 0 || acceptedViaLateReplyGrace;

    /// <summary>
    /// A late reply is accepted only when the service is otherwise idle, both session
    /// keys are non-blank and match, and the grace deadline has not passed.
    /// </summary>
    public static bool ShouldAcceptLateAssistantReply(
        bool awaitingReply,
        bool isSpeaking,
        int queuedReplyCount,
        string? lateReplySessionKey,
        DateTime? lateReplyGraceUntilUtc,
        string? incomingSessionKey,
        DateTime utcNow)
    {
        if (awaitingReply || isSpeaking || queuedReplyCount != 0)
        {
            return false;
        }

        if (string.IsNullOrWhiteSpace(lateReplySessionKey) || string.IsNullOrWhiteSpace(incomingSessionKey))
        {
            return false;
        }

        if (!IsMatchingSessionKey(incomingSessionKey, lateReplySessionKey))
        {
            return false;
        }

        return lateReplyGraceUntilUtc is { } deadline && utcNow <= deadline;
    }

    /// <summary>
    /// Recognition restarts after completion only in talk mode while running, and only
    /// when no restart is in progress, no reply is awaited and nothing is being spoken.
    /// </summary>
    public static bool ShouldRestartRecognitionAfterCompletion(
        bool running,
        VoiceActivationMode mode,
        bool restartInProgress,
        bool awaitingReply,
        bool isSpeaking)
    {
        if (!running || mode != VoiceActivationMode.TalkMode)
        {
            return false;
        }

        return !(restartInProgress || awaitingReply || isSpeaking);
    }

    /// <summary>
    /// Names the first condition that blocks a restart after completion, checked in the
    /// order running, mode, restart, awaiting reply, speaking; "eligible" when none does.
    /// </summary>
    public static string DescribeRecognitionCompletionRestartDecision(
        bool running,
        VoiceActivationMode mode,
        bool restartInProgress,
        bool awaitingReply,
        bool isSpeaking)
    {
        return (running, mode, restartInProgress, awaitingReply, isSpeaking) switch
        {
            (false, _, _, _, _) => "runtime-not-running",
            (_, not VoiceActivationMode.TalkMode, _, _, _) => $"mode={mode}",
            (_, _, true, _, _) => "controlled-restart-in-progress",
            (_, _, _, true, _) => "awaiting-reply",
            (_, _, _, _, true) => "speaking",
            _ => "eligible"
        };
    }

    /// <summary>
    /// The recognizer is rebuilt only for a user-canceled completion with no session
    /// activity while the service is idle. <paramref name="sessionHadCaptureSignal"/>
    /// does not influence the result.
    /// </summary>
    public static bool ShouldRebuildRecognitionAfterCompletion(
        SpeechRecognitionResultStatus status,
        bool sessionHadActivity,
        bool sessionHadCaptureSignal,
        bool restartInProgress,
        bool awaitingReply,
        bool isSpeaking)
    {
        var blocked = restartInProgress || awaitingReply || isSpeaking || sessionHadActivity;
        return !blocked && status == SpeechRecognitionResultStatus.UserCanceled;
    }

    /// <summary>
    /// Names the first applicable rebuild condition, checked in the order restart,
    /// awaiting reply, speaking, session activity, capture signal; otherwise a label
    /// derived from <paramref name="status"/>.
    /// </summary>
    public static string DescribeRecognitionCompletionRebuildDecision(
        SpeechRecognitionResultStatus status,
        bool sessionHadActivity,
        bool sessionHadCaptureSignal,
        bool restartInProgress,
        bool awaitingReply,
        bool isSpeaking)
    {
        return (restartInProgress, awaitingReply, isSpeaking, sessionHadActivity, sessionHadCaptureSignal) switch
        {
            (true, _, _, _, _) => "controlled-restart-in-progress",
            (_, true, _, _, _) => "awaiting-reply",
            (_, _, true, _, _) => "speaking",
            (_, _, _, true, _) => "session-had-activity",
            (_, _, _, _, true) => "capture-signal-without-recognition",
            _ when status == SpeechRecognitionResultStatus.UserCanceled => "user-canceled-without-activity",
            _ => $"disabled-official-session-restart-only (status={status})"
        };
    }

    /// <summary>
    /// Chooses between the final recognized text and the latest hypothesis. The
    /// hypothesis is promoted only when it is recent, more than three characters longer
    /// than the trimmed result, and ends with that result (case-insensitive).
    /// </summary>
    /// <param name="promotedHypothesis"><c>true</c> when the hypothesis was returned.</param>
    /// <returns>
    /// <paramref name="recognizedText"/> unchanged when either text is blank or the
    /// hypothesis is stale; otherwise the trimmed result or the trimmed hypothesis.
    /// </returns>
    public static string SelectRecognizedText(
        string recognizedText,
        string? latestHypothesisText,
        DateTime latestHypothesisUtc,
        DateTime utcNow,
        out bool promotedHypothesis)
    {
        promotedHypothesis = false;

        if (string.IsNullOrWhiteSpace(recognizedText) || string.IsNullOrWhiteSpace(latestHypothesisText))
        {
            return recognizedText;
        }

        if (utcNow - latestHypothesisUtc > HypothesisPromotionWindow)
        {
            return recognizedText;
        }

        var finalText = recognizedText.Trim();
        var hypothesis = latestHypothesisText.Trim();

        var hypothesisIsLonger = hypothesis.Length > finalText.Length + 3;
        promotedHypothesis = hypothesisIsLonger
            && hypothesis.EndsWith(finalText, StringComparison.OrdinalIgnoreCase);

        return promotedHypothesis ? hypothesis : finalText;
    }

    /// <summary>
    /// Returns the trimmed latest hypothesis as a fallback transcript when the session
    /// had activity and the hypothesis is non-blank and recent; otherwise <c>null</c>.
    /// </summary>
    public static string? SelectCompletionFallbackText(
        bool sessionHadActivity,
        string? latestHypothesisText,
        DateTime latestHypothesisUtc,
        DateTime utcNow)
    {
        return sessionHadActivity
            && !string.IsNullOrWhiteSpace(latestHypothesisText)
            && utcNow - latestHypothesisUtc <= HypothesisPromotionWindow
            ? latestHypothesisText.Trim()
            : null;
    }

    /// <summary>
    /// The transcript draft is cleared after completion unless a reply is awaited,
    /// speech is playing, or a fallback transcript was used.
    /// </summary>
    public static bool ShouldClearTranscriptDraftAfterCompletion(
        bool awaitingReply,
        bool isSpeaking,
        bool usedFallbackTranscript)
        => !(awaitingReply || isSpeaking || usedFallbackTranscript);

    /// <summary>
    /// A reprompt follows an incomplete recognition when the session had activity and
    /// no reply is awaited, nothing is being spoken and no fallback transcript was used.
    /// </summary>
    public static bool ShouldRepromptAfterIncompleteRecognition(
        bool sessionHadActivity,
        bool awaitingReply,
        bool isSpeaking,
        bool usedFallbackTranscript)
        => sessionHadActivity && !(awaitingReply || isSpeaking || usedFallbackTranscript);

    /// <summary>
    /// Recognition is refreshed on a default capture device change only while running
    /// in talk mode, with no explicitly configured input device, and for the
    /// <see cref="AudioDeviceRole.Default"/> role.
    /// </summary>
    public static bool ShouldRefreshRecognitionForDefaultCaptureDeviceChange(
        bool running,
        VoiceActivationMode mode,
        string? configuredInputDeviceId,
        AudioDeviceRole role)
    {
        if (!running || mode != VoiceActivationMode.TalkMode)
        {
            return false;
        }

        return string.IsNullOrWhiteSpace(configuredInputDeviceId) && role == AudioDeviceRole.Default;
    }

    // Compares session keys case-insensitively, treating a blank key as "main".
    private static bool IsMatchingSessionKey(string? first, string? second)
    {
        static string? Normalize(string? key) => string.IsNullOrWhiteSpace(key) ? "main" : key;

        return string.Equals(Normalize(first), Normalize(second), StringComparison.OrdinalIgnoreCase);
    }
}

View File

@ -0,0 +1,8 @@
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Speech-to-text routes a provider can resolve to.
/// </summary>
public enum VoiceSpeechToTextRouteKind
{
// NOTE(review): presumably the built-in Windows speech recognizer — confirm against the consumers.
WindowsMedia,
// NOTE(review): presumably a streaming provider runtime — confirm against the consumers.
Streaming,
// NOTE(review): presumably the embedded Sherpa-ONNX runtime — confirm against the consumers.
SherpaOnnx
}

View File

@ -0,0 +1,28 @@
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Maps a speech-to-text provider to the route that should handle it.
/// </summary>
public static class VoiceSpeechToTextRouteResolver
{
    /// <summary>
    /// Resolves the route for <paramref name="provider"/>. Checks, in order: the
    /// Sherpa-ONNX provider id, the streaming runtime, the embedded runtime (which also
    /// maps to Sherpa-ONNX); everything else uses the Windows media route. All
    /// comparisons are case-insensitive.
    /// </summary>
    /// <exception cref="ArgumentNullException"><paramref name="provider"/> is null.</exception>
    public static VoiceSpeechToTextRouteKind ResolveRouteKind(VoiceProviderOption provider)
    {
        ArgumentNullException.ThrowIfNull(provider);

        static bool Matches(string? actual, string? expected)
            => string.Equals(actual, expected, StringComparison.OrdinalIgnoreCase);

        if (Matches(provider.Id, VoiceProviderIds.SherpaOnnx))
        {
            return VoiceSpeechToTextRouteKind.SherpaOnnx;
        }

        var runtime = provider.Runtime;
        return Matches(runtime, VoiceProviderRuntimeIds.Streaming)
            ? VoiceSpeechToTextRouteKind.Streaming
            : Matches(runtime, VoiceProviderRuntimeIds.Embedded)
                ? VoiceSpeechToTextRouteKind.SherpaOnnx
                : VoiceSpeechToTextRouteKind.WindowsMedia;
    }
}

View File

@ -0,0 +1,95 @@
using System.Text.Json;
namespace OpenClawTray.Windows;
/// <summary>
/// JavaScript injected into the web chat page so the tray can mirror the voice
/// transcript draft into the page's message composer.
/// </summary>
public static class WebChatVoiceDomBridge
{
    /// <summary>
    /// Script that installs <c>window.__openClawTrayVoice</c> (<c>setDraft</c>,
    /// <c>clearDraft</c>, <c>setTurns</c>) and a MutationObserver that re-applies the
    /// desired draft to the first visible composer whenever the DOM changes.
    /// </summary>
    /// <remarks>
    /// <c>setElementValue</c> returns early when the composer already holds the desired
    /// text. Without that guard the observer and the writer feed each other: assigning
    /// <c>textContent</c> on a contenteditable composer is itself a child-list mutation,
    /// so a non-empty draft would re-trigger the observer indefinitely, and form
    /// controls would receive redundant input/change events on every unrelated mutation.
    /// </remarks>
    public const string DocumentCreatedScript = """
(() => {
const isVisible = (el) => !!el && !(el.disabled === true) && el.getClientRects().length > 0;
let desiredDraft = '';
const findComposer = () => {
const candidates = Array.from(document.querySelectorAll('textarea, input[type="text"], [contenteditable="true"], [contenteditable="plaintext-only"]'));
return candidates.find(isVisible) || null;
};
const setElementValue = (el, value) => {
const text = typeof value === 'string' ? value : '';
if ('value' in el) {
if (el.value === text) return;
const proto = el.tagName === 'TEXTAREA' ? HTMLTextAreaElement.prototype : HTMLInputElement.prototype;
const descriptor = Object.getOwnPropertyDescriptor(proto, 'value');
if (descriptor && descriptor.set) {
descriptor.set.call(el, text);
} else {
el.value = text;
}
el.dispatchEvent(new InputEvent('input', { bubbles: true, data: text, inputType: 'insertText' }));
el.dispatchEvent(new Event('change', { bubbles: true }));
return;
}
if (el.isContentEditable) {
if (el.textContent === text) return;
el.textContent = text;
el.dispatchEvent(new InputEvent('input', { bubbles: true, data: text, inputType: 'insertText' }));
el.dispatchEvent(new Event('change', { bubbles: true }));
}
};
const applyDraftIfPossible = () => {
const composer = findComposer();
if (!composer) return false;
setElementValue(composer, desiredDraft);
return true;
};
const clearLegacyTurnsHost = () => {
const host = document.getElementById('openclaw-tray-voice-turns');
if (host) {
host.remove();
}
};
const observer = new MutationObserver(() => applyDraftIfPossible());
const start = () => {
if (!document.body) return;
observer.observe(document.body, { childList: true, subtree: true });
applyDraftIfPossible();
clearLegacyTurnsHost();
};
if (document.readyState === 'loading') {
document.addEventListener('DOMContentLoaded', start, { once: true });
} else {
start();
}
window.__openClawTrayVoice = {
setDraft(text) {
desiredDraft = text || '';
return applyDraftIfPossible();
},
clearDraft() {
desiredDraft = '';
return applyDraftIfPossible();
},
setTurns() {
clearLegacyTurnsHost();
return true;
}
};
})();
""";

    /// <summary>
    /// Builds the script that pushes a draft into the page: blank text clears the draft,
    /// anything else is passed to <c>setDraft</c> as a JSON-encoded string literal.
    /// </summary>
    public static string BuildSetDraftScript(string? text)
    {
        if (string.IsNullOrWhiteSpace(text))
        {
            return "window.__openClawTrayVoice?.clearDraft?.();";
        }

        return $"window.__openClawTrayVoice?.setDraft?.({JsonSerializer.Serialize(text)});";
    }

    /// <summary>Script that removes the legacy turns host element from the page.</summary>
    public const string ClearLegacyTurnsScript = "window.__openClawTrayVoice?.setTurns?.([]);";
}

View File

@ -6,6 +6,7 @@ using OpenClaw.Shared.Capabilities;
using OpenClawTray.Dialogs;
using OpenClawTray.Helpers;
using OpenClawTray.Services;
using OpenClawTray.Services.Voice;
using OpenClawTray.Windows;
using System;
using System.Collections.Frozen;
@ -39,6 +40,7 @@ public partial class App : Application
private GlobalHotkeyService? _globalHotkey;
private System.Timers.Timer? _healthCheckTimer;
private System.Timers.Timer? _sessionPollTimer;
private Microsoft.UI.Dispatching.DispatcherQueueTimer? _voiceTrayIconTimer;
private Mutex? _mutex;
private Microsoft.UI.Dispatching.DispatcherQueue? _dispatcherQueue;
private CancellationTokenSource? _deepLinkCts;
@ -57,6 +59,7 @@ public partial class App : Application
private GatewayCostUsageInfo? _lastUsageCost;
private DateTime _lastCheckTime = DateTime.Now;
private DateTime _lastUsageActivityLogUtc = DateTime.MinValue;
private string? _lastTrayIconPath;
// FrozenDictionary for O(1) case-insensitive notification type → setting lookup — no per-call allocation.
private static readonly System.Collections.Frozen.FrozenDictionary<string, Func<SettingsManager, bool>> s_notifTypeMap =
@ -81,6 +84,8 @@ public partial class App : Application
// Windows (created on demand)
private SettingsWindow? _settingsWindow;
private VoiceRepeaterWindow? _voiceRepeaterWindow;
private VoiceModeWindow? _voiceModeWindow;
private WebChatWindow? _webChatWindow;
private StatusDetailWindow? _statusDetailWindow;
private NotificationHistoryWindow? _notificationHistoryWindow;
@ -90,6 +95,8 @@ public partial class App : Application
// Node service (optional, enabled in settings)
private NodeService? _nodeService;
private VoiceService? _voiceService;
private VoiceChatCoordinator? _voiceChatCoordinator;
// Keep-alive window to anchor WinUI runtime (prevents GC/threading issues)
private Window? _keepAliveWindow;
@ -269,6 +276,11 @@ public partial class App : Application
// Register toast activation handler
ToastNotificationManagerCompat.OnActivated += OnToastActivated;
_voiceService = new VoiceService(new AppLogger(), _settings);
_voiceChatCoordinator = new VoiceChatCoordinator(
_voiceService,
new DispatcherQueueAdapter(_dispatcherQueue!));
_voiceChatCoordinator.ConversationTurnAvailable += OnVoiceConversationTurnAvailable;
_sshTunnelService = new SshTunnelService(new AppLogger());
_sshTunnelService.TunnelExited += OnSshTunnelExited;
@ -297,6 +309,7 @@ public partial class App : Application
// Start health check timer
StartHealthCheckTimer();
StartVoiceTrayIconTimer();
// Start deep link server
StartDeepLinkServer();
@ -305,7 +318,8 @@ public partial class App : Application
if (_settings.GlobalHotkeyEnabled)
{
_globalHotkey = new GlobalHotkeyService();
_globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed;
_globalHotkey.QuickSendHotkeyPressed += OnGlobalQuickSendHotkeyPressed;
_globalHotkey.VoiceToggleHotkeyPressed += OnGlobalVoiceToggleHotkeyPressed;
_globalHotkey.Register();
}
@ -318,6 +332,11 @@ public partial class App : Application
HandleDeepLink(startupDeepLink);
}
if (ShouldShowVoiceRepeaterAtStartup())
{
_dispatcherQueue?.TryEnqueue(ShowVoiceModeSettings);
}
Logger.Info("Application started (WinUI 3)");
}
@ -341,13 +360,28 @@ public partial class App : Application
// Pre-create tray menu window at startup to avoid creation crashes later
InitializeTrayMenuWindow();
var iconPath = IconHelper.GetStatusIconPath(ConnectionStatus.Disconnected);
var iconPath = AppIconHelper.GetStatusIconPath(ConnectionStatus.Disconnected);
_trayIcon = new TrayIcon(1, iconPath, "OpenClaw Tray — Disconnected");
_lastTrayIconPath = iconPath;
_trayIcon.IsVisible = true;
_trayIcon.Selected += OnTrayIconSelected;
_trayIcon.ContextMenu += OnTrayContextMenu;
}
/// <summary>
/// Starts a repeating dispatcher-queue timer that calls <c>UpdateTrayIcon</c> every 250 ms.
/// Does nothing when there is no dispatcher queue or the timer has already been created.
/// </summary>
private void StartVoiceTrayIconTimer()
{
    if (_voiceTrayIconTimer != null)
    {
        return;
    }

    var queue = _dispatcherQueue;
    if (queue == null)
    {
        return;
    }

    var timer = queue.CreateTimer();
    _voiceTrayIconTimer = timer;
    timer.Interval = TimeSpan.FromMilliseconds(250);
    timer.IsRepeating = true;
    timer.Tick += (_, _) => UpdateTrayIcon();
    timer.Start();
}
private void InitializeTrayMenuWindow()
{
// Pre-create menu window once - reuse to avoid crash on window creation after idle
@ -535,6 +569,8 @@ public partial class App : Application
switch (action)
{
case "status": ShowStatusDetail(); break;
case "voice-settings": ShowVoiceModeSettings(); break;
case "voice-toggle-pause": _ = ToggleVoiceQuickPauseAsync(); break;
case "dashboard": OpenDashboard(); break;
case "webchat": ShowWebChat(); break;
case "quicksend": ShowQuickSend(); break;
@ -742,6 +778,60 @@ public partial class App : Application
.ToList();
}
/// <summary>
/// Returns the tray-menu label for the current voice runtime status,
/// or "Off" when no voice service or status is available.
/// </summary>
private string GetRunningVoiceModeLabel()
{
    return _voiceService?.CurrentStatus is { } currentStatus
        ? VoiceDisplayHelper.GetRuntimeLabel(currentStatus)
        : "Off";
}
/// <summary>
/// Indicates whether the quick pause/resume toggle may be used right now.
/// </summary>
/// <returns>
/// <c>false</c> when Node Mode is not enabled or there is no voice service;
/// <c>true</c> when the voice runtime is paused (so it can always be resumed);
/// otherwise <c>true</c> only when voice is enabled and its mode is not Off.
/// </returns>
private bool CanQuickToggleVoiceMode()
{
    if (_settings is not { EnableNodeMode: true } settings || _voiceService is not { } voice)
    {
        return false;
    }

    return voice.CurrentStatus.State == VoiceRuntimeState.Paused
        || (settings.Voice.Enabled && settings.Voice.Mode != VoiceActivationMode.Off);
}
/// <summary>
/// Indicates whether the voice repeater window should be opened at startup:
/// Node Mode on, voice enabled with a mode other than Off, and the
/// show-at-startup preference set.
/// </summary>
private bool ShouldShowVoiceRepeaterAtStartup()
{
    if (_settings?.EnableNodeMode != true)
    {
        return false;
    }

    var voice = _settings.Voice;
    return voice.Enabled
        && voice.Mode != VoiceActivationMode.Off
        && voice.ShowRepeaterAtStartup;
}
/// <summary>
/// Returns the caption for the quick-toggle menu entry: "Resume Voice" while the
/// voice runtime is paused, "Pause Voice" in every other case (including when no
/// voice service or status exists).
/// </summary>
private string GetVoiceQuickToggleLabel()
{
    var isPaused = _voiceService?.CurrentStatus?.State == VoiceRuntimeState.Paused;
    if (isPaused)
    {
        return "Resume Voice";
    }

    return "Pause Voice";
}
/// <summary>
/// Builds the one-line tray summary of the configured voice devices. A device that
/// has no stored id (or when voice settings are unavailable) is reported as
/// "system default".
/// </summary>
private string GetVoiceDeviceSummary()
{
    // A blank id means the user never picked a specific device.
    static string Describe(string? deviceId, string selectedLabel) =>
        string.IsNullOrWhiteSpace(deviceId) ? "system default" : selectedLabel;

    var voice = _settings?.Voice;
    var talk = Describe(voice?.OutputDeviceId, "selected speaker");
    var listen = Describe(voice?.InputDeviceId, "selected microphone");
    return $"Talk: {talk} · Listen: {listen}";
}
private void BuildTrayMenuPopup(TrayMenuWindow menu)
{
// Brand header
@ -758,6 +848,14 @@ public partial class App : Application
menu.AddMenuItem(_currentActivity.DisplayText, _currentActivity.Glyph, "", isEnabled: false);
}
menu.AddMenuItem($"Voice Mode: {GetRunningVoiceModeLabel()}", "🎙️", "voice-settings");
menu.AddMenuItem($"↳ {GetVoiceDeviceSummary()}", "", "", isEnabled: false, indent: true);
menu.AddMenuItem($"↳ {GetVoiceQuickToggleLabel()} (Ctrl+Alt+Shift+V)", "", "voice-toggle-pause", isEnabled: CanQuickToggleVoiceMode(), indent: true);
if (_settings?.EnableNodeMode != true)
{
menu.AddMenuItem("↳ Enable Node Mode to activate voice runtime", "", "", isEnabled: false, indent: true);
}
// Usage
if (_lastUsage != null || _lastUsageStatus != null || _lastUsageCost != null)
{
@ -1147,7 +1245,7 @@ public partial class App : Application
{
Logger.Info("Initializing Windows Node service...");
_nodeService = new NodeService(new AppLogger(), _dispatcherQueue, DataPath);
_nodeService = new NodeService(new AppLogger(), _dispatcherQueue, _voiceService!, DataPath);
_nodeService.StatusChanged += OnNodeStatusChanged;
_nodeService.NotificationRequested += OnNodeNotificationRequested;
_nodeService.PairingStatusChanged += OnPairingStatusChanged;
@ -1558,13 +1656,7 @@ public partial class App : Application
{
if (_trayIcon == null) return;
var status = _currentStatus;
if (_currentActivity != null && _currentActivity.Kind != OpenClaw.Shared.ActivityKind.Idle)
{
status = ConnectionStatus.Connecting; // Use connecting icon for activity
}
var iconPath = IconHelper.GetStatusIconPath(status);
var iconPath = GetTrayIconPathForCurrentState();
var tooltip = $"OpenClaw Tray — {_currentStatus}";
if (_currentActivity != null && !string.IsNullOrEmpty(_currentActivity.DisplayText))
@ -1576,7 +1668,11 @@ public partial class App : Application
try
{
_trayIcon.SetIcon(iconPath);
if (!string.Equals(_lastTrayIconPath, iconPath, StringComparison.OrdinalIgnoreCase))
{
_trayIcon.SetIcon(iconPath);
_lastTrayIconPath = iconPath;
}
_trayIcon.Tooltip = tooltip;
}
catch (Exception ex)
@ -1585,15 +1681,60 @@ public partial class App : Application
}
}
/// <summary>
/// Chooses the tray icon for the current application state.
/// </summary>
/// <returns>
/// The voice icon when the voice runtime maps to a non-Off icon state; the Off voice
/// icon when the voice runtime is paused; otherwise the connection-status icon, using
/// the Connecting icon while a non-idle activity is in progress.
/// </returns>
private string GetTrayIconPathForCurrentState()
{
    var voiceIcon = GetVoiceTrayIconState();

    // When paused, voiceIcon is Off here, so the Off voice icon is what gets returned.
    if (voiceIcon != VoiceTrayIconState.Off
        || _voiceService?.CurrentStatus.State == VoiceRuntimeState.Paused)
    {
        return VoiceTrayIconHelper.GetVoiceTrayIconPath(voiceIcon);
    }

    var busy = _currentActivity is { Kind: not OpenClaw.Shared.ActivityKind.Idle };
    return AppIconHelper.GetStatusIconPath(busy ? ConnectionStatus.Connecting : _currentStatus);
}
/// <summary>
/// Maps the voice runtime status onto the tray icon state.
/// </summary>
/// <returns>
/// Speaking while a response is playing; Listening while waiting for the wake word,
/// listening continuously, or recording an utterance; Off in every other case
/// (no service, no status, not running, paused, or any other state).
/// </returns>
private VoiceTrayIconState GetVoiceTrayIconState()
{
    if (_voiceService?.CurrentStatus is not { Running: true } voiceStatus)
    {
        return VoiceTrayIconState.Off;
    }

    switch (voiceStatus.State)
    {
        case VoiceRuntimeState.PlayingResponse:
            return VoiceTrayIconState.Speaking;

        case VoiceRuntimeState.ListeningForVoiceWake:
        case VoiceRuntimeState.ListeningContinuously:
        case VoiceRuntimeState.RecordingUtterance:
            return VoiceTrayIconState.Listening;

        default:
            // Paused and every remaining state show the Off icon.
            return VoiceTrayIconState.Off;
    }
}
#endregion
#region Window Management
private void ShowSettings()
{
if (_settings == null || _voiceService == null)
return;
if (_settingsWindow == null || _settingsWindow.IsClosed)
{
_settingsWindow = new SettingsWindow(_settings!);
_settingsWindow = new SettingsWindow(_settings, _voiceService);
_settingsWindow.Closed += (s, e) =>
{
_settingsWindow.SettingsSaved -= OnSettingsSaved;
@ -1604,40 +1745,143 @@ public partial class App : Application
_settingsWindow.Activate();
}
private void OnSettingsSaved(object? sender, EventArgs e)
/// <summary>
/// Shows the voice repeater window, creating it on first use or after the previous
/// instance was closed, then refreshes its status and brings it to the foreground.
/// Does nothing when settings or the voice service are not available.
/// </summary>
private void ShowVoiceModeSettings()
{
    if (_settings == null || _voiceService == null)
    {
        return;
    }

    if (_voiceRepeaterWindow == null || _voiceRepeaterWindow.IsClosed)
    {
        // Keep the new instance in a local: the Closed handler must clean up exactly
        // this window, not whatever the field happens to reference when it fires.
        var window = new VoiceRepeaterWindow(_settings, _voiceService);
        _voiceRepeaterWindow = window;
        window.OpenVoiceStatusRequested += OnOpenVoiceStatusRequested;
        window.Closed += (s, e) =>
        {
            _voiceChatCoordinator?.DetachWindow(window);
            window.OpenVoiceStatusRequested -= OnOpenVoiceStatusRequested;

            // Only forget the window if it has not already been replaced by a newer one.
            if (ReferenceEquals(_voiceRepeaterWindow, window))
            {
                _voiceRepeaterWindow = null;
            }
        };
        _voiceChatCoordinator?.AttachWindow(window);
    }

    _voiceRepeaterWindow.RefreshStatus();
    _voiceRepeaterWindow.Activate();
}
/// <summary>
/// Shows the voice status window, creating it on first use or after the previous
/// instance was closed, then refreshes its status and brings it to the foreground.
/// Does nothing when settings or the voice service are not available.
/// </summary>
private void ShowVoiceStatusWindow()
{
    if (_settings == null || _voiceService == null)
    {
        return;
    }

    if (_voiceModeWindow == null || _voiceModeWindow.IsClosed)
    {
        // The voice service is supplied for both service parameters of the window
        // (presumably it implements both interfaces the window expects).
        // Keep the instance in a local so the Closed handler unsubscribes from this
        // window rather than from whatever the field references when it fires.
        var window = new VoiceModeWindow(_settings, _voiceService, _voiceService);
        _voiceModeWindow = window;
        window.OpenSettingsRequested += OnVoiceModeOpenSettingsRequested;
        window.Closed += (s, e) =>
        {
            window.OpenSettingsRequested -= OnVoiceModeOpenSettingsRequested;

            // Only forget the window if it has not already been replaced by a newer one.
            if (ReferenceEquals(_voiceModeWindow, window))
            {
                _voiceModeWindow = null;
            }
        };
    }

    _voiceModeWindow.RefreshStatus();
    _voiceModeWindow.Activate();
}
/// <summary>
/// Handles the repeater window's request to open the voice status window.
/// </summary>
private void OnOpenVoiceStatusRequested(object? sender, EventArgs e) => ShowVoiceStatusWindow();
/// <summary>
/// Handles the voice status window's request to open the settings window.
/// </summary>
private void OnVoiceModeOpenSettingsRequested(object? sender, EventArgs e) => ShowSettings();
private async void OnSettingsSaved(object? sender, EventArgs e)
{
// Reconnect with new settings — mirror the startup if/else pattern
// to avoid dual connections that cause gateway conflicts.
UnsubscribeGatewayEvents();
_gatewayClient?.Dispose();
_gatewayClient = null;
var oldNodeService = _nodeService;
_nodeService = null;
try { oldNodeService?.Dispose(); } catch (Exception ex) { Logger.Warn($"Node dispose error: {ex.Message}"); }
if (_settings?.UseSshTunnel != true)
try
{
_sshTunnelService?.Stop();
}
if (_gatewayClient != null)
{
try
{
await _gatewayClient.DisconnectAsync();
}
catch (Exception ex)
{
Logger.Warn($"Gateway disconnect error: {ex.Message}");
}
// Reset status so the tray doesn't show a stale "Connected" from the previous mode
_currentStatus = ConnectionStatus.Disconnected;
UpdateTrayIcon();
if (_settings?.EnableNodeMode == true)
{
InitializeNodeService();
_gatewayClient.Dispose();
_gatewayClient = null;
}
var oldNodeService = _nodeService;
_nodeService = null;
if (oldNodeService != null)
{
try
{
await oldNodeService.DisconnectAsync();
}
catch (Exception ex)
{
Logger.Warn($"Node disconnect error: {ex.Message}");
}
try
{
oldNodeService.Dispose();
}
catch (Exception ex)
{
Logger.Warn($"Node dispose error: {ex.Message}");
}
}
if (_settings?.UseSshTunnel != true)
{
_sshTunnelService?.Stop();
}
// Reset status so the tray doesn't show a stale "Connected" from the previous mode
_currentStatus = ConnectionStatus.Disconnected;
UpdateTrayIcon();
if (_settings?.EnableNodeMode == true)
{
InitializeNodeService();
}
else
{
InitializeGatewayClient();
if (_voiceService != null)
{
await _voiceService.StopAsync(new VoiceStopArgs { Reason = "Node mode disabled" });
}
}
}
else
catch (Exception ex)
{
InitializeGatewayClient();
Logger.Warn($"Settings reconnect failed: {ex.Message}");
}
// Update global hotkey
if (_settings!.GlobalHotkeyEnabled)
{
_globalHotkey ??= new GlobalHotkeyService();
_globalHotkey.HotkeyPressed -= OnGlobalHotkeyPressed;
_globalHotkey.HotkeyPressed += OnGlobalHotkeyPressed;
_globalHotkey.QuickSendHotkeyPressed -= OnGlobalQuickSendHotkeyPressed;
_globalHotkey.QuickSendHotkeyPressed += OnGlobalQuickSendHotkeyPressed;
_globalHotkey.VoiceToggleHotkeyPressed -= OnGlobalVoiceToggleHotkeyPressed;
_globalHotkey.VoiceToggleHotkeyPressed += OnGlobalVoiceToggleHotkeyPressed;
_globalHotkey.Register();
}
else
@ -1645,6 +1889,9 @@ public partial class App : Application
_globalHotkey?.Unregister();
}
_voiceRepeaterWindow?.RefreshStatus();
_voiceModeWindow?.RefreshStatus();
// Update auto-start
AutoStartManager.SetAutoStart(_settings.AutoStart);
}
@ -1656,8 +1903,15 @@ public partial class App : Application
if (_webChatWindow == null || _webChatWindow.IsClosed)
{
_webChatWindow = new WebChatWindow(_settings.GetEffectiveGatewayUrl(), _settings.Token);
_webChatWindow.Closed += (s, e) => _webChatWindow = null;
_webChatWindow = new WebChatWindow(
_settings.GetEffectiveGatewayUrl(),
_settings.Token);
_webChatWindow.Closed += (s, e) =>
{
_voiceChatCoordinator?.DetachWindow(_webChatWindow);
_webChatWindow = null;
};
_voiceChatCoordinator?.AttachWindow(_webChatWindow);
}
_webChatWindow.Activate();
}
@ -1874,7 +2128,7 @@ public partial class App : Application
}
}
private void OnGlobalHotkeyPressed(object? sender, EventArgs e)
private void OnGlobalQuickSendHotkeyPressed(object? sender, EventArgs e)
{
// Hotkey events are raised from a dedicated Win32 message-loop thread.
// Creating/activating WinUI windows must happen on the app's UI thread.
@ -1891,6 +2145,137 @@ public partial class App : Application
}
}
/// <summary>
/// Handles the global voice-toggle hotkey by queueing the pause/resume toggle onto
/// the app's dispatcher queue. Logs a warning when the queue is missing or rejects
/// the work item.
/// </summary>
private void OnGlobalVoiceToggleHotkeyPressed(object? sender, EventArgs e)
{
    var queue = _dispatcherQueue;
    if (queue == null)
    {
        Logger.Warn("Voice hotkey pressed but DispatcherQueue is null");
        return;
    }

    // NOTE(review): presumably raised off the UI thread like the quick-send hotkey; confirm.
    if (queue.TryEnqueue(async () => await ToggleVoiceQuickPauseAsync()))
    {
        return;
    }

    Logger.Warn("Voice hotkey pressed but failed to enqueue Voice quick pause on UI thread");
}
/// <summary>
/// Toggles the voice runtime between paused and active, refreshes the open voice
/// windows, and shows a toast describing the new state.
/// </summary>
/// <remarks>
/// The toggle is skipped (with a warning, except when no voice service exists) if
/// Node Mode is disabled or <c>CanQuickToggleVoiceMode</c> reports that toggling is
/// not allowed. Failures of the toggle itself are logged rather than propagated.
/// </remarks>
private async Task ToggleVoiceQuickPauseAsync()
{
    var voice = _voiceService;
    if (voice == null)
    {
        return;
    }

    // Checked separately from CanQuickToggleVoiceMode so the log names the actual cause.
    if (_settings?.EnableNodeMode != true)
    {
        Logger.Warn("Voice quick pause blocked: Node Mode is disabled");
        return;
    }

    if (!CanQuickToggleVoiceMode())
    {
        Logger.Warn("Voice quick pause blocked: Voice Mode is off");
        return;
    }

    try
    {
        var newStatus = await voice.ToggleQuickPauseAsync();
        _voiceRepeaterWindow?.RefreshStatus();
        _voiceModeWindow?.RefreshStatus();
        ShowVoiceQuickToggleToast(newStatus);
    }
    catch (Exception ex)
    {
        Logger.Warn($"Voice quick pause failed: {ex.Message}");
    }
}
/// <summary>
/// Shows a toast announcing that voice was paused or resumed.
/// </summary>
/// <param name="status">Voice status returned by the toggle; its state selects the wording.</param>
private static void ShowVoiceQuickToggleToast(VoiceStatusInfo status)
{
    try
    {
        string title;
        string detail;
        if (status.State == VoiceRuntimeState.Paused)
        {
            title = "Voice paused";
            detail = $"{status.Mode} is paused. Press Ctrl+Alt+Shift+V to resume.";
        }
        else
        {
            title = "Voice resumed";
            detail = $"{status.Mode} is active again.";
        }

        var toast = new ToastContentBuilder();
        toast.AddText(title).AddText(detail).Show();
    }
    catch (Exception ex)
    {
        // A failed toast is logged and otherwise ignored.
        Logger.Warn($"Failed to show voice pause toast: {ex.Message}");
    }
}
/// <summary>
/// Forwards a voice conversation turn to <c>ShowVoiceConversationToast</c> via the
/// dispatcher queue. The turn is dropped when no dispatcher queue exists.
/// </summary>
private void OnVoiceConversationTurnAvailable(object? sender, VoiceConversationTurnEventArgs args)
{
    _dispatcherQueue?.TryEnqueue(() => ShowVoiceConversationToast(args));
}
/// <summary>
/// Records a voice conversation turn in recent activity and notification history and,
/// when notifications are enabled, shows it as a toast.
/// </summary>
/// <param name="args">The turn to surface: direction, message text, and session key.</param>
/// <remarks>
/// Nothing is recorded unless the voice "show conversation toasts" setting is on.
/// Incoming turns get an "Open Chat" action on the toast.
/// </remarks>
private void ShowVoiceConversationToast(VoiceConversationTurnEventArgs args)
{
    if (_settings is not { } settings || settings.Voice.ShowConversationToasts != true)
    {
        return;
    }

    string title;
    if (args.Direction == VoiceConversationDirection.Outgoing)
    {
        title = "Voice heard";
    }
    else
    {
        title = "Voice reply";
    }

    AddRecentActivity(
        $"voice: {title}",
        category: "voice",
        details: args.Message,
        dashboardPath: "chat",
        sessionKey: args.SessionKey);

    var historyEntry = new Services.GatewayNotification
    {
        Title = title,
        Message = args.Message,
        Category = "voice"
    };
    NotificationHistoryService.AddNotification(historyEntry);

    // History is always recorded; the toast itself honors the global notification switch.
    if (settings.ShowNotifications != true)
    {
        return;
    }

    try
    {
        var toast = new ToastContentBuilder()
            .AddText(title)
            .AddText(args.Message);

        if (args.Direction == VoiceConversationDirection.Incoming)
        {
            var openChatButton = new ToastButton()
                .SetContent("Open Chat")
                .AddArgument("action", "open_chat");
            toast.AddArgument("action", "open_chat")
                .AddButton(openChatButton);
        }

        toast.Show();
    }
    catch (Exception ex)
    {
        Logger.Warn($"Failed to show voice conversation toast: {ex.Message}");
    }
}
#endregion
#region Updates
@ -2125,7 +2510,11 @@ public partial class App : Application
_sessionPollTimer?.Dispose();
_sessionPollTimer = null;
});
SafeShutdownStep("voice tray icon timer", () =>
{
_voiceTrayIconTimer?.Stop();
_voiceTrayIconTimer = null;
});
// Cleanup hotkey
SafeShutdownStep("global hotkey", () =>
{
@ -2191,6 +2580,22 @@ public partial class App : Application
_deepLinkCts = null;
});
SafeShutdownStep("voice chat coordinator", () =>
{
if (_voiceChatCoordinator != null)
{
_voiceChatCoordinator.ConversationTurnAvailable -= OnVoiceConversationTurnAvailable;
_voiceChatCoordinator.Dispose();
_voiceChatCoordinator = null;
}
});
SafeShutdownStep("voice service", () =>
{
_voiceService?.Dispose();
_voiceService = null;
});
Logger.Info("Shutdown complete; calling Exit() now");
Exit();
}
@ -2262,7 +2667,6 @@ public partial class App : Application
return true;
}
#endregion
private async void OnSshTunnelExited(object? sender, int exitCode)

Binary file not shown.

After

Width:  |  Height:  |  Size: 1.0 KiB

View File

@ -0,0 +1,274 @@
{
"speechToTextProviders": [
{
"id": "windows",
"name": "Windows Speech Recognition",
"runtime": "windows",
"enabled": true,
"description": "Built-in Windows.Media speech recognition, half-duplex, non-streamed."
},
{
"id": "http-ws",
"name": "http/ws",
"runtime": "streaming",
"enabled": false,
"visibleInSettings": true,
"selectable": false,
"description": "Will support most cloud and local stand-alone models, full- or half-duplex, streaming."
},
{
"id": "foundry-local",
"name": "Foundry Local",
"runtime": "streaming",
"enabled": false,
"visibleInSettings": false,
"selectable": false,
"description": "AudioGraph-fed streaming STT route for Foundry Local or compatible streaming adapters.",
"settings": [
{
"key": "endpoint",
"label": "Endpoint",
"required": false,
"defaultValue": "http://localhost:5273",
"placeholder": "http://localhost:5273",
"description": "Local Foundry-compatible transcription endpoint for the AudioGraph streaming STT route."
},
{
"key": "model",
"label": "Model",
"required": false,
"defaultValue": "whisper-tiny",
"placeholder": "whisper-tiny",
"description": "Transcription model identifier for the streaming STT adapter."
}
]
},
{
"id": "openai-whisper",
"name": "OpenAI Whisper",
"runtime": "streaming",
"enabled": false,
"visibleInSettings": false,
"selectable": false,
"description": "AudioGraph-fed cloud STT route for the OpenAI Whisper transcription API.",
"settings": [
{
"key": "apiKey",
"label": "API key",
"secret": true
},
{
"key": "model",
"label": "Model",
"required": false,
"defaultValue": "whisper-1",
"placeholder": "whisper-1",
"description": "Transcription model identifier for the OpenAI speech-to-text adapter."
}
]
},
{
"id": "elevenlabs-stt",
"name": "ElevenLabs Speech to Text",
"runtime": "streaming",
"enabled": false,
"visibleInSettings": false,
"selectable": false,
"description": "AudioGraph-fed cloud STT route for the ElevenLabs speech-to-text API.",
"settings": [
{
"key": "apiKey",
"label": "API key",
"secret": true
},
{
"key": "model",
"label": "Model",
"required": false,
"defaultValue": "scribe_v1",
"placeholder": "scribe_v1",
"description": "Transcription model identifier for the ElevenLabs speech-to-text adapter."
}
]
},
{
"id": "azure-ai-speech",
"name": "Azure AI Speech",
"runtime": "streaming",
"enabled": false,
"visibleInSettings": false,
"selectable": false,
"description": "AudioGraph-fed cloud STT route for Azure AI Speech real-time transcription.",
"settings": [
{
"key": "apiKey",
"label": "API key",
"secret": true
},
{
"key": "endpoint",
"label": "Endpoint",
"required": false,
"defaultValue": "",
"placeholder": "https://your-speech-resource.cognitiveservices.azure.com",
"description": "Azure AI Speech endpoint for the streaming STT adapter."
}
]
},
{
"id": "sherpa-onnx",
"name": "sherpa-onnx",
"runtime": "embedded",
"enabled": false,
"visibleInSettings": true,
"selectable": false,
"description": "Can load a variety of models including OpenAI/Whisper, full-duplex, streaming.",
"settings": [
{
"key": "modelPath",
"label": "Model path",
"required": false,
"defaultValue": "",
"placeholder": "C:\\models\\sherpa-onnx\\model.onnx",
"description": "Path to the downloaded sherpa-onnx model bundle the embedded STT route should use."
},
{
"key": "model",
"label": "Model preset",
"required": false,
"defaultValue": "",
"placeholder": "tiny / base / small / medium",
"description": "Optional human-readable model preset to help track which local bundle is selected."
}
]
}
],
"textToSpeechProviders": [
{
"id": "windows",
"name": "Windows Speech Synthesis",
"runtime": "windows",
"enabled": true,
"description": "Built-in Windows text-to-speech playback."
},
{
"id": "minimax",
"name": "MiniMax",
"runtime": "cloud",
"enabled": true,
"description": "Cloud TTS using the MiniMax HTTP text-to-speech API.",
"settings": [
{
"key": "apiKey",
"label": "API key",
"secret": true
},
{
"key": "model",
"label": "Model",
"defaultValue": "speech-2.8-turbo",
"options": [
"speech-2.5-turbo-preview",
"speech-02-turbo",
"speech-02-hd",
"speech-2.6-turbo",
"speech-2.6-hd",
"speech-2.8-turbo",
"speech-2.8-hd"
]
},
{
"key": "voiceId",
"label": "Voice ID",
"required": false,
"defaultValue": "English_MatureBoss"
},
{
"key": "voiceSettingsJson",
"label": "Voice settings JSON",
"required": false,
"jsonValue": true,
"defaultValue": "\"voice_setting\": { \"voice_id\": {{voiceId}}, \"speed\": 1, \"vol\": 1, \"pitch\": 0 }",
"placeholder": "\"voice_setting\": { \"voice_id\": \"English_MatureBoss\", \"speed\": 1, \"vol\": 1, \"pitch\": 0 }",
"description": "Optional full MiniMax request fragment. If present, it controls the full voice_setting payload."
}
],
"textToSpeechWebSocket": {
"endpointTemplate": "wss://api.minimax.io/ws/v1/t2a_v2",
"authenticationHeaderName": "Authorization",
"authenticationScheme": "Bearer",
"apiKeySettingKey": "apiKey",
"connectSuccessEventName": "connected_success",
"startMessageTemplate": "{ \"event\": \"task_start\", \"model\": {{model}}, \"language_boost\": \"English\", {{voiceSettingsJson}}, \"audio_setting\": { \"sample_rate\": 32000, \"bitrate\": 128000, \"format\": \"mp3\", \"channel\": 1 } }",
"startSuccessEventName": "task_started",
"continueMessageTemplate": "{ \"event\": \"task_continue\", \"text\": {{text}} }",
"finishMessageTemplate": "{ \"event\": \"task_finish\" }",
"responseAudioMode": "hexJsonString",
"responseAudioJsonPath": "data.audio",
"responseStatusCodeJsonPath": "base_resp.status_code",
"responseStatusMessageJsonPath": "base_resp.status_msg",
"finalFlagJsonPath": "is_final",
"taskFailedEventName": "task_failed",
"successStatusValue": "0",
"outputContentType": "audio/mpeg"
}
},
{
"id": "elevenlabs",
"name": "ElevenLabs",
"runtime": "cloud",
"enabled": true,
"description": "Cloud TTS using the ElevenLabs WebSocket stream-input API.",
"settings": [
{
"key": "apiKey",
"label": "API key",
"secret": true
},
{
"key": "model",
"label": "Model",
"defaultValue": "eleven_multilingual_v2",
"options": [
"eleven_flash_v2_5",
"eleven_turbo_v2_5",
"eleven_multilingual_v2",
"eleven_monolingual_v1"
]
},
{
"key": "voiceId",
"label": "Voice ID",
"required": false,
"defaultValue": "6aDn1KB0hjpdcocrUkmq",
"placeholder": "Enter an ElevenLabs voice ID"
},
{
"key": "voiceSettingsJson",
"label": "Voice settings JSON",
"required": false,
"jsonValue": true,
"defaultValue": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }",
"placeholder": "\"voice_settings\": { \"speed\": 0.9, \"stability\": 0.5, \"similarity_boost\": 0.75 }",
"description": "Optional full ElevenLabs request fragment. If present, it controls the full voice_settings payload."
}
],
"textToSpeechWebSocket": {
"endpointTemplate": "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true",
"authenticationHeaderName": "xi-api-key",
"authenticationScheme": "",
"apiKeySettingKey": "apiKey",
"connectSuccessEventName": "",
"startMessageTemplate": "{ \"text\": \" \", {{voiceSettingsJson}}, \"xi_api_key\": {{apiKey}} }",
"startSuccessEventName": "",
"continueMessageTemplate": "{ \"text\": {{textWithTrailingSpace}}, \"try_trigger_generation\": true }",
"finishMessageTemplate": "{ \"text\": \"\" }",
"responseAudioMode": "base64JsonString",
"responseAudioJsonPath": "audio",
"finalFlagJsonPath": "isFinal",
"taskFailedEventName": "error",
"outputContentType": "audio/mpeg"
}
}
]
}

View File

@ -0,0 +1,111 @@
<?xml version="1.0" encoding="utf-8"?>
<UserControl
x:Class="OpenClawTray.Controls.VoiceSettingsPanel"
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml">
<!-- Voice section of the settings UI: mode, STT/TTS provider selection, provider-specific
     TTS settings, audio device selection and conversation-toast preference.
     Event handlers named here are implemented in the code-behind class. -->
<UserControl.Resources>
<!-- Item template shared by both provider combo boxes; binds each option's
     DisplayName and DisplayOpacity. -->
<DataTemplate x:Key="VoiceProviderOptionTemplate">
<TextBlock Text="{Binding DisplayName}" Opacity="{Binding DisplayOpacity}"/>
</DataTemplate>
</UserControl.Resources>
<StackPanel Spacing="8">
<!-- Section caption. -->
<TextBlock Text="VOICE" Style="{StaticResource CaptionTextBlockStyle}"
Foreground="#E74C3C" FontWeight="Bold"/>
<!-- Voice mode selector (left) with its description text (right). -->
<Grid ColumnSpacing="12">
<Grid.ColumnDefinitions>
<ColumnDefinition Width="2*"/>
<ColumnDefinition Width="3*"/>
</Grid.ColumnDefinitions>
<!-- Each item's Tag carries the mode identifier; the Voice Wake entry is listed
     but disabled, so it cannot be selected by the user. -->
<ComboBox x:Name="VoiceModeComboBox" Header="Mode" SelectionChanged="OnVoiceModeChanged">
<ComboBoxItem Content="Off" Tag="Off"/>
<ComboBoxItem Content="Voice Wake" Tag="VoiceWake" IsEnabled="False"/>
<ComboBoxItem Content="Talk Mode" Tag="TalkMode"/>
</ComboBox>
<TextBlock x:Name="VoiceModeDescriptionTextBlock"
Grid.Column="1"
VerticalAlignment="Center"
Style="{StaticResource CaptionTextBlockStyle}"
Foreground="{ThemeResource TextFillColorSecondaryBrush}"
TextWrapping="Wrap"/>
</Grid>
<CheckBox x:Name="VoiceShowRepeaterAtStartupCheckBox"
Content="Show VoiceMode repeater form at startup"/>
<!-- Speech-to-text provider selector (left) with its description text (right). -->
<Grid ColumnSpacing="12">
<Grid.ColumnDefinitions>
<ColumnDefinition Width="2*"/>
<ColumnDefinition Width="3*"/>
</Grid.ColumnDefinitions>
<ComboBox x:Name="VoiceSpeechToTextProviderComboBox"
Header="Speech to text provider"
ItemTemplate="{StaticResource VoiceProviderOptionTemplate}"
SelectionChanged="OnVoiceProviderChanged"/>
<TextBlock x:Name="VoiceSpeechToTextProviderDescriptionTextBlock"
Grid.Column="1"
VerticalAlignment="Center"
Style="{StaticResource CaptionTextBlockStyle}"
Foreground="{ThemeResource TextFillColorSecondaryBrush}"
TextWrapping="Wrap"/>
</Grid>
<!-- Text-to-speech provider selector (left) with its description text (right).
     Shares the OnVoiceProviderChanged handler with the speech-to-text selector. -->
<Grid ColumnSpacing="12">
<Grid.ColumnDefinitions>
<ColumnDefinition Width="2*"/>
<ColumnDefinition Width="3*"/>
</Grid.ColumnDefinitions>
<ComboBox x:Name="VoiceTextToSpeechProviderComboBox"
Header="Text to speech provider"
ItemTemplate="{StaticResource VoiceProviderOptionTemplate}"
SelectionChanged="OnVoiceProviderChanged"/>
<TextBlock x:Name="VoiceTextToSpeechProviderDescriptionTextBlock"
Grid.Column="1"
VerticalAlignment="Center"
Style="{StaticResource CaptionTextBlockStyle}"
Foreground="{ThemeResource TextFillColorSecondaryBrush}"
TextWrapping="Wrap"/>
</Grid>
<!-- Provider-specific TTS settings. Collapsed by default; presumably made visible by the
     code-behind when the selected provider declares settings (confirm in code-behind).
     All inputs report edits through OnVoiceProviderSettingsChanged. -->
<StackPanel x:Name="VoiceTtsProviderSettingsPanel" Spacing="8" Visibility="Collapsed">
<TextBlock x:Name="VoiceTtsProviderSettingsTitleTextBlock"
Style="{StaticResource CaptionTextBlockStyle}"
Foreground="#E74C3C"
FontWeight="Bold"/>
<PasswordBox x:Name="VoiceTtsApiKeyPasswordBox"
Header="API key"
PasswordChanged="OnVoiceProviderSettingsChanged"/>
<!-- Two alternative "Model" inputs, both collapsed by default: a combo box and a
     free-text box (presumably chosen by whether the provider lists model options). -->
<ComboBox x:Name="VoiceTtsModelComboBox"
Header="Model"
Visibility="Collapsed"
SelectionChanged="OnVoiceProviderSettingsChanged"/>
<TextBox x:Name="VoiceTtsModelTextBox"
Header="Model"
Visibility="Collapsed"
TextChanged="OnVoiceProviderSettingsChanged"/>
<TextBox x:Name="VoiceTtsVoiceIdTextBox"
Header="Voice ID"
TextChanged="OnVoiceProviderSettingsChanged"/>
<!-- Multi-line editor for the voice settings JSON fragment; collapsed by default. -->
<TextBox x:Name="VoiceTtsVoiceSettingsJsonTextBox"
Header="Voice settings JSON"
Visibility="Collapsed"
AcceptsReturn="True"
TextWrapping="Wrap"
MinHeight="96"
TextChanged="OnVoiceProviderSettingsChanged"/>
</StackPanel>
<!-- Audio device pickers; items are displayed by their Name property. -->
<ComboBox x:Name="VoiceInputDeviceComboBox" Header="Listen device (microphone)" DisplayMemberPath="Name"/>
<ComboBox x:Name="VoiceOutputDeviceComboBox" Header="Talk device (speaker)" DisplayMemberPath="Name"/>
<Button Content="Refresh voice devices" HorizontalAlignment="Left" Click="OnRefreshVoiceDevices"/>
<CheckBox x:Name="VoiceConversationToastsCheckBox"
Content="Show voice transcripts and replies as toasts"/>
<!-- Free-form informational text filled in by the code-behind. -->
<TextBlock x:Name="VoiceSettingsInfoTextBlock"
Style="{StaticResource CaptionTextBlockStyle}"
Foreground="{ThemeResource TextFillColorSecondaryBrush}"
TextWrapping="Wrap"/>
</StackPanel>
</UserControl>

View File

@ -0,0 +1,574 @@
using Microsoft.UI.Xaml;
using Microsoft.UI.Xaml.Controls;
using OpenClaw.Shared;
using OpenClawTray.Services;
using OpenClawTray.Services.Voice;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Threading.Tasks;
namespace OpenClawTray.Controls;
public sealed partial class VoiceSettingsPanel : UserControl
{
private SettingsManager? _settings;
private IVoiceConfigurationApi? _voiceConfigurationApi;
private VoiceProviderConfigurationStore _voiceProviderConfigurationDraft = new();
private string _activeSttProviderId = VoiceProviderIds.Windows;
private string _activeTtsProviderId = VoiceProviderIds.Windows;
private bool _updatingVoiceProviderFields;
private List<VoiceProviderOption> _speechToTextOptions = new();
private List<VoiceProviderOption> _textToSpeechOptions = new();
private List<DeviceOption> _inputOptions = new();
private List<DeviceOption> _outputOptions = new();
private List<string> _activeTtsModelOptions = new();
/// <summary>
/// Creates the panel and loads its XAML-defined content. Call <c>Initialize</c>
/// afterwards to bind it to settings and the voice configuration API.
/// </summary>
public VoiceSettingsPanel() => InitializeComponent();
/// <summary>
/// Binds the panel to its data sources, loads the voice settings into the controls,
/// and starts loading the audio device lists.
/// </summary>
/// <param name="settings">Settings manager the panel reads its initial values from.</param>
/// <param name="voiceConfigurationApi">Voice configuration API used by the panel.</param>
public void Initialize(SettingsManager settings, IVoiceConfigurationApi voiceConfigurationApi)
{
    (_settings, _voiceConfigurationApi) = (settings, voiceConfigurationApi);

    LoadVoiceSettings();

    // Device loading is started without awaiting; the returned task is discarded.
    _ = LoadVoiceDevicesAsync();
}
/// <summary>
/// Writes the panel's current selections into <paramref name="settings"/> and pushes
/// them to the voice configuration API (without asking it to persist).
/// </summary>
/// <param name="settings">Settings manager that receives the new voice settings and provider configuration.</param>
/// <returns>A task that completes once the voice configuration API has accepted the update.</returns>
/// <exception cref="ArgumentNullException"><paramref name="settings"/> is null.</exception>
/// <remarks>
/// Only the values editable in this panel are taken from the controls. Sample rate,
/// capture chunk size, barge-in, and the voice-wake and talk-mode tuning values are
/// copied unchanged from the existing settings.
/// </remarks>
public async Task ApplyAsync(SettingsManager settings)
{
    ArgumentNullException.ThrowIfNull(settings);

    // Fold any in-progress provider field edits into the draft before snapshotting it.
    CaptureSelectedVoiceProviderSettings();

    // Read the selected mode once so Mode, Enabled and ShowRepeaterAtStartup always agree.
    var mode = GetSelectedVoiceMode();
    var voiceActive = mode != VoiceActivationMode.Off;
    var current = settings.Voice;

    var voiceSettings = new VoiceSettings
    {
        Mode = mode,
        Enabled = voiceActive,
        // The repeater is never shown at startup while voice is off.
        ShowRepeaterAtStartup = voiceActive && (VoiceShowRepeaterAtStartupCheckBox.IsChecked ?? true),
        ShowConversationToasts = VoiceConversationToastsCheckBox.IsChecked ?? false,
        SpeechToTextProviderId = (VoiceSpeechToTextProviderComboBox.SelectedItem as VoiceProviderOption)?.Id ?? VoiceProviderIds.Windows,
        TextToSpeechProviderId = (VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption)?.Id ?? VoiceProviderIds.Windows,
        InputDeviceId = (VoiceInputDeviceComboBox.SelectedItem as DeviceOption)?.DeviceId,
        OutputDeviceId = (VoiceOutputDeviceComboBox.SelectedItem as DeviceOption)?.DeviceId,
        SampleRateHz = current.SampleRateHz,
        CaptureChunkMs = current.CaptureChunkMs,
        BargeInEnabled = current.BargeInEnabled,
        VoiceWake = new VoiceWakeSettings
        {
            Engine = current.VoiceWake.Engine,
            ModelId = current.VoiceWake.ModelId,
            TriggerThreshold = current.VoiceWake.TriggerThreshold,
            TriggerCooldownMs = current.VoiceWake.TriggerCooldownMs,
            PreRollMs = current.VoiceWake.PreRollMs,
            EndSilenceMs = current.VoiceWake.EndSilenceMs
        },
        TalkMode = new TalkModeSettings
        {
            MinSpeechMs = current.TalkMode.MinSpeechMs,
            EndSilenceMs = current.TalkMode.EndSilenceMs,
            MaxUtteranceMs = current.TalkMode.MaxUtteranceMs
        }
    };

    settings.Voice = voiceSettings;
    settings.VoiceProviderConfiguration = _voiceProviderConfigurationDraft.Clone();

    if (_voiceConfigurationApi != null)
    {
        // Hand the API its own snapshot: the draft keeps being edited by this panel and
        // must not alias the configuration the voice runtime is using.
        _voiceConfigurationApi.SetProviderConfiguration(_voiceProviderConfigurationDraft.Clone());
        await _voiceConfigurationApi.UpdateSettingsAsync(new VoiceSettingsUpdateArgs
        {
            Settings = voiceSettings,
            Persist = false
        });
    }
}
/// <summary>
/// Populates every control from the bound settings. Does nothing until
/// both the settings and the voice configuration API have been supplied.
/// </summary>
private void LoadVoiceSettings()
{
    if (_settings is null || _voiceConfigurationApi is null)
    {
        return;
    }

    // Edits go to a private copy so nothing is committed before ApplyAsync.
    _voiceProviderConfigurationDraft = _settings.VoiceProviderConfiguration.Clone();

    LoadVoiceProviders();
    SelectVoiceMode(_settings.Voice.Mode);
    UpdateVoiceSelectionDescriptions();

    // The repeater option is forced off while voice mode is Off.
    if (_settings.Voice.Mode == VoiceActivationMode.Off)
    {
        VoiceShowRepeaterAtStartupCheckBox.IsChecked = false;
    }
    else
    {
        VoiceShowRepeaterAtStartupCheckBox.IsChecked = _settings.Voice.ShowRepeaterAtStartup;
    }

    VoiceConversationToastsCheckBox.IsChecked = _settings.Voice.ShowConversationToasts;

    UpdateVoiceProviderSettingsEditor();
    UpdateVoiceSettingsInfo();
}
/// <summary>
/// Fills both provider combo boxes from the API's provider catalog and selects the
/// providers stored in settings, falling back to the first catalog entry.
/// </summary>
private void LoadVoiceProviders()
{
    var providerCatalog = _voiceConfigurationApi!.GetProviderCatalog();

    // Work on copies so the panel never shares option instances with the catalog.
    _speechToTextOptions = providerCatalog.SpeechToTextProviders.Select(Clone).ToList();
    _textToSpeechOptions = providerCatalog.TextToSpeechProviders.Select(Clone).ToList();

    VoiceSpeechToTextProviderComboBox.ItemsSource = _speechToTextOptions;
    VoiceTextToSpeechProviderComboBox.ItemsSource = _textToSpeechOptions;

    var savedStt = _speechToTextOptions.Find(p => p.Id == _settings!.Voice.SpeechToTextProviderId);
    VoiceSpeechToTextProviderComboBox.SelectedItem = savedStt ?? _speechToTextOptions.FirstOrDefault();

    var savedTts = _textToSpeechOptions.Find(p => p.Id == _settings!.Voice.TextToSpeechProviderId);
    VoiceTextToSpeechProviderComboBox.SelectedItem = savedTts ?? _textToSpeechOptions.FirstOrDefault();

    // Replace any non-selectable choice with a selectable one; the result flag is not needed here.
    _ = EnsureSelectableProviderSelection(VoiceSpeechToTextProviderComboBox, _speechToTextOptions, ref _activeSttProviderId);
    _ = EnsureSelectableProviderSelection(VoiceTextToSpeechProviderComboBox, _textToSpeechOptions, ref _activeTtsProviderId);

    UpdateVoiceSelectionDescriptions();
    UpdateDeviceSelectionAvailability();
}
/// <summary>
/// Queries the API for audio devices and rebuilds the microphone and speaker lists,
/// each headed by a "system default" entry. Any failure is shown in the info text block.
/// </summary>
private async Task LoadVoiceDevicesAsync()
{
    if (_settings is null || _voiceConfigurationApi is null)
    {
        return;
    }

    try
    {
        VoiceSettingsInfoTextBlock.Text = "Loading voice devices...";
        var devices = await _voiceConfigurationApi.ListDevicesAsync();

        _inputOptions = [new DeviceOption(null, "System default microphone")];
        foreach (var device in devices)
        {
            if (device.IsInput)
            {
                _inputOptions.Add(new DeviceOption(device.DeviceId, device.Name));
            }
        }

        _outputOptions = [new DeviceOption(null, "System default speaker")];
        foreach (var device in devices)
        {
            if (device.IsOutput)
            {
                _outputOptions.Add(new DeviceOption(device.DeviceId, device.Name));
            }
        }

        VoiceInputDeviceComboBox.ItemsSource = _inputOptions;
        VoiceOutputDeviceComboBox.ItemsSource = _outputOptions;

        // Restore the saved device when it is still present; otherwise use the default entry.
        var savedInput = _inputOptions.FirstOrDefault(o => o.DeviceId == _settings.Voice.InputDeviceId);
        VoiceInputDeviceComboBox.SelectedItem = savedInput ?? _inputOptions[0];

        var savedOutput = _outputOptions.FirstOrDefault(o => o.DeviceId == _settings.Voice.OutputDeviceId);
        VoiceOutputDeviceComboBox.SelectedItem = savedOutput ?? _outputOptions[0];

        UpdateDeviceSelectionAvailability();
        UpdateVoiceSettingsInfo();
    }
    catch (Exception ex)
    {
        VoiceSettingsInfoTextBlock.Text = $"Failed to load voice devices: {ex.Message}";
    }
}
/// <summary>
/// Selects the mode combo box item whose tag matches <paramref name="mode"/>;
/// when no item matches, the first item is selected.
/// </summary>
/// <param name="mode">Activation mode to show as selected.</param>
private void SelectVoiceMode(VoiceActivationMode mode)
{
    string wantedTag;
    if (mode == VoiceActivationMode.VoiceWake)
    {
        wantedTag = "VoiceWake";
    }
    else if (mode == VoiceActivationMode.TalkMode)
    {
        wantedTag = "TalkMode";
    }
    else
    {
        wantedTag = "Off";
    }

    var match = VoiceModeComboBox.Items
        .OfType<ComboBoxItem>()
        .FirstOrDefault(candidate => string.Equals(candidate.Tag?.ToString(), wantedTag, StringComparison.Ordinal));

    if (match is null)
    {
        VoiceModeComboBox.SelectedIndex = 0;
        return;
    }

    VoiceModeComboBox.SelectedItem = match;
}
/// <summary>
/// Maps the selected mode combo box item's tag to an activation mode.
/// </summary>
/// <returns>The matching mode, or <c>Off</c> when nothing recognisable is selected.</returns>
private VoiceActivationMode GetSelectedVoiceMode()
{
    if (VoiceModeComboBox.SelectedItem is not ComboBoxItem selected)
    {
        return VoiceActivationMode.Off;
    }

    var tag = selected.Tag?.ToString();
    if (tag == "VoiceWake")
    {
        return VoiceActivationMode.VoiceWake;
    }

    return tag == "TalkMode" ? VoiceActivationMode.TalkMode : VoiceActivationMode.Off;
}
/// <summary>
/// Refreshes the description text shown for the selected mode and the selected STT/TTS providers.
/// </summary>
private void UpdateVoiceSelectionDescriptions()
{
    var sttProvider = VoiceSpeechToTextProviderComboBox.SelectedItem as VoiceProviderOption;
    var ttsProvider = VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption;

    VoiceModeDescriptionTextBlock.Text = GetVoiceModeDescription(GetSelectedVoiceMode());
    VoiceSpeechToTextProviderDescriptionTextBlock.Text = sttProvider?.Description ?? string.Empty;
    VoiceTextToSpeechProviderDescriptionTextBlock.Text = ttsProvider?.Description ?? string.Empty;
}
/// <summary>
/// Returns the explanatory text displayed for an activation mode.
/// </summary>
/// <param name="mode">Mode to describe.</param>
/// <returns>A user-facing sentence; any mode other than TalkMode or VoiceWake gets the "off" text.</returns>
private static string GetVoiceModeDescription(VoiceActivationMode mode)
{
    if (mode == VoiceActivationMode.TalkMode)
    {
        return "Continuous conversation mode. Listen after replies and send each completed utterance as a chat turn.";
    }

    if (mode == VoiceActivationMode.VoiceWake)
    {
        return "Wake-word mode. Stays idle until the hotword is detected, then starts listening for a request.";
    }

    return "Voice features stay off until you start them manually.";
}
/// <summary>
/// Rebuilds the one-line summary of mode, providers and devices, appending a notice
/// for each selected provider the catalog service reports as lacking runtime support.
/// </summary>
private void UpdateVoiceSettingsInfo()
{
    var sttOption = VoiceSpeechToTextProviderComboBox.SelectedItem as VoiceProviderOption;
    var ttsOption = VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption;
    var inputOption = VoiceInputDeviceComboBox.SelectedItem as DeviceOption;
    var outputOption = VoiceOutputDeviceComboBox.SelectedItem as DeviceOption;

    var stt = sttOption?.Name ?? "Windows Speech Recognition";
    var tts = ttsOption?.Name ?? "Windows Speech Synthesis";
    var input = inputOption?.Name ?? "System default microphone";
    var output = outputOption?.Name ?? "System default speaker";

    var sttNotice = sttOption is not null && !VoiceProviderCatalogService.SupportsSpeechToTextRuntime(sttOption.Id)
        ? " Selected non-Windows STT routes are scaffolded but not implemented yet."
        : string.Empty;
    var ttsNotice = ttsOption is not null && !VoiceProviderCatalogService.SupportsTextToSpeechRuntime(ttsOption.Id)
        ? " Unsupported TTS providers will fall back to Windows until their runtime adapters are added."
        : string.Empty;
    var fallbackNotice = sttNotice + ttsNotice;

    VoiceSettingsInfoTextBlock.Text =
        $"Mode: {VoiceDisplayHelper.GetModeLabel(GetSelectedVoiceMode())}. STT: {stt}. TTS: {tts}. Listen: {input}. Talk: {output}.{fallbackNotice}";
}
/// <summary>
/// Enables the device pickers only when the selected STT provider is not the Windows one.
/// With the Windows provider selected, both pickers are reset to their first entry and disabled.
/// </summary>
private void UpdateDeviceSelectionAvailability()
{
    var sttProviderId = (VoiceSpeechToTextProviderComboBox.SelectedItem as VoiceProviderOption)?.Id;
    var usesWindowsStt = string.Equals(sttProviderId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase);

    if (usesWindowsStt && _inputOptions.Count > 0)
    {
        VoiceInputDeviceComboBox.SelectedItem = _inputOptions[0];
    }

    if (usesWindowsStt && _outputOptions.Count > 0)
    {
        VoiceOutputDeviceComboBox.SelectedItem = _outputOptions[0];
    }

    VoiceInputDeviceComboBox.IsEnabled = !usesWindowsStt;
    VoiceOutputDeviceComboBox.IsEnabled = !usesWindowsStt;
}
/// <summary>
/// Shows or hides the TTS provider settings editor and loads the selected provider's
/// values (API key, model, voice id, voice settings JSON) from the draft configuration.
/// </summary>
/// <remarks>
/// The editor is hidden for the Windows provider. While the controls are being filled,
/// <c>_updatingVoiceProviderFields</c> is set so that CaptureSelectedVoiceProviderSettings
/// returns without writing to the draft.
/// </remarks>
private void UpdateVoiceProviderSettingsEditor()
{
var providerId = GetSelectedTextToSpeechProviderId();
var showProviderSettings = !string.Equals(providerId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase);
VoiceTtsProviderSettingsPanel.Visibility = showProviderSettings ? Visibility.Visible : Visibility.Collapsed;
if (!showProviderSettings)
{
// The Windows provider has no editor; record it as active and leave the fields untouched.
_activeTtsProviderId = VoiceProviderIds.Windows;
return;
}
var provider = GetSelectedTextToSpeechProvider();
// Each lookup yields null when the provider does not declare that setting.
var apiKeySetting = FindSetting(provider, VoiceProviderSettingKeys.ApiKey);
var modelSetting = FindSetting(provider, VoiceProviderSettingKeys.Model);
var voiceIdSetting = FindSetting(provider, VoiceProviderSettingKeys.VoiceId);
var voiceSettingsJsonSetting = FindSetting(provider, VoiceProviderSettingKeys.VoiceSettingsJson);
var modelValue = GetProviderValue(providerId, modelSetting) ?? string.Empty;
// Guard flag: capture calls made while it is set return immediately.
_updatingVoiceProviderFields = true;
try
{
VoiceTtsProviderSettingsTitleTextBlock.Text = $"{GetSelectedTextToSpeechProviderName().ToUpperInvariant()} SETTINGS";
VoiceTtsApiKeyPasswordBox.Header = apiKeySetting?.Label ?? "API key";
VoiceTtsApiKeyPasswordBox.Visibility = apiKeySetting != null ? Visibility.Visible : Visibility.Collapsed;
VoiceTtsApiKeyPasswordBox.Password = GetProviderValue(providerId, apiKeySetting) ?? string.Empty;
// Model choices: non-blank options, de-duplicated case-insensitively.
_activeTtsModelOptions = modelSetting?.Options
.Where(option => !string.IsNullOrWhiteSpace(option))
.Distinct(StringComparer.OrdinalIgnoreCase)
.ToList()
?? [];
if (_activeTtsModelOptions.Count > 0)
{
// A stored model that is not among the declared options is kept by inserting it first.
if (!string.IsNullOrWhiteSpace(modelValue) &&
!_activeTtsModelOptions.Contains(modelValue, StringComparer.OrdinalIgnoreCase))
{
_activeTtsModelOptions.Insert(0, modelValue);
}
VoiceTtsModelComboBox.Header = modelSetting?.Label ?? "Model";
VoiceTtsModelComboBox.ItemsSource = _activeTtsModelOptions;
VoiceTtsModelComboBox.SelectedItem = _activeTtsModelOptions
.FirstOrDefault(option => string.Equals(option, modelValue, StringComparison.OrdinalIgnoreCase))
?? _activeTtsModelOptions.FirstOrDefault();
VoiceTtsModelComboBox.Visibility = Visibility.Visible;
VoiceTtsModelTextBox.Visibility = Visibility.Collapsed;
}
else
{
// No declared options: fall back to a free-text model box (hidden if there is no model setting).
VoiceTtsModelTextBox.Header = modelSetting?.Label ?? "Model";
VoiceTtsModelTextBox.PlaceholderText = modelSetting?.Placeholder ?? string.Empty;
VoiceTtsModelTextBox.Visibility = modelSetting != null ? Visibility.Visible : Visibility.Collapsed;
VoiceTtsModelTextBox.Text = modelValue;
VoiceTtsModelComboBox.ItemsSource = null;
VoiceTtsModelComboBox.SelectedItem = null;
VoiceTtsModelComboBox.Visibility = Visibility.Collapsed;
}
VoiceTtsVoiceIdTextBox.Header = voiceIdSetting?.Label ?? "Voice ID";
VoiceTtsVoiceIdTextBox.PlaceholderText = voiceIdSetting?.Placeholder ?? string.Empty;
VoiceTtsVoiceIdTextBox.Visibility = voiceIdSetting != null ? Visibility.Visible : Visibility.Collapsed;
VoiceTtsVoiceIdTextBox.Text = GetProviderValue(providerId, voiceIdSetting) ?? string.Empty;
VoiceTtsVoiceSettingsJsonTextBox.Header = voiceSettingsJsonSetting?.Label ?? "Voice settings JSON";
VoiceTtsVoiceSettingsJsonTextBox.PlaceholderText = voiceSettingsJsonSetting?.Placeholder ?? string.Empty;
VoiceTtsVoiceSettingsJsonTextBox.Visibility = voiceSettingsJsonSetting != null ? Visibility.Visible : Visibility.Collapsed;
VoiceTtsVoiceSettingsJsonTextBox.Text = GetProviderValue(providerId, voiceSettingsJsonSetting) ?? string.Empty;
// From here on the editor fields belong to this provider.
_activeTtsProviderId = providerId;
}
finally
{
_updatingVoiceProviderFields = false;
}
}
/// <summary>
/// Id of the selected TTS provider, or the Windows provider id when none is selected.
/// </summary>
private string GetSelectedTextToSpeechProviderId()
{
    var selected = VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption;
    return selected?.Id ?? VoiceProviderIds.Windows;
}
/// <summary>
/// Display name of the selected TTS provider, or "Provider" when none is selected.
/// </summary>
private string GetSelectedTextToSpeechProviderName()
{
    var selected = VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption;
    return selected?.Name ?? "Provider";
}
/// <summary>
/// The selected TTS provider option, or null when the selection is empty or of another type.
/// </summary>
private VoiceProviderOption? GetSelectedTextToSpeechProvider()
    => VoiceTextToSpeechProviderComboBox.SelectedItem as VoiceProviderOption;
/// <summary>
/// Copies the editor field values into the draft configuration of the provider recorded in
/// <c>_activeTtsProviderId</c>. Skipped while the editor is being populated and for the
/// Windows provider.
/// </summary>
private void CaptureSelectedVoiceProviderSettings()
{
    var activeId = _activeTtsProviderId;
    var isWindows = string.Equals(activeId, VoiceProviderIds.Windows, StringComparison.OrdinalIgnoreCase);
    if (_updatingVoiceProviderFields || isWindows)
    {
        return;
    }

    var activeProvider = _textToSpeechOptions.Find(candidate =>
        string.Equals(candidate.Id, activeId, StringComparison.OrdinalIgnoreCase));

    // Writes one field; SetProviderValue ignores settings the provider does not declare.
    void Store(string settingKey, string? fieldValue) =>
        SetProviderValue(activeId, FindSetting(activeProvider, settingKey), fieldValue);

    Store(VoiceProviderSettingKeys.ApiKey, VoiceTtsApiKeyPasswordBox.Password);
    Store(VoiceProviderSettingKeys.Model, GetSelectedProviderModelValue());
    Store(VoiceProviderSettingKeys.VoiceId, VoiceTtsVoiceIdTextBox.Text);
    Store(VoiceProviderSettingKeys.VoiceSettingsJson, VoiceTtsVoiceSettingsJsonTextBox.Text);
}
/// <summary>
/// Click handler for the refresh button: reloads the audio device lists.
/// </summary>
private async void OnRefreshVoiceDevices(object sender, RoutedEventArgs e)
    => await LoadVoiceDevicesAsync();
/// <summary>
/// Reacts to a mode change: the repeater checkbox is cleared and disabled for Off,
/// otherwise enabled (an indeterminate state becomes checked), then the texts are refreshed.
/// </summary>
private void OnVoiceModeChanged(object sender, SelectionChangedEventArgs e)
{
    var voiceActive = GetSelectedVoiceMode() != VoiceActivationMode.Off;

    VoiceShowRepeaterAtStartupCheckBox.IsChecked =
        voiceActive && (VoiceShowRepeaterAtStartupCheckBox.IsChecked ?? true);
    VoiceShowRepeaterAtStartupCheckBox.IsEnabled = voiceActive;

    UpdateVoiceSelectionDescriptions();
    UpdateVoiceSettingsInfo();
}
/// <summary>
/// Selection-changed handler shared by the STT and TTS provider combo boxes.
/// </summary>
/// <remarks>
/// The editor fields are captured first, while <c>_activeTtsProviderId</c> still names the
/// provider those fields were loaded for. EnsureSelectableProviderSelection then moves the
/// active id to the new selection. Capturing after that step would store the previous
/// provider's field values (or blanks) under the newly selected provider.
/// If the new selection is not selectable, the helper swaps in a fallback and this handler
/// returns without refreshing.
/// </remarks>
private void OnVoiceProviderChanged(object sender, SelectionChangedEventArgs e)
{
    // Must precede the Ensure calls below: they overwrite the active provider ids.
    CaptureSelectedVoiceProviderSettings();

    if (ReferenceEquals(sender, VoiceSpeechToTextProviderComboBox) &&
        !EnsureSelectableProviderSelection(VoiceSpeechToTextProviderComboBox, _speechToTextOptions, ref _activeSttProviderId))
    {
        return;
    }

    if (ReferenceEquals(sender, VoiceTextToSpeechProviderComboBox) &&
        !EnsureSelectableProviderSelection(VoiceTextToSpeechProviderComboBox, _textToSpeechOptions, ref _activeTtsProviderId))
    {
        return;
    }

    UpdateVoiceSelectionDescriptions();
    UpdateDeviceSelectionAvailability();
    UpdateVoiceProviderSettingsEditor();
    UpdateVoiceSettingsInfo();
}
/// <summary>
/// Change handler for the provider editor fields: stores the current field values in the draft.
/// </summary>
private void OnVoiceProviderSettingsChanged(object sender, RoutedEventArgs e)
    => CaptureSelectedVoiceProviderSettings();
/// <summary>
/// Reads a provider setting from the draft, falling back to the setting's default value.
/// </summary>
/// <param name="providerId">Provider whose value is read.</param>
/// <param name="setting">Setting definition; null means the provider does not declare it.</param>
/// <returns>The draft value, else the default value, or null when <paramref name="setting"/> is null.</returns>
private string? GetProviderValue(string providerId, VoiceProviderSettingDefinition? setting)
{
    return setting is null
        ? null
        : _voiceProviderConfigurationDraft.GetValue(providerId, setting.Key) ?? setting.DefaultValue;
}
/// <summary>
/// Returns the model currently entered in the editor: the combo box selection when the
/// combo box is the visible model control, otherwise the text box content.
/// </summary>
private string? GetSelectedProviderModelValue()
{
    var comboBoxInUse = VoiceTtsModelComboBox.Visibility == Visibility.Visible;
    return comboBoxInUse
        ? VoiceTtsModelComboBox.SelectedItem?.ToString()
        : VoiceTtsModelTextBox.Text;
}
/// <summary>
/// Entry shown in the device combo boxes. The "System default" entries are created with a null DeviceId.
/// </summary>
private sealed record DeviceOption(string? DeviceId, string Name);
/// <summary>
/// Stores a provider setting in the draft. Blank input is replaced by the setting's
/// default value; other input is trimmed.
/// </summary>
/// <param name="providerId">Provider whose value is written.</param>
/// <param name="setting">Setting definition; when null nothing is written.</param>
/// <param name="value">Raw value taken from the editor control.</param>
private void SetProviderValue(
    string providerId,
    VoiceProviderSettingDefinition? setting,
    string? value)
{
    if (setting is null)
    {
        return;
    }

    string? valueToStore;
    if (string.IsNullOrWhiteSpace(value))
    {
        valueToStore = setting.DefaultValue;
    }
    else
    {
        valueToStore = value.Trim();
    }

    _voiceProviderConfigurationDraft.SetValue(providerId, setting.Key, valueToStore);
}
/// <summary>
/// Finds a provider's setting definition by key, ignoring case.
/// </summary>
/// <param name="provider">Provider to search; may be null.</param>
/// <param name="settingKey">Key of the wanted setting.</param>
/// <returns>The first matching definition, or null when the provider is null or has no such setting.</returns>
private static VoiceProviderSettingDefinition? FindSetting(VoiceProviderOption? provider, string settingKey)
{
    if (provider is null)
    {
        return null;
    }

    foreach (var candidate in provider.Settings)
    {
        if (string.Equals(candidate.Key, settingKey, StringComparison.OrdinalIgnoreCase))
        {
            return candidate;
        }
    }

    return null;
}
/// <summary>
/// Creates a copy of a provider option, including new instances of its setting
/// definitions (with copied option lists) and of its HTTP / WebSocket TTS contracts.
/// </summary>
/// <param name="source">Option to copy.</param>
/// <returns>A new option carrying the same values.</returns>
private static VoiceProviderOption Clone(VoiceProviderOption source)
{
    // Copies one setting definition; the Options list is duplicated rather than shared.
    static VoiceProviderSettingDefinition CopySetting(VoiceProviderSettingDefinition original) => new()
    {
        Key = original.Key,
        Label = original.Label,
        Secret = original.Secret,
        DefaultValue = original.DefaultValue,
        Placeholder = original.Placeholder,
        Description = original.Description,
        Required = original.Required,
        JsonValue = original.JsonValue,
        Options = original.Options.ToList()
    };

    var http = source.TextToSpeechHttp;
    var webSocket = source.TextToSpeechWebSocket;

    return new VoiceProviderOption
    {
        Id = source.Id,
        Name = source.Name,
        Runtime = source.Runtime,
        Enabled = source.Enabled,
        VisibleInSettings = source.VisibleInSettings,
        Selectable = source.Selectable,
        Description = source.Description,
        Settings = source.Settings.Select(CopySetting).ToList(),
        TextToSpeechHttp = http == null
            ? null
            : new VoiceTextToSpeechHttpContract
            {
                EndpointTemplate = http.EndpointTemplate,
                HttpMethod = http.HttpMethod,
                AuthenticationHeaderName = http.AuthenticationHeaderName,
                AuthenticationScheme = http.AuthenticationScheme,
                ApiKeySettingKey = http.ApiKeySettingKey,
                RequestContentType = http.RequestContentType,
                RequestBodyTemplate = http.RequestBodyTemplate,
                ResponseAudioMode = http.ResponseAudioMode,
                ResponseAudioJsonPath = http.ResponseAudioJsonPath,
                ResponseStatusCodeJsonPath = http.ResponseStatusCodeJsonPath,
                ResponseStatusMessageJsonPath = http.ResponseStatusMessageJsonPath,
                SuccessStatusValue = http.SuccessStatusValue,
                OutputContentType = http.OutputContentType
            },
        TextToSpeechWebSocket = webSocket == null
            ? null
            : new VoiceTextToSpeechWebSocketContract
            {
                EndpointTemplate = webSocket.EndpointTemplate,
                AuthenticationHeaderName = webSocket.AuthenticationHeaderName,
                AuthenticationScheme = webSocket.AuthenticationScheme,
                ApiKeySettingKey = webSocket.ApiKeySettingKey,
                ConnectSuccessEventName = webSocket.ConnectSuccessEventName,
                StartMessageTemplate = webSocket.StartMessageTemplate,
                StartSuccessEventName = webSocket.StartSuccessEventName,
                ContinueMessageTemplate = webSocket.ContinueMessageTemplate,
                FinishMessageTemplate = webSocket.FinishMessageTemplate,
                ResponseAudioMode = webSocket.ResponseAudioMode,
                ResponseAudioJsonPath = webSocket.ResponseAudioJsonPath,
                ResponseStatusCodeJsonPath = webSocket.ResponseStatusCodeJsonPath,
                ResponseStatusMessageJsonPath = webSocket.ResponseStatusMessageJsonPath,
                FinalFlagJsonPath = webSocket.FinalFlagJsonPath,
                TaskFailedEventName = webSocket.TaskFailedEventName,
                SuccessStatusValue = webSocket.SuccessStatusValue,
                OutputContentType = webSocket.OutputContentType
            }
    };
}
/// <summary>
/// Makes sure a provider combo box shows a selectable provider and keeps
/// <paramref name="activeProviderId"/> in step with it.
/// </summary>
/// <param name="comboBox">Combo box to inspect and, if needed, correct.</param>
/// <param name="options">Provider options backing the combo box.</param>
/// <param name="activeProviderId">Previously active provider id; updated to the accepted or fallback provider.</param>
/// <returns>
/// True when the current selection was already selectable. False when a fallback had to be
/// selected, or when no selectable option exists (in which case nothing is changed).
/// </returns>
private static bool EnsureSelectableProviderSelection(
    ComboBox comboBox,
    IReadOnlyList<VoiceProviderOption> options,
    ref string activeProviderId)
{
    if (comboBox.SelectedItem is VoiceProviderOption { Selectable: true } current)
    {
        activeProviderId = current.Id;
        return true;
    }

    // Prefer the previously active provider if it is selectable; otherwise the first selectable one.
    VoiceProviderOption? previouslyActive = null;
    VoiceProviderOption? firstSelectable = null;
    foreach (var option in options)
    {
        if (!option.Selectable)
        {
            continue;
        }

        firstSelectable ??= option;
        if (string.Equals(option.Id, activeProviderId, StringComparison.OrdinalIgnoreCase))
        {
            previouslyActive = option;
            break;
        }
    }

    var fallback = previouslyActive ?? firstSelectable;
    if (fallback == null)
    {
        return false;
    }

    if (!ReferenceEquals(comboBox.SelectedItem, fallback))
    {
        comboBox.SelectedItem = fallback;
    }

    activeProviderId = fallback.Id;
    return false;
}
}

View File

@ -55,7 +55,7 @@ public sealed class QuickSendDialog : WindowEx
Title = LocalizationHelper.GetString("WindowTitle_QuickSend");
this.SetWindowSize(420, 260);
this.CenterOnScreen();
this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
// Apply Acrylic via controller to keep IsInputActive=true.
// This avoids focus/activation oddities on Windows 10 for hotkey-launched windows.

View File

@ -0,0 +1,64 @@
using OpenClaw.Shared;
namespace OpenClawTray.Helpers;
/// <summary>
/// Resolves file-system paths of the application's icon assets.
/// </summary>
public static class AppIconHelper
{
    // Declaration order matters: IconsPath is derived from AssetsPath during static initialization.
    private static readonly string AssetsPath = ResolveAssetsPath();
    private static readonly string IconsPath = Path.Combine(AssetsPath, "Icons");

    /// <summary>
    /// Returns the path of the icon for a connection status, falling back to the
    /// application icon when the status-specific file does not exist.
    /// </summary>
    /// <param name="status">Connection status to pick an icon for.</param>
    /// <exception cref="FileNotFoundException">Neither the status icon nor the application icon exists.</exception>
    public static string GetStatusIconPath(ConnectionStatus status)
    {
        string fileName;
        switch (status)
        {
            case ConnectionStatus.Connected:
                fileName = "StatusConnected.ico";
                break;
            case ConnectionStatus.Connecting:
                fileName = "StatusConnecting.ico";
                break;
            case ConnectionStatus.Error:
                fileName = "StatusError.ico";
                break;
            default:
                fileName = "StatusDisconnected.ico";
                break;
        }

        var candidate = Path.Combine(IconsPath, fileName);
        return File.Exists(candidate) ? candidate : GetAppIconPath();
    }

    /// <summary>
    /// Returns the path of the main application icon.
    /// </summary>
    /// <exception cref="FileNotFoundException">The icon file does not exist in the assets folder.</exception>
    public static string GetAppIconPath()
    {
        var path = Path.Combine(AssetsPath, "openclaw.ico");
        if (File.Exists(path))
        {
            return path;
        }

        throw new FileNotFoundException(
            $"Application icon was not found at '{path}'. Ensure the Assets folder is packaged correctly and contains 'openclaw.ico'.",
            path);
    }

    /// <summary>
    /// Locates the assets folder: first next to the application base directory, then by
    /// walking up the parent directories looking for the WinUI project's source assets.
    /// When nothing is found, the base-directory path is returned even though it does not exist.
    /// </summary>
    private static string ResolveAssetsPath()
    {
        var bundledPath = Path.Combine(AppContext.BaseDirectory, "Assets");
        if (Directory.Exists(bundledPath))
        {
            return bundledPath;
        }

        for (var directory = new DirectoryInfo(AppContext.BaseDirectory); directory != null; directory = directory.Parent)
        {
            var candidate = Path.Combine(directory.FullName, "src", "OpenClaw.Tray.WinUI", "Assets");
            if (Directory.Exists(candidate))
            {
                return candidate;
            }
        }

        return bundledPath;
    }
}

View File

@ -1,145 +0,0 @@
using OpenClaw.Shared;
using System;
using System.Drawing;
using System.IO;
using System.Runtime.InteropServices;
namespace OpenClawTray.Helpers;
/// <summary>
/// Provides icon resources for the tray application.
/// Loads status icons from the assets folder and, when a file is missing,
/// generates a small lobster pixel-art icon in a status-specific color.
/// </summary>
public static class IconHelper
{
    private static readonly string AssetsPath = Path.Combine(AppContext.BaseDirectory, "Assets");
    private static readonly string IconsPath = Path.Combine(AssetsPath, "Icons");

    // Icon cache: each icon is created on first request and then reused.
    private static Icon? _connectedIcon;
    private static Icon? _disconnectedIcon;
    private static Icon? _activityIcon;
    private static Icon? _errorIcon;
    private static Icon? _appIcon;

    /// <summary>
    /// Returns the path of the icon file for a connection status, or the path of the
    /// main application icon when the status-specific file does not exist.
    /// The returned fallback path is not checked for existence.
    /// </summary>
    /// <param name="status">Connection status to pick an icon for.</param>
    public static string GetStatusIconPath(ConnectionStatus status)
    {
        var iconName = status switch
        {
            ConnectionStatus.Connected => "StatusConnected.ico",
            ConnectionStatus.Connecting => "StatusConnecting.ico",
            ConnectionStatus.Error => "StatusError.ico",
            _ => "StatusDisconnected.ico"
        };

        var path = Path.Combine(IconsPath, iconName);

        // If specific icon doesn't exist, fall back to main icon
        if (!File.Exists(path))
        {
            path = Path.Combine(AssetsPath, "openclaw.ico");
        }

        return path;
    }

    /// <summary>
    /// Returns the cached icon for a connection status, creating it on first use.
    /// </summary>
    /// <param name="status">Connection status to pick an icon for.</param>
    public static Icon GetStatusIcon(ConnectionStatus status)
    {
        return status switch
        {
            ConnectionStatus.Connected => GetOrCreateIcon(ref _connectedIcon, ConnectionStatus.Connected),
            ConnectionStatus.Connecting => GetOrCreateIcon(ref _activityIcon, ConnectionStatus.Connecting),
            ConnectionStatus.Error => GetOrCreateIcon(ref _errorIcon, ConnectionStatus.Error),
            _ => GetOrCreateIcon(ref _disconnectedIcon, ConnectionStatus.Disconnected)
        };
    }

    /// <summary>
    /// Returns the cached application icon, loading it from the assets folder or
    /// generating a lobster-red icon when the file is missing.
    /// </summary>
    public static Icon GetAppIcon()
    {
        if (_appIcon != null)
        {
            return _appIcon;
        }

        var iconPath = Path.Combine(AssetsPath, "openclaw.ico");
        _appIcon = File.Exists(iconPath)
            ? new Icon(iconPath)
            : CreateLobsterIcon(Color.FromArgb(255, 99, 71)); // Lobster red

        return _appIcon;
    }

    /// <summary>
    /// Returns <paramref name="cached"/> when it is already set; otherwise loads the status
    /// icon from disk, or generates one in the status color, and stores it in the cache slot.
    /// </summary>
    private static Icon GetOrCreateIcon(ref Icon? cached, ConnectionStatus status)
    {
        if (cached != null)
        {
            return cached;
        }

        var iconPath = GetStatusIconPath(status);
        if (File.Exists(iconPath))
        {
            cached = new Icon(iconPath);
        }
        else
        {
            // Generate dynamic icon
            var color = status switch
            {
                ConnectionStatus.Connected => Color.FromArgb(76, 175, 80),   // Green
                ConnectionStatus.Connecting => Color.FromArgb(255, 193, 7),  // Amber
                ConnectionStatus.Error => Color.FromArgb(244, 67, 54),       // Red
                _ => Color.FromArgb(158, 158, 158)                           // Gray
            };
            cached = CreateLobsterIcon(color);
        }

        return cached;
    }

    /// <summary>
    /// Creates a simple colored lobster icon programmatically.
    /// Uses pixel art style matching the original WinForms version.
    /// </summary>
    /// <param name="color">Fill color of the lobster silhouette.</param>
    /// <returns>A 16x16 icon that owns its own data; the caller is responsible for disposing it.</returns>
    public static Icon CreateLobsterIcon(Color color)
    {
        const int size = 16;
        using var bitmap = new Bitmap(size, size);
        using var g = Graphics.FromImage(bitmap);

        g.Clear(Color.Transparent);

        // Simple lobster silhouette (pixel art style)
        using var brush = new SolidBrush(color);

        // Body
        g.FillRectangle(brush, 6, 6, 4, 6);

        // Claws
        g.FillRectangle(brush, 3, 4, 2, 2);
        g.FillRectangle(brush, 11, 4, 2, 2);
        g.FillRectangle(brush, 4, 6, 2, 2);
        g.FillRectangle(brush, 10, 6, 2, 2);

        // Tail
        g.FillRectangle(brush, 7, 12, 2, 3);
        g.FillRectangle(brush, 5, 14, 6, 1);

        // Eyes
        using var eyeBrush = new SolidBrush(Color.White);
        g.FillRectangle(eyeBrush, 6, 5, 1, 1);
        g.FillRectangle(eyeBrush, 9, 5, 1, 1);

        // Convert bitmap to icon. GetHicon allocates a native handle that must be released
        // with DestroyIcon; the finally block guarantees that even if cloning throws.
        var hIcon = bitmap.GetHicon();
        try
        {
            // The wrapper only borrows the handle; clone it to obtain an icon that owns its data.
            using var borrowed = Icon.FromHandle(hIcon);
            return (Icon)borrowed.Clone();
        }
        finally
        {
            DestroyIcon(hIcon);
        }
    }

    [DllImport("user32.dll", CharSet = CharSet.Auto)]
    private static extern bool DestroyIcon(IntPtr handle);
}

View File

@ -33,6 +33,7 @@
</PropertyGroup>
<ItemGroup>
<ProjectReference Include="..\OpenClaw.Tray.Shared\OpenClaw.Tray.Shared.csproj" />
<ProjectReference Include="..\OpenClaw.Shared\OpenClaw.Shared.csproj" />
</ItemGroup>
@ -61,4 +62,3 @@
</Target>
</Project>

View File

@ -0,0 +1,3 @@
using System.Runtime.CompilerServices;
[assembly: InternalsVisibleTo("OpenClaw.Tray.Tests")]

View File

@ -7,15 +7,19 @@ namespace OpenClawTray.Services;
/// <summary>
/// Registers and handles global hotkeys using P/Invoke.
/// Default: Ctrl+Alt+Shift+C for Quick Send.
/// Defaults:
/// - Ctrl+Alt+Shift+C for Quick Send
/// - Ctrl+Alt+Shift+V for Voice pause/resume
/// </summary>
public class GlobalHotkeyService : IDisposable
{
private const int HOTKEY_ID = 9001;
private const int QUICK_SEND_HOTKEY_ID = 9001;
private const int VOICE_TOGGLE_HOTKEY_ID = 9002;
private const uint MOD_CONTROL = 0x0002;
private const uint MOD_ALT = 0x0001;
private const uint MOD_SHIFT = 0x0004;
private const uint VK_C = 0x43;
private const uint VK_V = 0x56;
private const int WM_HOTKEY = 0x0312;
[DllImport("user32.dll", SetLastError = true)]
@ -105,6 +109,7 @@ public class GlobalHotkeyService : IDisposable
private IntPtr _hwnd;
private bool _registered;
private bool _disposed;
private readonly object _sync = new();
private Thread? _messageThread;
private WndProcDelegate? _wndProcDelegate; // prevent GC collection
private volatile bool _running;
@ -113,7 +118,8 @@ public class GlobalHotkeyService : IDisposable
private readonly ManualResetEventSlim _windowReady = new(false);
private readonly ManualResetEventSlim _opCompleted = new(false);
public event EventHandler? HotkeyPressed;
public event EventHandler? QuickSendHotkeyPressed;
public event EventHandler? VoiceToggleHotkeyPressed;
public GlobalHotkeyService()
{
@ -121,12 +127,15 @@ public class GlobalHotkeyService : IDisposable
public bool Register()
{
if (_registered) return true;
try
{
// Create message window on a dedicated thread with message loop
EnsureMessageLoop();
lock (_sync)
{
if (_registered) return true;
// Create message window on a dedicated thread with message loop
EnsureMessageLoop();
}
if (!_windowReady.Wait(TimeSpan.FromSeconds(2)))
{
@ -134,18 +143,21 @@ public class GlobalHotkeyService : IDisposable
return false;
}
if (_hwnd == IntPtr.Zero)
lock (_sync)
{
Logger.Warn("Failed to create hotkey message window");
return false;
}
if (_hwnd == IntPtr.Zero)
{
Logger.Warn("Failed to create hotkey message window");
return false;
}
_opCompleted.Reset();
if (!PostMessage(_hwnd, WM_APP_REGISTER, IntPtr.Zero, IntPtr.Zero))
{
Logger.Warn("Failed to post WM_APP_REGISTER message for hotkey registration");
_registered = false;
return false;
_opCompleted.Reset();
if (!PostMessage(_hwnd, WM_APP_REGISTER, IntPtr.Zero, IntPtr.Zero))
{
Logger.Warn("Failed to post WM_APP_REGISTER message for hotkey registration");
_registered = false;
return false;
}
}
if (!_opCompleted.Wait(TimeSpan.FromSeconds(2)))
@ -225,19 +237,34 @@ public class GlobalHotkeyService : IDisposable
if (msg == WM_APP_REGISTER)
{
// Register from the message-loop thread that owns hWnd.
_registered = RegisterHotKey(hWnd, HOTKEY_ID,
var quickSendRegistered = RegisterHotKey(hWnd, QUICK_SEND_HOTKEY_ID,
MOD_CONTROL | MOD_ALT | MOD_SHIFT | MOD_NOREPEAT,
VK_C);
var voiceToggleRegistered = RegisterHotKey(hWnd, VOICE_TOGGLE_HOTKEY_ID,
MOD_CONTROL | MOD_ALT | MOD_SHIFT | MOD_NOREPEAT,
VK_V);
_registered = quickSendRegistered && voiceToggleRegistered;
if (_registered)
{
Logger.Info("Global hotkey registered: Ctrl+Alt+Shift+C");
Logger.Info("Global hotkeys registered: Ctrl+Alt+Shift+C (Quick Send), Ctrl+Alt+Shift+V (Voice Pause)");
}
else
{
if (quickSendRegistered)
{
UnregisterHotKey(hWnd, QUICK_SEND_HOTKEY_ID);
}
if (voiceToggleRegistered)
{
UnregisterHotKey(hWnd, VOICE_TOGGLE_HOTKEY_ID);
}
var err = Marshal.GetLastWin32Error();
var errMsg = new Win32Exception(err).Message;
Logger.Warn($"Failed to register global hotkey (Win32Error={err}: {errMsg})");
Logger.Warn($"Failed to register one or more global hotkeys (Win32Error={err}: {errMsg})");
}
_opCompleted.Set();
@ -250,9 +277,10 @@ public class GlobalHotkeyService : IDisposable
{
if (_registered)
{
UnregisterHotKey(hWnd, HOTKEY_ID);
UnregisterHotKey(hWnd, QUICK_SEND_HOTKEY_ID);
UnregisterHotKey(hWnd, VOICE_TOGGLE_HOTKEY_ID);
_registered = false;
Logger.Info("Global hotkey unregistered");
Logger.Info("Global hotkeys unregistered");
}
}
catch (Exception ex)
@ -266,10 +294,15 @@ public class GlobalHotkeyService : IDisposable
return IntPtr.Zero;
}
if (msg == WM_HOTKEY && wParam.ToInt32() == HOTKEY_ID)
if (msg == WM_HOTKEY && wParam.ToInt32() == QUICK_SEND_HOTKEY_ID)
{
Logger.Info("Hotkey pressed: Ctrl+Alt+Shift+C");
OnHotkeyPressed();
OnQuickSendHotkeyPressed();
}
else if (msg == WM_HOTKEY && wParam.ToInt32() == VOICE_TOGGLE_HOTKEY_ID)
{
Logger.Info("Hotkey pressed: Ctrl+Alt+Shift+V");
OnVoiceToggleHotkeyPressed();
}
return DefWindowProc(hWnd, msg, wParam, lParam);
}
@ -302,9 +335,14 @@ public class GlobalHotkeyService : IDisposable
}
}
internal void OnHotkeyPressed()
internal void OnQuickSendHotkeyPressed()
{
HotkeyPressed?.Invoke(this, EventArgs.Empty);
QuickSendHotkeyPressed?.Invoke(this, EventArgs.Empty);
}
internal void OnVoiceToggleHotkeyPressed()
{
VoiceToggleHotkeyPressed?.Invoke(this, EventArgs.Empty);
}
public void Dispose()

View File

@ -5,6 +5,7 @@ using Microsoft.UI.Dispatching;
using OpenClaw.Shared;
using OpenClaw.Shared.Capabilities;
using OpenClawTray.Helpers;
using OpenClawTray.Services.Voice;
using OpenClawTray.Windows;
using Microsoft.UI.Xaml;
@ -21,6 +22,7 @@ public class NodeService : IDisposable
private CanvasWindow? _canvasWindow;
private ScreenCaptureService? _screenCaptureService;
private CameraCaptureService? _cameraCaptureService;
private VoiceService? _voiceService;
private DateTime _lastScreenCaptureNotification = DateTime.MinValue;
private string? _a2uiHostUrl;
@ -29,6 +31,7 @@ public class NodeService : IDisposable
private CanvasCapability? _canvasCapability;
private ScreenCapability? _screenCapability;
private CameraCapability? _cameraCapability;
private VoiceCapability? _voiceCapability;
private readonly string _dataPath;
// Events
@ -44,13 +47,14 @@ public class NodeService : IDisposable
public string? FullDeviceId => _nodeClient?.FullDeviceId;
public string? GatewayUrl => _nodeClient?.GatewayUrl;
public NodeService(IOpenClawLogger logger, DispatcherQueue dispatcherQueue, string dataPath)
public NodeService(IOpenClawLogger logger, DispatcherQueue dispatcherQueue, VoiceService voiceService, string dataPath)
{
_logger = logger;
_dispatcherQueue = dispatcherQueue;
_dataPath = dataPath;
_screenCaptureService = new ScreenCaptureService(logger);
_cameraCaptureService = new CameraCaptureService(logger);
_voiceService = voiceService;
}
/// <summary>
@ -79,6 +83,34 @@ public class NodeService : IDisposable
await _nodeClient.ConnectAsync();
_a2uiHostUrl = BuildA2UIHostUrl(_nodeClient.GatewayUrl);
if (_voiceService != null)
{
var settings = await _voiceService.GetSettingsAsync();
if (settings.Enabled && settings.Mode != VoiceActivationMode.Off)
{
var startTcs = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
var enqueued = _dispatcherQueue.TryEnqueue(async () =>
{
try
{
await _voiceService.StartAsync(new VoiceStartArgs { Mode = settings.Mode });
startTcs.TrySetResult(true);
}
catch (Exception ex)
{
startTcs.TrySetException(ex);
}
});
if (!enqueued)
{
throw new InvalidOperationException("Dispatcher queue unavailable for voice startup.");
}
await startTcs.Task;
}
}
}
/// <summary>
@ -92,6 +124,30 @@ public class NodeService : IDisposable
_nodeClient.Dispose();
_nodeClient = null;
}
if (_voiceService != null)
{
var stopTcs = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
var enqueued = _dispatcherQueue.TryEnqueue(async () =>
{
try
{
await _voiceService.StopAsync(new VoiceStopArgs { Reason = "Node disconnected" });
stopTcs.TrySetResult(true);
}
catch (Exception ex)
{
stopTcs.TrySetException(ex);
}
});
if (!enqueued)
{
throw new InvalidOperationException("Dispatcher queue unavailable for voice shutdown.");
}
await stopTcs.Task;
}
// Close canvas window
if (_canvasWindow != null && !_canvasWindow.IsClosed)
@ -134,6 +190,19 @@ public class NodeService : IDisposable
_cameraCapability.ListRequested += OnCameraList;
_cameraCapability.SnapRequested += OnCameraSnap;
_nodeClient.RegisterCapability(_cameraCapability);
// Voice capability
_voiceCapability = new VoiceCapability(_logger);
_voiceCapability.ListDevicesRequested += OnVoiceListDevices;
_voiceCapability.SettingsRequested += OnVoiceGetSettings;
_voiceCapability.SettingsUpdateRequested += OnVoiceSetSettings;
_voiceCapability.StatusRequested += OnVoiceGetStatus;
_voiceCapability.StartRequested += OnVoiceStart;
_voiceCapability.StopRequested += OnVoiceStop;
_voiceCapability.PauseRequested += OnVoicePause;
_voiceCapability.ResumeRequested += OnVoiceResume;
_voiceCapability.SkipRequested += OnVoiceSkip;
_nodeClient.RegisterCapability(_voiceCapability);
_logger.Info("All capabilities registered");
}
@ -475,6 +544,82 @@ public class NodeService : IDisposable
}
}
#endregion
#region Voice Capability Handlers
// Every handler below forwards one voice capability request to the voice service.
// When no voice service is set, the handler throws InvalidOperationException
// synchronously instead of returning a task.

/// <summary>Forwards a device-list request to the voice service.</summary>
private Task<VoiceAudioDeviceInfo[]> OnVoiceListDevices()
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.ListDevicesAsync();
}

/// <summary>Forwards a settings read request to the voice service.</summary>
private Task<VoiceSettings> OnVoiceGetSettings()
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.GetSettingsAsync();
}

/// <summary>Forwards a settings update request to the voice service.</summary>
private Task<VoiceSettings> OnVoiceSetSettings(VoiceSettingsUpdateArgs args)
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.UpdateSettingsAsync(args);
}

/// <summary>Forwards a status request to the voice service.</summary>
private Task<VoiceStatusInfo> OnVoiceGetStatus()
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.GetStatusAsync();
}

/// <summary>Forwards a start request to the voice service.</summary>
private Task<VoiceStatusInfo> OnVoiceStart(VoiceStartArgs args)
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.StartAsync(args);
}

/// <summary>Forwards a stop request to the voice service.</summary>
private Task<VoiceStatusInfo> OnVoiceStop(VoiceStopArgs args)
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.StopAsync(args);
}

/// <summary>Forwards a pause request to the voice service.</summary>
private Task<VoiceStatusInfo> OnVoicePause(VoicePauseArgs args)
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.PauseAsync(args);
}

/// <summary>Forwards a resume request to the voice service.</summary>
private Task<VoiceStatusInfo> OnVoiceResume(VoiceResumeArgs args)
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.ResumeAsync(args);
}

/// <summary>Forwards a skip-current-reply request to the voice service.</summary>
private Task<VoiceStatusInfo> OnVoiceSkip(VoiceSkipArgs args)
{
    var voice = _voiceService ?? throw new InvalidOperationException("Voice service not available");
    return voice.SkipCurrentReplyAsync(args);
}
#endregion
public void Dispose()
@ -484,7 +629,6 @@ public class NodeService : IDisposable
try { client?.Dispose(); } catch { /* ignore */ }
try { _cameraCaptureService?.Dispose(); } catch { /* ignore */ }
if (_canvasWindow != null && !_canvasWindow.IsClosed)
{
var window = _canvasWindow;

View File

@ -47,6 +47,9 @@ public class SettingsManager
public bool NotifyChatResponses { get; set; } = true;
public bool PreferStructuredCategories { get; set; } = true;
public List<OpenClaw.Shared.UserNotificationRule> UserRules { get; set; } = new();
public VoiceSettings Voice { get; set; } = new();
public VoiceRepeaterWindowSettings VoiceRepeaterWindow { get; set; } = new();
public VoiceProviderConfigurationStore VoiceProviderConfiguration { get; set; } = new();
// Node mode (enables Windows as a node, not just operator)
public bool EnableNodeMode { get; set; } = false;
@ -94,6 +97,10 @@ public class SettingsManager
PreferStructuredCategories = loaded.PreferStructuredCategories;
if (loaded.UserRules != null)
UserRules = loaded.UserRules;
Voice = loaded.Voice ?? new VoiceSettings();
VoiceRepeaterWindow = loaded.VoiceRepeaterWindow ?? new VoiceRepeaterWindowSettings();
VoiceProviderConfiguration = loaded.VoiceProviderConfiguration?.Clone() ?? new VoiceProviderConfigurationStore();
VoiceProviderConfiguration.MigrateLegacyCredentials(loaded.VoiceProviderCredentials);
}
}
}
@ -103,7 +110,7 @@ public class SettingsManager
}
}
public void Save()
public void Save(bool logSuccess = true)
{
try
{
@ -135,13 +142,19 @@ public class SettingsManager
SkippedUpdateTag = string.IsNullOrWhiteSpace(SkippedUpdateTag) ? null : SkippedUpdateTag,
NotifyChatResponses = NotifyChatResponses,
PreferStructuredCategories = PreferStructuredCategories,
UserRules = UserRules
UserRules = UserRules,
Voice = Voice,
VoiceRepeaterWindow = VoiceRepeaterWindow,
VoiceProviderConfiguration = VoiceProviderConfiguration.Clone()
};
var json = data.ToJson();
File.WriteAllText(SettingsFilePath, json);
Logger.Info("Settings saved");
if (logSuccess)
{
Logger.Info("Settings saved");
}
}
catch (Exception ex)
{

View File

@ -0,0 +1,29 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Speech-to-text route for the AudioGraph streaming path. The adapter is not
/// implemented yet, so <see cref="StartAsync"/> always fails.
/// </summary>
internal sealed class AudioGraphStreamingSpeechToTextRoute : IVoiceSpeechToTextRoute
{
    private readonly IOpenClawLogger _logger;

    public AudioGraphStreamingSpeechToTextRoute(IOpenClawLogger logger) => _logger = logger;

    /// <summary>Identifies this route as the streaming route.</summary>
    public VoiceSpeechToTextRouteKind Kind
    {
        get { return VoiceSpeechToTextRouteKind.Streaming; }
    }

    /// <summary>
    /// Logs the route selection and then throws; no resources are ever returned.
    /// </summary>
    /// <exception cref="OperationCanceledException">The token was already cancelled on entry.</exception>
    /// <exception cref="NotSupportedException">Always, once the selection has been logged.</exception>
    public Task<VoiceSpeechToTextRouteResources> StartAsync(
        VoiceProviderOption provider,
        VoiceSettings settings,
        CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();

        var providerName = provider.Name;
        _logger.Info($"Selected streaming STT route for provider '{providerName}'.");

        // Both checks and the failure are raised synchronously, not via a faulted task.
        var failure = new NotSupportedException(
            $"STT provider '{providerName}' is assigned to the AudioGraph streaming route, but that adapter is not implemented yet.");
        throw failure;
    }
}

View File

@ -0,0 +1,16 @@
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Adapts a WinUI <c>DispatcherQueue</c> to the <c>IUiDispatcher</c> abstraction.
/// </summary>
public sealed class DispatcherQueueAdapter : IUiDispatcher
{
    private readonly Microsoft.UI.Dispatching.DispatcherQueue _dispatcherQueue;

    /// <summary>Wraps the given dispatcher queue.</summary>
    /// <param name="dispatcherQueue">Queue that callbacks are forwarded to.</param>
    /// <exception cref="System.ArgumentNullException"><paramref name="dispatcherQueue"/> is null.</exception>
    public DispatcherQueueAdapter(Microsoft.UI.Dispatching.DispatcherQueue dispatcherQueue)
    {
        // Fail at construction instead of on the first enqueue attempt.
        System.ArgumentNullException.ThrowIfNull(dispatcherQueue);
        _dispatcherQueue = dispatcherQueue;
    }

    /// <summary>Attempts to queue <paramref name="callback"/> on the wrapped dispatcher queue.</summary>
    /// <param name="callback">Work to run on the dispatcher queue.</param>
    /// <returns>The result reported by the underlying queue's <c>TryEnqueue</c>.</returns>
    /// <exception cref="System.ArgumentNullException"><paramref name="callback"/> is null.</exception>
    public bool TryEnqueue(Action callback)
    {
        // Without this guard a null callback would be enqueued and only fail later,
        // as a NullReferenceException raised on the dispatcher thread.
        System.ArgumentNullException.ThrowIfNull(callback);
        return _dispatcherQueue.TryEnqueue(() => callback());
    }
}

View File

@ -0,0 +1,15 @@
using System.Threading;
using System.Threading.Tasks;
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// A speech-to-text route: a strategy that, once started, hands back the
/// resources for that route.
/// </summary>
internal interface IVoiceSpeechToTextRoute
{
/// <summary>The kind of route this implementation represents.</summary>
VoiceSpeechToTextRouteKind Kind { get; }
/// <summary>
/// Starts the route for the given provider and settings.
/// </summary>
/// <param name="provider">Provider option the route was selected for.</param>
/// <param name="settings">Voice settings supplied to the route.</param>
/// <param name="cancellationToken">Token the implementations check before doing any work.</param>
/// <returns>The resources created for the route.</returns>
/// <remarks>
/// Implementations that are not available yet throw <see cref="System.NotSupportedException"/>
/// rather than returning resources.
/// </remarks>
Task<VoiceSpeechToTextRouteResources> StartAsync(
VoiceProviderOption provider,
VoiceSettings settings,
CancellationToken cancellationToken);
}

View File

@ -0,0 +1,29 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Speech-to-text route for the embedded sherpa-onnx path. The route is not
/// implemented yet, so <see cref="StartAsync"/> always fails.
/// </summary>
internal sealed class SherpaOnnxSpeechToTextRoute : IVoiceSpeechToTextRoute
{
    private readonly IOpenClawLogger _logger;

    public SherpaOnnxSpeechToTextRoute(IOpenClawLogger logger) => _logger = logger;

    /// <summary>Identifies this route as the sherpa-onnx route.</summary>
    public VoiceSpeechToTextRouteKind Kind
    {
        get { return VoiceSpeechToTextRouteKind.SherpaOnnx; }
    }

    /// <summary>
    /// Logs the route selection and then throws; no resources are ever returned.
    /// </summary>
    /// <exception cref="OperationCanceledException">The token was already cancelled on entry.</exception>
    /// <exception cref="NotSupportedException">Always, once the selection has been logged.</exception>
    public Task<VoiceSpeechToTextRouteResources> StartAsync(
        VoiceProviderOption provider,
        VoiceSettings settings,
        CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();

        var selectionMessage = $"Selected embedded sherpa-onnx STT route for provider '{provider.Name}'.";
        _logger.Info(selectionMessage);

        // The failure is raised synchronously, not via a faulted task.
        var failure = new NotSupportedException(
            "The sherpa-onnx STT route is not implemented yet. This route will require a user-provided local model bundle.");
        throw failure;
    }
}

View File

@ -0,0 +1,431 @@
using System;
using System.Linq;
using System.Runtime.InteropServices;
using System.Threading;
using System.Threading.Tasks;
using OpenClaw.Shared;
using WinRT;
using Windows.Devices.Enumeration;
using Windows.Media;
using Windows.Media.Audio;
using Windows.Media.Capture;
using Windows.Media.Devices;
using Windows.Media.Render;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Event payload describing one captured audio frame: where it came from, when it
/// was captured, its format, the raw bytes and the peak level computed for it.
/// </summary>
public sealed class VoiceAudioFrameEventArgs(
    string? deviceId,
    string? deviceName,
    DateTime utcTimestamp,
    int sampleRateHz,
    int channelCount,
    byte[] data,
    float peakLevel) : EventArgs
{
    /// <summary>Identifier of the capture device, if known.</summary>
    public string? DeviceId { get; } = deviceId;

    /// <summary>Display name of the capture device, if known.</summary>
    public string? DeviceName { get; } = deviceName;

    /// <summary>Timestamp supplied by the publisher of the frame.</summary>
    public DateTime UtcTimestamp { get; } = utcTimestamp;

    /// <summary>Sample rate reported for the frame, in Hz.</summary>
    public int SampleRateHz { get; } = sampleRateHz;

    /// <summary>Channel count reported for the frame.</summary>
    public int ChannelCount { get; } = channelCount;

    /// <summary>Frame bytes; the array passed to the constructor is stored without copying.</summary>
    public byte[] Data { get; } = data;

    /// <summary>Peak level supplied by the publisher of the frame.</summary>
    public float PeakLevel { get; } = peakLevel;
}
/// <summary>
/// Event payload raised when a captured frame's peak level counts as an audible signal.
/// </summary>
public sealed class VoiceCaptureSignalEventArgs(
    string? deviceId,
    string? deviceName,
    DateTime utcTimestamp,
    float peakLevel) : EventArgs
{
    /// <summary>Identifier of the capture device, if known.</summary>
    public string? DeviceId { get; } = deviceId;

    /// <summary>Display name of the capture device, if known.</summary>
    public string? DeviceName { get; } = deviceName;

    /// <summary>Timestamp supplied by the publisher of the signal.</summary>
    public DateTime UtcTimestamp { get; } = utcTimestamp;

    /// <summary>Peak level that triggered the signal.</summary>
    public float PeakLevel { get; } = peakLevel;
}
public sealed class VoiceCaptureService : IAsyncDisposable
{
// Default threshold used by HasAudibleSignal when the caller does not supply one.
private const float DefaultSignalThreshold = 0.015f;
private readonly IOpenClawLogger _logger;
// Guards the graph/node/device fields, the cached format, and the capture-ready state below.
private readonly object _gate = new();
// Graph and nodes of the running capture session; null while stopped.
private AudioGraph? _audioGraph;
private AudioDeviceInputNode? _deviceInputNode;
private AudioFrameOutputNode? _frameOutputNode;
// Device the current session was started on.
private DeviceInformation? _activeCaptureDevice;
// Format read from the graph's encoding properties at start; reset to 0 on stop.
private int _sampleRateHz;
private int _channelCount;
// Set once the first non-empty frame of a session has been copied; reset by StartAsync.
private bool _captureReady;
// Completed together with _captureReady; replaced with a fresh instance by StartAsync.
private TaskCompletionSource<bool> _captureReadyTcs = CreateCaptureReadyTcs();
/// <summary>Creates a capture service that logs through <paramref name="logger"/>.</summary>
public VoiceCaptureService(IOpenClawLogger logger)
{
_logger = logger;
}
/// <summary>Raised from the graph's quantum callback for every non-empty frame that was copied.</summary>
public event EventHandler<VoiceAudioFrameEventArgs>? FrameCaptured;
/// <summary>Raised after <see cref="FrameCaptured"/> when the frame's peak level passes the audible-signal check.</summary>
public event EventHandler<VoiceCaptureSignalEventArgs>? SignalDetected;
/// <summary>True while the service holds an audio graph for a capture session.</summary>
public bool IsRunning
{
    get
    {
        AudioGraph? currentGraph;
        lock (_gate)
        {
            currentGraph = _audioGraph;
        }

        return currentGraph != null;
    }
}

/// <summary>Id of the device the current session is capturing from, or null when there is none.</summary>
public string? ActiveDeviceId
{
    get
    {
        DeviceInformation? currentDevice;
        lock (_gate)
        {
            currentDevice = _activeCaptureDevice;
        }

        return currentDevice?.Id;
    }
}

/// <summary>Name of the device the current session is capturing from, or null when there is none.</summary>
public string? ActiveDeviceName
{
    get
    {
        DeviceInformation? currentDevice;
        lock (_gate)
        {
            currentDevice = _activeCaptureDevice;
        }

        return currentDevice?.Name;
    }
}
/// <summary>
/// Stops any running capture session and starts a new one: creates the audio graph,
/// resolves the capture device, connects a device input node to a frame output node,
/// publishes them on the instance and starts them.
/// </summary>
/// <param name="settings">Voice settings; the sample rate, chunk size and input device id are read from it.</param>
/// <param name="cancellationToken">Checked before the session is created and again after each asynchronous step.</param>
/// <exception cref="ArgumentNullException"><paramref name="settings"/> is null.</exception>
/// <exception cref="OperationCanceledException">The token was cancelled during startup.</exception>
/// <exception cref="InvalidOperationException">The graph or the input node could not be created, or no usable device was found.</exception>
/// <remarks>
/// If startup fails at any point, everything created so far is torn down and the
/// instance is left in the stopped state.
/// </remarks>
public async Task StartAsync(VoiceSettings settings, CancellationToken cancellationToken)
{
    ArgumentNullException.ThrowIfNull(settings);

    await StopAsync();
    cancellationToken.ThrowIfCancellationRequested();

    lock (_gate)
    {
        _captureReady = false;
        _captureReadyTcs = CreateCaptureReadyTcs();
    }

    // These locals own the new objects until startup succeeds; they are nulled
    // at the end of the try block so the finally block only cleans up on failure.
    AudioGraph? audioGraph = null;
    AudioDeviceInputNode? deviceInputNode = null;
    AudioFrameOutputNode? frameOutputNode = null;

    try
    {
        var graphSettings = new AudioGraphSettings(AudioRenderCategory.Speech)
        {
            QuantumSizeSelectionMode = QuantumSizeSelectionMode.ClosestToDesired,
            DesiredSamplesPerQuantum = (int)ResolveDesiredSamplesPerQuantum(settings.SampleRateHz, settings.CaptureChunkMs)
        };

        var graphCreation = await AudioGraph.CreateAsync(graphSettings);
        if (graphCreation.Status != AudioGraphCreationStatus.Success || graphCreation.Graph == null)
        {
            throw new InvalidOperationException($"AudioGraph unavailable: {graphCreation.Status}");
        }

        audioGraph = graphCreation.Graph;
        cancellationToken.ThrowIfCancellationRequested();

        var captureDevice = await ResolveCaptureDeviceAsync(settings.InputDeviceId);
        cancellationToken.ThrowIfCancellationRequested();

        var inputCreation = await audioGraph.CreateDeviceInputNodeAsync(
            MediaCategory.Speech,
            audioGraph.EncodingProperties,
            captureDevice);
        if (inputCreation.Status != AudioDeviceNodeCreationStatus.Success || inputCreation.DeviceInputNode == null)
        {
            throw new InvalidOperationException($"Audio input node unavailable: {inputCreation.Status}");
        }

        deviceInputNode = inputCreation.DeviceInputNode;
        cancellationToken.ThrowIfCancellationRequested();

        frameOutputNode = audioGraph.CreateFrameOutputNode(audioGraph.EncodingProperties);
        deviceInputNode.AddOutgoingConnection(frameOutputNode);

        audioGraph.QuantumStarted += OnAudioGraphQuantumStarted;
        audioGraph.UnrecoverableErrorOccurred += OnAudioGraphUnrecoverableErrorOccurred;

        lock (_gate)
        {
            _audioGraph = audioGraph;
            _deviceInputNode = deviceInputNode;
            _frameOutputNode = frameOutputNode;
            _activeCaptureDevice = captureDevice;
            _sampleRateHz = (int)audioGraph.EncodingProperties.SampleRate;
            _channelCount = (int)audioGraph.EncodingProperties.ChannelCount;
        }

        frameOutputNode.Start();
        deviceInputNode.Start();
        audioGraph.Start();

        // Success: ownership now rests with the instance fields.
        audioGraph = null;
        deviceInputNode = null;
        frameOutputNode = null;

        _logger.Info(
            $"Voice capture graph started on {(captureDevice?.Name ?? "system default microphone")} ({captureDevice?.Id ?? "default"})");
    }
    finally
    {
        if (audioGraph != null)
        {
            // A Start() call can fail after the fields were published. Un-publish
            // them so IsRunning and the Active* properties do not keep reporting a
            // session whose objects are disposed below. The reference check leaves
            // the fields alone if this graph was never published or was already replaced.
            lock (_gate)
            {
                if (ReferenceEquals(_audioGraph, audioGraph))
                {
                    _audioGraph = null;
                    _deviceInputNode = null;
                    _frameOutputNode = null;
                    _activeCaptureDevice = null;
                    _sampleRateHz = 0;
                    _channelCount = 0;
                }
            }
        }

        // Best-effort teardown: failures here must not mask the original exception.
        if (frameOutputNode != null)
        {
            try { frameOutputNode.Stop(); } catch { }
            try { frameOutputNode.Dispose(); } catch { }
        }

        if (deviceInputNode != null)
        {
            try { deviceInputNode.Stop(); } catch { }
            try { deviceInputNode.Dispose(); } catch { }
        }

        if (audioGraph != null)
        {
            audioGraph.QuantumStarted -= OnAudioGraphQuantumStarted;
            audioGraph.UnrecoverableErrorOccurred -= OnAudioGraphUnrecoverableErrorOccurred;
            try { audioGraph.Stop(); } catch { }
            try { audioGraph.Dispose(); } catch { }
        }
    }
}
/// <summary>Disposes the service by stopping the current capture session, if any.</summary>
public ValueTask DisposeAsync() => new(StopAsync());
/// <summary>
/// Stops the current capture session. The graph, nodes and device are detached from
/// the instance under the lock, then stopped and disposed on a best-effort basis.
/// Does nothing when no session objects are held.
/// </summary>
public async Task StopAsync()
{
    // Runs one teardown step and ignores any exception it throws.
    static void Quietly(Action teardownStep)
    {
        try { teardownStep(); } catch { }
    }

    AudioGraph? graph;
    AudioDeviceInputNode? inputNode;
    AudioFrameOutputNode? outputNode;
    string? stoppedDeviceName;

    lock (_gate)
    {
        graph = _audioGraph;
        inputNode = _deviceInputNode;
        outputNode = _frameOutputNode;
        stoppedDeviceName = _activeCaptureDevice?.Name;

        _audioGraph = null;
        _deviceInputNode = null;
        _frameOutputNode = null;
        _activeCaptureDevice = null;
        _sampleRateHz = 0;
        _channelCount = 0;
    }

    if (graph == null && inputNode == null && outputNode == null)
    {
        return;
    }

    if (graph != null)
    {
        graph.QuantumStarted -= OnAudioGraphQuantumStarted;
        graph.UnrecoverableErrorOccurred -= OnAudioGraphUnrecoverableErrorOccurred;
    }

    // Stop everything first, then dispose, in the order output node, input node, graph.
    Quietly(() => outputNode?.Stop());
    Quietly(() => inputNode?.Stop());
    Quietly(() => graph?.Stop());
    Quietly(() => outputNode?.Dispose());
    Quietly(() => inputNode?.Dispose());
    Quietly(() => graph?.Dispose());

    await Task.CompletedTask;

    var deviceSuffix = string.IsNullOrWhiteSpace(stoppedDeviceName)
        ? string.Empty
        : $" ({stoppedDeviceName})";
    _logger.Info($"Voice capture graph stopped{deviceSuffix}");
}
/// <summary>
/// Returns a task that completes once the capture-ready flag is set, i.e. after
/// the first non-empty frame has been copied.
/// </summary>
/// <param name="cancellationToken">Cancels the wait; it does not affect the capture itself.</param>
public Task WaitForCaptureReadyAsync(CancellationToken cancellationToken)
{
    Task pending;
    lock (_gate)
    {
        if (_captureReady)
        {
            pending = Task.CompletedTask;
        }
        else
        {
            pending = _captureReadyTcs.Task;
        }
    }

    return pending.WaitAsync(cancellationToken);
}
/// <summary>Delegates to <c>VoiceCaptureMath.ResolveDesiredSamplesPerQuantum</c>.</summary>
internal static uint ResolveDesiredSamplesPerQuantum(int sampleRateHz, int chunkMs)
    => VoiceCaptureMath.ResolveDesiredSamplesPerQuantum(sampleRateHz, chunkMs);

/// <summary>
/// Delegates to <c>VoiceCaptureMath.HasAudibleSignal</c>, using the service's default
/// threshold when none is supplied.
/// </summary>
internal static bool HasAudibleSignal(float peakLevel, float threshold = DefaultSignalThreshold)
    => VoiceCaptureMath.HasAudibleSignal(peakLevel, threshold);

/// <summary>Delegates to <c>VoiceCaptureMath.ComputePeakLevel</c>.</summary>
internal static float ComputePeakLevel(byte[] data)
    => VoiceCaptureMath.ComputePeakLevel(data);
/// <summary>
/// Picks the capture device to use. An explicitly preferred device must exist;
/// otherwise the default capture device is used, falling back to the first
/// enumerated device.
/// </summary>
/// <param name="preferredInputDeviceId">Device id to require, or null/whitespace to use the default.</param>
/// <exception cref="InvalidOperationException">
/// No capture devices exist, or the preferred device id was not found.
/// </exception>
private async Task<DeviceInformation> ResolveCaptureDeviceAsync(string? preferredInputDeviceId)
{
    var captureDevices = await DeviceInformation.FindAllAsync(DeviceClass.AudioCapture);
    if (captureDevices.Count == 0)
    {
        throw new InvalidOperationException("No audio capture devices are available.");
    }

    // First device whose id matches exactly (ordinal comparison), or null.
    DeviceInformation? FindById(string? wantedId)
    {
        foreach (var candidate in captureDevices)
        {
            if (string.Equals(candidate.Id, wantedId, StringComparison.Ordinal))
            {
                return candidate;
            }
        }

        return null;
    }

    if (string.IsNullOrWhiteSpace(preferredInputDeviceId))
    {
        var defaultId = MediaDevice.GetDefaultAudioCaptureId(AudioDeviceRole.Default);
        return FindById(defaultId) ?? captureDevices[0];
    }

    return FindById(preferredInputDeviceId)
        ?? throw new InvalidOperationException($"Selected input device '{preferredInputDeviceId}' was not found.");
}
/// <summary>Logs a warning when the audio graph reports an unrecoverable error.</summary>
private void OnAudioGraphUnrecoverableErrorOccurred(AudioGraph sender, AudioGraphUnrecoverableErrorOccurredEventArgs args)
{
    var warning = $"Voice capture graph unrecoverable error: {args.Error}";
    _logger.Warn(warning);
}
/// <summary>
/// Graph quantum callback: pulls the next frame from the frame output node, copies
/// its bytes, marks capture as ready on the first non-empty frame, and raises
/// <see cref="FrameCaptured"/> followed (when the peak is audible) by <see cref="SignalDetected"/>.
/// Any exception, including one thrown by an event subscriber, is logged as a
/// warning and swallowed.
/// </summary>
private void OnAudioGraphQuantumStarted(AudioGraph sender, object args)
{
try
{
AudioFrameOutputNode? frameOutputNode;
string? deviceId;
string? deviceName;
int sampleRateHz;
int channelCount;
// Snapshot the session state under the lock so the rest of the callback
// works on a consistent view even if a stop runs concurrently.
lock (_gate)
{
frameOutputNode = _frameOutputNode;
deviceId = _activeCaptureDevice?.Id;
deviceName = _activeCaptureDevice?.Name;
sampleRateHz = _sampleRateHz;
channelCount = _channelCount;
}
// No output node means no session is published; nothing to do.
if (frameOutputNode == null)
{
return;
}
using var frame = frameOutputNode.GetFrame();
// Skip quanta for which no bytes could be copied.
if (!TryCopyAudioFrame(frame, out var bytes) || bytes.Length == 0)
{
return;
}
TaskCompletionSource<bool>? captureReadyTcs = null;
// Flip the ready flag exactly once per session; the completion source is
// captured here but completed outside the lock.
lock (_gate)
{
if (!_captureReady)
{
_captureReady = true;
captureReadyTcs = _captureReadyTcs;
}
}
captureReadyTcs?.TrySetResult(true);
// One timestamp and one peak value are shared by both events for this frame.
var utcNow = DateTime.UtcNow;
var peak = ComputePeakLevel(bytes);
FrameCaptured?.Invoke(
this,
new VoiceAudioFrameEventArgs(
deviceId,
deviceName,
utcNow,
sampleRateHz,
channelCount,
bytes,
peak));
if (HasAudibleSignal(peak))
{
SignalDetected?.Invoke(
this,
new VoiceCaptureSignalEventArgs(
deviceId,
deviceName,
utcNow,
peak));
}
}
catch (Exception ex)
{
_logger.Warn($"Voice capture quantum processing failed: {ex.Message}");
}
}
/// <summary>
/// Copies the audio bytes of <paramref name="frame"/> into a new managed array.
/// </summary>
/// <param name="frame">Frame whose buffer is locked for reading while it is copied.</param>
/// <param name="bytes">Receives the copied bytes; an empty array when nothing was copied.</param>
/// <returns>True when at least one byte was copied; otherwise false.</returns>
private static bool TryCopyAudioFrame(AudioFrame frame, out byte[] bytes)
{
    bytes = [];
    using var buffer = frame.LockBuffer(AudioBufferAccessMode.Read);
    using var reference = buffer.CreateReference();
    var access = reference.As<IMemoryBufferByteAccess>();
    access.GetBuffer(out var data, out var capacity);

    // GetBuffer reports the buffer's capacity, which can exceed the number of
    // bytes actually in use. Copy only the in-use bytes (AudioBuffer.Length) so
    // the unused tail of the buffer is never handed on as samples; the capacity
    // still bounds the copy.
    var validByteCount = Math.Min(buffer.Length, capacity);
    if (data == IntPtr.Zero || validByteCount == 0)
    {
        return false;
    }

    bytes = new byte[validByteCount];
    Marshal.Copy(data, bytes, 0, (int)validByteCount);
    return true;
}
/// <summary>
/// Creates the completion source used to signal capture readiness. Continuations
/// are forced to run asynchronously rather than inline on the thread that completes it.
/// </summary>
private static TaskCompletionSource<bool> CreateCaptureReadyTcs()
    => new(TaskCreationOptions.RunContinuationsAsynchronously);
/// <summary>
/// COM interop declaration used to reach the raw bytes behind a memory buffer
/// reference; TryCopyAudioFrame obtains it from the locked audio buffer's reference.
/// </summary>
[ComImport]
[Guid("5B0D3235-4DBA-4D44-865E-8F1D0E4FD04D")]
[InterfaceType(ComInterfaceType.InterfaceIsIUnknown)]
private interface IMemoryBufferByteAccess
{
/// <summary>Returns a pointer to the buffer's memory and the buffer's capacity in bytes.</summary>
void GetBuffer(out IntPtr buffer, out uint capacity);
}
}

View File

@ -0,0 +1,49 @@
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Maps voice modes, runtime states and status snapshots to the short labels shown in the UI.
/// </summary>
public static class VoiceDisplayHelper
{
    /// <summary>Label for an activation mode; any mode without its own label is shown as "Off".</summary>
    public static string GetModeLabel(VoiceActivationMode mode) => mode switch
    {
        VoiceActivationMode.VoiceWake => "Voice Wake",
        VoiceActivationMode.TalkMode => "Talk Mode",
        _ => "Off"
    };

    /// <summary>Label for a runtime state; any state without its own label is shown as "Stopped".</summary>
    public static string GetStateLabel(VoiceRuntimeState state) => state switch
    {
        VoiceRuntimeState.Arming => "Starting",
        VoiceRuntimeState.ListeningForVoiceWake or VoiceRuntimeState.ListeningContinuously => "Listening",
        VoiceRuntimeState.RecordingUtterance => "Recording",
        VoiceRuntimeState.SubmittingAudio => "Sending",
        VoiceRuntimeState.AwaitingResponse => "Waiting for reply",
        VoiceRuntimeState.PlayingResponse => "Speaking",
        VoiceRuntimeState.Paused => "Paused",
        VoiceRuntimeState.Error => "Error",
        VoiceRuntimeState.Idle => "Idle",
        _ => "Stopped"
    };

    /// <summary>
    /// Combined "mode (state)" label for a status snapshot. A paused runtime is shown
    /// as paused whether or not it is marked running; a runtime that is neither
    /// paused nor running is shown as "Off".
    /// </summary>
    public static string GetRuntimeLabel(VoiceStatusInfo status)
    {
        var isPaused = status.State == VoiceRuntimeState.Paused;
        if (!isPaused && !status.Running)
        {
            return "Off";
        }

        var detail = isPaused ? "Paused" : GetStateLabel(status.State);
        return $"{GetModeLabel(status.Mode)} ({detail})";
    }
}

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,28 @@
using System;
using OpenClaw.Shared;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Creates the speech-to-text route implementation that matches a provider.
/// </summary>
internal static class VoiceSpeechToTextRouteFactory
{
    /// <summary>
    /// Builds the route for <paramref name="provider"/>. Route kinds without a
    /// dedicated implementation fall back to the Windows Media route.
    /// </summary>
    /// <exception cref="ArgumentNullException">Either argument is null.</exception>
    public static IVoiceSpeechToTextRoute Create(
        VoiceProviderOption provider,
        IOpenClawLogger logger)
    {
        ArgumentNullException.ThrowIfNull(provider);
        ArgumentNullException.ThrowIfNull(logger);

        var routeKind = ResolveRouteKind(provider);
        switch (routeKind)
        {
            case VoiceSpeechToTextRouteKind.Streaming:
                return new AudioGraphStreamingSpeechToTextRoute(logger);
            case VoiceSpeechToTextRouteKind.SherpaOnnx:
                return new SherpaOnnxSpeechToTextRoute(logger);
            case VoiceSpeechToTextRouteKind.WindowsMedia:
            default:
                return new WindowsMediaSpeechToTextRoute(logger);
        }
    }

    /// <summary>Delegates to <c>VoiceSpeechToTextRouteResolver.ResolveRouteKind</c>.</summary>
    public static VoiceSpeechToTextRouteKind ResolveRouteKind(VoiceProviderOption provider)
        => VoiceSpeechToTextRouteResolver.ResolveRouteKind(provider);
}

View File

@ -0,0 +1,9 @@
using Windows.Media.SpeechRecognition;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Resources handed back by a speech-to-text route when it starts. Both members
/// are optional; a route sets only the ones it creates.
/// </summary>
internal sealed class VoiceSpeechToTextRouteResources
{
/// <summary>Capture service provided by the route, if any.</summary>
public VoiceCaptureService? CaptureService { get; init; }
/// <summary>Speech recognizer provided by the route, if any.</summary>
public SpeechRecognizer? SpeechRecognizer { get; init; }
}

View File

@ -0,0 +1,55 @@
using System;
using System.Threading;
using System.Threading.Tasks;
using OpenClaw.Shared;
using Windows.Media.SpeechRecognition;
namespace OpenClawTray.Services.Voice;
/// <summary>
/// Speech-to-text route backed by the Windows <see cref="SpeechRecognizer"/>,
/// configured for dictation.
/// </summary>
internal sealed class WindowsMediaSpeechToTextRoute : IVoiceSpeechToTextRoute
{
    private static readonly TimeSpan InitialSilenceTimeout = TimeSpan.FromSeconds(30);
    private static readonly TimeSpan BabbleTimeout = TimeSpan.FromSeconds(4);

    private readonly IOpenClawLogger _logger;

    public WindowsMediaSpeechToTextRoute(IOpenClawLogger logger)
    {
        _logger = logger;
    }

    /// <summary>Identifies this route as the Windows Media route.</summary>
    public VoiceSpeechToTextRouteKind Kind => VoiceSpeechToTextRouteKind.WindowsMedia;

    /// <summary>
    /// Creates a compiled recognizer and returns it as the route's resources.
    /// </summary>
    /// <param name="provider">Not used by this route.</param>
    /// <param name="settings">Voice settings; the talk-mode end-silence value is applied to the recognizer.</param>
    /// <param name="cancellationToken">Checked once, before the recognizer is created.</param>
    /// <exception cref="OperationCanceledException">The token was already cancelled on entry.</exception>
    /// <exception cref="InvalidOperationException">The recognizer constraints failed to compile.</exception>
    public async Task<VoiceSpeechToTextRouteResources> StartAsync(
        VoiceProviderOption provider,
        VoiceSettings settings,
        CancellationToken cancellationToken)
    {
        cancellationToken.ThrowIfCancellationRequested();

        return new VoiceSpeechToTextRouteResources
        {
            SpeechRecognizer = await CreateRecognizerAsync(settings)
        };
    }

    /// <summary>
    /// Creates a recognizer with this route's timeouts and a dictation constraint,
    /// then compiles the constraints.
    /// </summary>
    /// <param name="settings">Voice settings; <c>TalkMode.EndSilenceMs</c> sets the end-silence timeout.</param>
    /// <returns>A compiled recognizer that the caller owns and must dispose.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="settings"/> is null.</exception>
    /// <exception cref="InvalidOperationException">Constraint compilation did not succeed.</exception>
    public async Task<SpeechRecognizer> CreateRecognizerAsync(VoiceSettings settings)
    {
        ArgumentNullException.ThrowIfNull(settings);

        var recognizer = new SpeechRecognizer();
        try
        {
            recognizer.Timeouts.EndSilenceTimeout = TimeSpan.FromMilliseconds(settings.TalkMode.EndSilenceMs);
            recognizer.Timeouts.InitialSilenceTimeout = InitialSilenceTimeout;
            recognizer.Timeouts.BabbleTimeout = BabbleTimeout;
            recognizer.Constraints.Add(new SpeechRecognitionTopicConstraint(SpeechRecognitionScenario.Dictation, "always-on-dictation"));

            var compilation = await recognizer.CompileConstraintsAsync();
            if (compilation.Status != SpeechRecognitionResultStatus.Success)
            {
                throw new InvalidOperationException($"Speech recognizer unavailable: {compilation.Status}");
            }

            _logger.Debug(
                $"Speech recognizer compiled successfully ({compilation.Status}); endSilenceMs={recognizer.Timeouts.EndSilenceTimeout.TotalMilliseconds:0}; initialSilenceMs={recognizer.Timeouts.InitialSilenceTimeout.TotalMilliseconds:0}; babbleMs={recognizer.Timeouts.BabbleTimeout.TotalMilliseconds:0}");
            return recognizer;
        }
        catch
        {
            // Ownership only passes to the caller on success. Any failure on the way
            // (rejected timeout value, compile exception, non-success status) must
            // release the recognizer here or it is leaked.
            recognizer.Dispose();
            throw;
        }
    }
}

View File

@ -26,7 +26,7 @@ public sealed partial class ActivityStreamWindow : WindowEx
this.SetWindowSize(520, 640);
this.CenterOnScreen();
this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
Closed += OnClosed;
ActivityStreamService.Updated += OnActivityUpdated;

View File

@ -23,7 +23,7 @@ public sealed partial class NotificationHistoryWindow : WindowEx
// Window configuration
this.SetWindowSize(450, 600);
this.CenterOnScreen();
this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
Closed += (s, e) => IsClosed = true;

View File

@ -3,6 +3,7 @@
x:Class="OpenClawTray.Windows.SettingsWindow"
xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
xmlns:controls="using:OpenClawTray.Controls"
xmlns:winex="using:WinUIEx"
Title="Settings — OpenClaw Tray"
MinWidth="400" MinHeight="500">
@ -112,7 +113,7 @@
<!-- Test Notification -->
<Button x:Name="TestNotificationButton" x:Uid="SettingsTestNotificationButton" Content="Send Test Notification"
Click="OnTestNotification"/>
<!-- Advanced Section -->
<StackPanel Spacing="8">
<TextBlock x:Uid="SettingsAdvancedHeader" Text="ADVANCED (EXPERIMENTAL)" Style="{StaticResource CaptionTextBlockStyle}"
@ -124,6 +125,8 @@
Foreground="{ThemeResource TextFillColorSecondaryBrush}"
TextWrapping="Wrap"
Margin="0,-4,0,0"/>
<controls:VoiceSettingsPanel x:Name="VoiceSettingsPanel"/>
</StackPanel>
</StackPanel>

View File

@ -1,8 +1,10 @@
using Microsoft.Toolkit.Uwp.Notifications;
using Microsoft.UI.Xaml;
using Microsoft.UI.Xaml.Controls;
using OpenClaw.Shared;
using OpenClawTray.Helpers;
using OpenClawTray.Services;
using OpenClawTray.Services.Voice;
using System;
using System.Threading.Tasks;
using WinUIEx;
@ -17,22 +19,22 @@ public sealed partial class SettingsWindow : WindowEx
public event EventHandler? SettingsSaved;
public SettingsWindow(SettingsManager settings)
public SettingsWindow(SettingsManager settings, IVoiceConfigurationApi voiceConfigurationApi)
{
_settings = settings;
InitializeComponent();
Title = LocalizationHelper.GetString("WindowTitle_Settings");
// Window configuration
this.SetWindowSize(480, 700);
this.SetWindowSize(560, 860);
this.CenterOnScreen();
this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
LoadSettings();
VoiceSettingsPanel.Initialize(_settings, voiceConfigurationApi);
Closed += (s, e) => IsClosed = true;
Logger.Info("[Settings] Window opened");
}
@ -50,11 +52,10 @@ public sealed partial class SettingsWindow : WindowEx
AutoStartToggle.IsOn = _settings.AutoStart;
GlobalHotkeyToggle.IsOn = _settings.GlobalHotkeyEnabled;
NotificationsToggle.IsOn = _settings.ShowNotifications;
// Set sound combo — match by Tag (stable persistence key), not Content (display text)
for (int i = 0; i < NotificationSoundComboBox.Items.Count; i++)
{
if (NotificationSoundComboBox.Items[i] is Microsoft.UI.Xaml.Controls.ComboBoxItem item &&
if (NotificationSoundComboBox.Items[i] is ComboBoxItem item &&
item.Tag?.ToString() == _settings.NotificationSound)
{
NotificationSoundComboBox.SelectedIndex = i;
@ -62,9 +63,10 @@ public sealed partial class SettingsWindow : WindowEx
}
}
if (NotificationSoundComboBox.SelectedIndex < 0)
{
NotificationSoundComboBox.SelectedIndex = 0;
}
// Notification filters
NotifyHealthCb.IsChecked = _settings.NotifyHealth;
NotifyUrgentCb.IsChecked = _settings.NotifyUrgent;
NotifyReminderCb.IsChecked = _settings.NotifyReminder;
@ -73,12 +75,11 @@ public sealed partial class SettingsWindow : WindowEx
NotifyBuildCb.IsChecked = _settings.NotifyBuild;
NotifyStockCb.IsChecked = _settings.NotifyStock;
NotifyInfoCb.IsChecked = _settings.NotifyInfo;
// Advanced
NodeModeToggle.IsOn = _settings.EnableNodeMode;
}
private void SaveSettings()
private async Task<bool> SaveSettingsAsync()
{
_settings.UseSshTunnel = UseSshTunnelToggle.IsOn;
_settings.SshTunnelUser = SshTunnelUserTextBox.Text.Trim();
@ -94,8 +95,8 @@ public sealed partial class SettingsWindow : WindowEx
_settings.AutoStart = AutoStartToggle.IsOn;
_settings.GlobalHotkeyEnabled = GlobalHotkeyToggle.IsOn;
_settings.ShowNotifications = NotificationsToggle.IsOn;
if (NotificationSoundComboBox.SelectedItem is Microsoft.UI.Xaml.Controls.ComboBoxItem item)
if (NotificationSoundComboBox.SelectedItem is ComboBoxItem item)
{
_settings.NotificationSound = item.Tag?.ToString() ?? "Default";
}
@ -108,12 +109,22 @@ public sealed partial class SettingsWindow : WindowEx
_settings.NotifyBuild = NotifyBuildCb.IsChecked ?? true;
_settings.NotifyStock = NotifyStockCb.IsChecked ?? true;
_settings.NotifyInfo = NotifyInfoCb.IsChecked ?? true;
// Advanced
_settings.EnableNodeMode = NodeModeToggle.IsOn;
try
{
await VoiceSettingsPanel.ApplyAsync(_settings);
}
catch (Exception ex)
{
Logger.Error($"[Settings] Failed to apply voice settings: {ex.Message}");
StatusLabel.Text = $"❌ Failed to apply voice settings: {ex.Message}";
return false;
}
_settings.Save();
AutoStartManager.SetAutoStart(_settings.AutoStart);
return true;
}
private async void OnTestConnection(object sender, RoutedEventArgs e)
@ -159,7 +170,7 @@ public sealed partial class SettingsWindow : WindowEx
var connected = false;
var tcs = new TaskCompletionSource<bool>();
client.StatusChanged += (s, status) =>
{
if (status == ConnectionStatus.Connected)
@ -174,8 +185,7 @@ public sealed partial class SettingsWindow : WindowEx
};
_ = client.ConnectAsync();
// Wait up to 5 seconds for connection
var completedTask = await Task.WhenAny(tcs.Task, Task.Delay(5000));
if (completedTask != tcs.Task)
{
@ -224,13 +234,13 @@ public sealed partial class SettingsWindow : WindowEx
}
}
private void OnSave(object sender, RoutedEventArgs e)
private async void OnSave(object sender, RoutedEventArgs e)
{
var useSshTunnel = UseSshTunnelToggle.IsOn;
var gatewayUrl = GatewayUrlTextBox.Text.Trim();
if (!useSshTunnel && !GatewayUrlHelper.IsValidGatewayUrl(gatewayUrl))
{
Logger.Warn($"[Settings] Save blocked — invalid gateway URL");
Logger.Warn("[Settings] Save blocked — invalid gateway URL");
StatusLabel.Text = $"❌ {GatewayUrlHelper.ValidationMessage}";
return;
}
@ -246,14 +256,23 @@ public sealed partial class SettingsWindow : WindowEx
var oldGateway = _settings.GatewayUrl;
var oldAutoStart = _settings.AutoStart;
var oldNodeMode = _settings.EnableNodeMode;
SaveSettings();
if (!await SaveSettingsAsync())
{
return;
}
if (!string.Equals(oldGateway, _settings.GatewayUrl, StringComparison.Ordinal))
Logger.Info($"[Settings] GatewayUrl changed");
{
Logger.Info("[Settings] GatewayUrl changed");
}
if (oldAutoStart != _settings.AutoStart)
{
Logger.Info($"[Settings] AutoStart changed to {_settings.AutoStart}");
}
if (oldNodeMode != _settings.EnableNodeMode)
{
Logger.Info($"[Settings] NodeMode changed to {_settings.EnableNodeMode}");
}
Logger.Info("[Settings] Settings saved");
SettingsSaved?.Invoke(this, EventArgs.Empty);

View File

@ -31,7 +31,7 @@ public sealed partial class StatusDetailWindow : WindowEx
// Window configuration
this.SetWindowSize(420, 550);
this.CenterOnScreen();
this.SetIcon(IconHelper.GetStatusIconPath(status));
this.SetIcon(AppIconHelper.GetStatusIconPath(status));
Closed += (s, e) => IsClosed = true;

View File

@ -0,0 +1,120 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
  Voice Mode status window.
  Three label/value lists (status, configuration, recent activity) are filled by the
  code-behind through the named ItemsControls. Below them sits an optional
  troubleshooting panel, and a footer hosts the Refresh / Settings / Close buttons.
-->
<winex:WindowEx
    x:Class="OpenClawTray.Windows.VoiceModeWindow"
    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
    xmlns:winex="using:WinUIEx"
    Title="Voice Mode"
    MinWidth="420"
    MinHeight="480">
    <Window.SystemBackdrop>
        <MicaBackdrop/>
    </Window.SystemBackdrop>
    <Grid>
        <Grid.Resources>
            <!-- Single row layout shared by all three lists: fixed-width label column, wrapping value column. -->
            <DataTemplate x:Key="DetailRowTemplate">
                <Grid Padding="8" Margin="0,2" CornerRadius="4"
                      Background="{ThemeResource CardBackgroundFillColorDefaultBrush}">
                    <Grid.ColumnDefinitions>
                        <ColumnDefinition Width="150"/>
                        <ColumnDefinition Width="*"/>
                    </Grid.ColumnDefinitions>
                    <TextBlock Text="{Binding Label}" Grid.Column="0"
                               Foreground="{ThemeResource TextFillColorSecondaryBrush}"/>
                    <TextBlock Text="{Binding Value}" Grid.Column="1" TextWrapping="Wrap"/>
                </Grid>
            </DataTemplate>
        </Grid.Resources>
        <Grid.RowDefinitions>
            <RowDefinition Height="*"/>
            <RowDefinition Height="Auto"/>
        </Grid.RowDefinitions>
        <ScrollViewer Grid.Row="0" VerticalScrollBarVisibility="Auto" Padding="24,24,24,12">
            <StackPanel Spacing="24" MaxWidth="500">
                <StackPanel Spacing="8">
                    <TextBlock Text="VOICE MODE" Style="{StaticResource CaptionTextBlockStyle}" Foreground="#E74C3C" FontWeight="Bold"/>
                    <TextBlock Text="Current voice runtime status and configuration summary."
                               TextWrapping="Wrap"/>
                </StackPanel>
                <StackPanel Spacing="8">
                    <TextBlock Text="STATUS" Style="{StaticResource CaptionTextBlockStyle}" Foreground="#E74C3C" FontWeight="Bold"/>
                    <ItemsControl x:Name="StatusItemsControl"
                                  ItemTemplate="{StaticResource DetailRowTemplate}"/>
                </StackPanel>
                <StackPanel Spacing="8">
                    <TextBlock Text="CONFIGURATION" Style="{StaticResource CaptionTextBlockStyle}" Foreground="#E74C3C" FontWeight="Bold"/>
                    <ItemsControl x:Name="ConfigurationItemsControl"
                                  ItemTemplate="{StaticResource DetailRowTemplate}"/>
                </StackPanel>
                <StackPanel Spacing="8">
                    <TextBlock Text="RECENT" Style="{StaticResource CaptionTextBlockStyle}" Foreground="#E74C3C" FontWeight="Bold"/>
                    <ItemsControl x:Name="RecentItemsControl"
                                  ItemTemplate="{StaticResource DetailRowTemplate}"/>
                </StackPanel>
                <!-- Hidden by default; the code-behind reveals it (and one of the buttons) for known errors. -->
                <StackPanel x:Name="TroubleshootingPanel" Spacing="8" Visibility="Collapsed">
                    <TextBlock x:Name="TroubleshootingTextBlock"
                               Style="{StaticResource CaptionTextBlockStyle}"
                               Foreground="{ThemeResource TextFillColorSecondaryBrush}"
                               TextWrapping="Wrap"/>
                    <StackPanel Orientation="Horizontal" Spacing="8">
                        <Button x:Name="OpenSpeechSettingsButton"
                                Content="Open Speech Settings"
                                Click="OnOpenSpeechSettings"
                                Visibility="Collapsed"/>
                        <Button x:Name="OpenMicrophoneSettingsButton"
                                Content="Open Microphone Settings"
                                Click="OnOpenMicrophoneSettings"
                                Visibility="Collapsed"/>
                    </StackPanel>
                </StackPanel>
            </StackPanel>
        </ScrollViewer>
        <!-- Footer action bar. -->
        <Border Grid.Row="1"
                Background="{ThemeResource CardBackgroundFillColorDefaultBrush}"
                BorderBrush="{ThemeResource CardStrokeColorDefaultBrush}"
                BorderThickness="0,1,0,0"
                Padding="24,16">
            <StackPanel Orientation="Horizontal" Spacing="8" HorizontalAlignment="Right">
                <Button Content="Refresh" Click="OnRefresh" Width="90"/>
                <Button Content="Settings" Click="OnOpenSettings" Width="90"/>
                <Button Content="Close" Click="OnClose" Width="90" Style="{ThemeResource AccentButtonStyle}"/>
            </StackPanel>
        </Border>
    </Grid>
</winex:WindowEx>

View File

@ -0,0 +1,171 @@
using Microsoft.UI.Xaml;
using OpenClaw.Shared;
using OpenClawTray.Helpers;
using OpenClawTray.Services;
using OpenClawTray.Services.Voice;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using WinUIEx;
namespace OpenClawTray.Windows;
public sealed partial class VoiceModeWindow : WindowEx
{
private readonly SettingsManager _settings;
private readonly IVoiceRuntimeControlApi _voiceRuntimeControlApi;
private readonly IVoiceConfigurationApi _voiceConfigurationApi;
public bool IsClosed { get; private set; }
public event EventHandler? OpenSettingsRequested;
/// <summary>
/// Creates the voice status window, sets its title, size, position and icon,
/// and populates the lists with the current status.
/// </summary>
/// <param name="settings">Settings read when building the status and configuration rows.</param>
/// <param name="voiceRuntimeControlApi">Provides the current voice runtime status.</param>
/// <param name="voiceConfigurationApi">Provides the voice provider catalog.</param>
public VoiceModeWindow(
SettingsManager settings,
IVoiceRuntimeControlApi voiceRuntimeControlApi,
IVoiceConfigurationApi voiceConfigurationApi)
{
_settings = settings;
_voiceRuntimeControlApi = voiceRuntimeControlApi;
_voiceConfigurationApi = voiceConfigurationApi;
// The named XAML controls only exist after InitializeComponent, so it must run before RefreshStatus.
InitializeComponent();
Title = "Voice Mode";
this.SetWindowSize(520, 620);
this.CenterOnScreen();
this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
// Lets owners check IsClosed instead of touching a closed window.
Closed += (s, e) => IsClosed = true;
RefreshStatus();
}
/// <summary>
/// Rebuilds the status, configuration and recent-activity lists from the current
/// runtime status, the provider catalog and the stored settings, then updates the
/// troubleshooting panel for the last reported error.
/// </summary>
public void RefreshStatus()
{
    var status = _voiceRuntimeControlApi.CurrentStatus;
    var providerCatalog = _voiceConfigurationApi.GetProviderCatalog();

    List<DetailRow> statusRows = new()
    {
        new DetailRow("Mode", VoiceDisplayHelper.GetModeLabel(_settings.Voice.Mode)),
        new DetailRow("Runtime", VoiceDisplayHelper.GetRuntimeLabel(status)),
        new DetailRow("Node Mode", _settings.EnableNodeMode ? "Enabled" : "Disabled"),
        // A blank session key is shown as "main".
        new DetailRow("Session", string.IsNullOrWhiteSpace(status.SessionKey) ? "main" : status.SessionKey!),
        new DetailRow("State", VoiceDisplayHelper.GetStateLabel(status.State)),
        new DetailRow("Queued replies", status.PendingReplyCount.ToString()),
    };
    StatusItemsControl.ItemsSource = statusRows;

    List<DetailRow> configurationRows = new()
    {
        new DetailRow("Speech to text", ResolveProviderName(providerCatalog.SpeechToTextProviders, _settings.Voice.SpeechToTextProviderId, "Windows Speech Recognition")),
        new DetailRow("Text to speech", ResolveProviderName(providerCatalog.TextToSpeechProviders, _settings.Voice.TextToSpeechProviderId, "Windows Speech Synthesis")),
        new DetailRow("Listen device", DescribeDevice(_settings.Voice.InputDeviceId, "System default microphone")),
        new DetailRow("Talk device", DescribeDevice(_settings.Voice.OutputDeviceId, "System default speaker")),
        new DetailRow("Voice toasts", _settings.Voice.ShowConversationToasts ? "Enabled" : "Disabled"),
    };
    ConfigurationItemsControl.ItemsSource = configurationRows;

    List<DetailRow> recentRows = new()
    {
        new DetailRow("Last utterance", FormatTimestamp(status.LastUtteranceUtc)),
        new DetailRow("Last wake", FormatTimestamp(status.LastVoiceWakeUtc)),
        new DetailRow("Last issue", string.IsNullOrWhiteSpace(status.LastError) ? "None" : status.LastError!),
    };
    RecentItemsControl.ItemsSource = recentRows;

    UpdateTroubleshooting(status.LastError);
}
/// <summary>
/// Returns the display name of the first provider whose id matches
/// <paramref name="providerId"/> (ordinal, case-insensitive).
/// </summary>
/// <param name="providers">Providers to search, in order.</param>
/// <param name="providerId">Id to look for; may be null.</param>
/// <param name="fallback">Text returned when no provider matches.</param>
/// <returns>The matching provider's name, or <paramref name="fallback"/>.</returns>
private static string ResolveProviderName(
    IReadOnlyList<VoiceProviderOption> providers,
    string? providerId,
    string fallback)
{
    for (var index = 0; index < providers.Count; index++)
    {
        var candidate = providers[index];
        if (string.Equals(candidate.Id, providerId, StringComparison.OrdinalIgnoreCase))
        {
            return candidate.Name;
        }
    }

    return fallback;
}
/// <summary>
/// Describes an audio device choice without exposing its raw id.
/// </summary>
/// <param name="deviceId">Stored device id; null or blank means no explicit selection.</param>
/// <param name="defaultLabel">Label returned when no device is selected.</param>
/// <returns><paramref name="defaultLabel"/> for a blank id, otherwise "Selected device".</returns>
private static string DescribeDevice(string? deviceId, string defaultLabel)
{
    if (string.IsNullOrWhiteSpace(deviceId))
    {
        return defaultLabel;
    }

    return "Selected device";
}
/// <summary>
/// Formats an optional timestamp as a local 24-hour time of day.
/// </summary>
/// <param name="value">Timestamp to format, or null when nothing has been recorded.</param>
/// <returns>The local time as "HH:mm:ss", or "None" when <paramref name="value"/> is null.</returns>
private static string FormatTimestamp(DateTime? value)
{
    if (value is not DateTime timestamp)
    {
        return "None";
    }

    return timestamp.ToLocalTime().ToString("HH:mm:ss");
}
/// <summary>
/// Resets the troubleshooting panel, then shows a fix-it hint and the matching
/// settings button when <paramref name="error"/> contains one of the recognised messages.
/// </summary>
/// <param name="error">Last reported voice error; null or blank hides the panel.</param>
private void UpdateTroubleshooting(string? error)
{
    // Start from a fully hidden state so a stale hint never survives a refresh.
    TroubleshootingPanel.Visibility = Visibility.Collapsed;
    OpenSpeechSettingsButton.Visibility = Visibility.Collapsed;
    OpenMicrophoneSettingsButton.Visibility = Visibility.Collapsed;
    TroubleshootingTextBlock.Text = string.Empty;

    if (string.IsNullOrWhiteSpace(error))
    {
        return;
    }

    // Reveals the panel together with the single action button relevant to the hint.
    void ShowHint(UIElement actionButton, string hint)
    {
        TroubleshootingPanel.Visibility = Visibility.Visible;
        actionButton.Visibility = Visibility.Visible;
        TroubleshootingTextBlock.Text = hint;
    }

    if (error.Contains("online speech recognition is disabled", StringComparison.OrdinalIgnoreCase))
    {
        ShowHint(
            OpenSpeechSettingsButton,
            "To fix this: open Windows Settings, go to Privacy & security > Speech, turn on Online speech recognition, then restart voice mode.");
    }
    else if (error.Contains("microphone access is blocked", StringComparison.OrdinalIgnoreCase))
    {
        ShowHint(
            OpenMicrophoneSettingsButton,
            "To fix this: open Windows Settings, go to Privacy & security > Microphone, allow microphone access and enable desktop app access, then restart voice mode.");
    }
}
/// <summary>Opens the Windows speech privacy settings page.</summary>
private void OnOpenSpeechSettings(object sender, RoutedEventArgs e) =>
    OpenSettingsUri("ms-settings:privacy-speech");

/// <summary>Opens the Windows microphone privacy settings page.</summary>
private void OnOpenMicrophoneSettings(object sender, RoutedEventArgs e) =>
    OpenSettingsUri("ms-settings:privacy-microphone");

/// <summary>Reloads every list from the current runtime status and settings.</summary>
private void OnRefresh(object sender, RoutedEventArgs e) =>
    RefreshStatus();

/// <summary>Notifies subscribers that the user asked to open the settings window.</summary>
private void OnOpenSettings(object sender, RoutedEventArgs e) =>
    OpenSettingsRequested?.Invoke(this, EventArgs.Empty);

/// <summary>Closes this window.</summary>
private void OnClose(object sender, RoutedEventArgs e) =>
    Close();
/// <summary>
/// Asks the shell to open <paramref name="uri"/>. This is best-effort: a failure
/// is logged and never propagated to the caller.
/// </summary>
/// <param name="uri">URI to launch, e.g. an <c>ms-settings:</c> page.</param>
private static void OpenSettingsUri(string uri)
{
    try
    {
        // UseShellExecute lets Windows resolve the URI scheme to its registered handler.
        Process.Start(new ProcessStartInfo(uri) { UseShellExecute = true });
    }
    catch (Exception ex)
    {
        // The button click must not crash the window, but the failure should leave a trace.
        Logger.Warn($"VoiceModeWindow: Failed to open '{uri}': {ex.Message}");
    }
}
/// <summary>One label/value row; the item type assigned to the three lists in RefreshStatus.</summary>
private sealed record DetailRow(string Label, string Value);
}

View File

@ -0,0 +1,160 @@
<?xml version="1.0" encoding="utf-8"?>
<!--
  Compact voice repeater window.
  Row 0 shows the conversation transcript plus the in-progress draft.
  Row 1 shows the last error text and the pause / skip / settings controls.
-->
<winex:WindowEx
    x:Class="OpenClawTray.Windows.VoiceRepeaterWindow"
    xmlns="http://schemas.microsoft.com/winfx/2006/xaml/presentation"
    xmlns:x="http://schemas.microsoft.com/winfx/2006/xaml"
    xmlns:winex="using:WinUIEx"
    Title="Voice Mode"
    MinWidth="320"
    MinHeight="150">
    <Window.SystemBackdrop>
        <MicaBackdrop/>
    </Window.SystemBackdrop>
    <Grid x:Name="WindowRoot">
        <Grid.RowDefinitions>
            <RowDefinition Height="*"/>
            <RowDefinition Height="Auto"/>
        </Grid.RowDefinitions>
        <!-- Transcript card. -->
        <Border Grid.Row="0"
                Margin="4,4,4,0"
                Background="{ThemeResource CardBackgroundFillColorDefaultBrush}"
                BorderBrush="{ThemeResource CardStrokeColorDefaultBrush}"
                BorderThickness="1"
                CornerRadius="10"
                Padding="6">
            <Grid>
                <!-- Placeholder; the code-behind hides it once there is a turn or a visible draft. -->
                <TextBlock x:Name="EmptyConversationTextBlock"
                           Text="Transcript and replies appear here."
                           Foreground="{ThemeResource TextFillColorSecondaryBrush}"
                           HorizontalAlignment="Center"
                           VerticalAlignment="Center"
                           TextWrapping="Wrap"/>
                <ScrollViewer x:Name="ConversationScrollViewer"
                              VerticalScrollBarVisibility="Auto"
                              HorizontalScrollBarVisibility="Disabled">
                    <StackPanel Spacing="6">
                        <ItemsControl x:Name="ConversationItemsControl">
                            <ItemsControl.ItemTemplate>
                                <DataTemplate>
                                    <StackPanel Spacing="1">
                                        <TextBlock Text="{Binding Caption}"
                                                   Foreground="{ThemeResource TextFillColorSecondaryBrush}"
                                                   FontSize="{Binding CaptionFontSize}"/>
                                        <TextBlock Text="{Binding Message}"
                                                   FontSize="{Binding MessageFontSize}"
                                                   TextWrapping="Wrap"
                                                   MaxLines="5"/>
                                    </StackPanel>
                                </DataTemplate>
                            </ItemsControl.ItemTemplate>
                        </ItemsControl>
                        <StackPanel x:Name="DraftPanel"
                                    Visibility="Collapsed"
                                    Spacing="1">
                            <TextBlock x:Name="DraftCaptionTextBlock"
                                       Text="You (draft)"
                                       Foreground="{ThemeResource TextFillColorSecondaryBrush}"
                                       FontSize="10"/>
                            <TextBlock x:Name="DraftTextBlock"
                                       FontSize="13"
                                       TextWrapping="Wrap"
                                       FontStyle="Italic"
                                       Foreground="{ThemeResource TextFillColorSecondaryBrush}"
                                       MaxLines="4"/>
                        </StackPanel>
                    </StackPanel>
                </ScrollViewer>
            </Grid>
        </Border>
        <!-- Control bar. -->
        <Border Grid.Row="1"
                Margin="4"
                Background="{ThemeResource CardBackgroundFillColorDefaultBrush}"
                BorderBrush="{ThemeResource CardStrokeColorDefaultBrush}"
                BorderThickness="1"
                CornerRadius="10"
                Padding="6,4">
            <Grid ColumnSpacing="4">
                <Grid.ColumnDefinitions>
                    <ColumnDefinition Width="*"/>
                    <ColumnDefinition Width="Auto"/>
                </Grid.ColumnDefinitions>
                <TextBlock x:Name="TroubleshootingTextBlock"
                           Grid.Column="0"
                           VerticalAlignment="Center"
                           Foreground="{ThemeResource TextFillColorSecondaryBrush}"
                           Visibility="Collapsed"
                           MaxLines="2"
                           TextWrapping="Wrap"
                           FontSize="10"/>
                <!-- Icon-only buttons have no text content, so each gets an explicit accessible name. -->
                <StackPanel Grid.Column="1"
                            Orientation="Horizontal"
                            Spacing="4">
                    <Button x:Name="PauseResumeButton"
                            Width="28"
                            Height="26"
                            Padding="0"
                            Click="OnPauseResumeClick"
                            AutomationProperties.Name="Pause or resume voice mode"
                            ToolTipService.ToolTip="Pause or resume voice mode">
                        <SymbolIcon x:Name="PauseResumeIcon" Symbol="Pause"/>
                    </Button>
                    <Button x:Name="SkipReplyButton"
                            Width="28"
                            Height="26"
                            Padding="0"
                            Click="OnSkipReplyClick"
                            AutomationProperties.Name="Skip current reply"
                            ToolTipService.ToolTip="Skip current reply">
                        <SymbolIcon Symbol="Forward"/>
                    </Button>
                    <Button x:Name="ViewSettingsButton"
                            Width="28"
                            Height="26"
                            Padding="0"
                            AutomationProperties.Name="Voice repeater settings"
                            ToolTipService.ToolTip="Voice repeater settings">
                        <SymbolIcon Symbol="Setting"/>
                        <Button.Flyout>
                            <Flyout Placement="TopEdgeAlignedRight">
                                <StackPanel Width="220" Spacing="8" Padding="8">
                                    <CheckBox x:Name="AutoScrollCheckBox"
                                              Content="Auto-scroll"
                                              Checked="OnAutoScrollChanged"
                                              Unchecked="OnAutoScrollChanged"/>
                                    <CheckBox x:Name="FloatingEnabledCheckBox"
                                              Content="Float above other windows"
                                              Checked="OnFloatingEnabledChanged"
                                              Unchecked="OnFloatingEnabledChanged"/>
                                    <StackPanel Spacing="4">
                                        <TextBlock Text="Text size"
                                                   Foreground="{ThemeResource TextFillColorSecondaryBrush}"/>
                                        <!-- Each Tag holds the numeric size that the code-behind parses. -->
                                        <ComboBox x:Name="TextSizeComboBox"
                                                  SelectionChanged="OnTextSizeSelectionChanged">
                                            <ComboBoxItem Content="11 pt" Tag="11"/>
                                            <ComboBoxItem Content="12 pt" Tag="12"/>
                                            <ComboBoxItem Content="13 pt" Tag="13"/>
                                            <ComboBoxItem Content="14 pt" Tag="14"/>
                                            <ComboBoxItem Content="15 pt" Tag="15"/>
                                        </ComboBox>
                                    </StackPanel>
                                    <Button Content="Open Voice Status"
                                            Click="OnOpenVoiceStatusClick"/>
                                </StackPanel>
                            </Flyout>
                        </Button.Flyout>
                    </Button>
                </StackPanel>
            </Grid>
        </Border>
    </Grid>
</winex:WindowEx>

View File

@ -0,0 +1,563 @@
using Microsoft.UI.Windowing;
using Microsoft.UI.Dispatching;
using Microsoft.UI.Xaml;
using Microsoft.UI.Xaml.Controls;
using OpenClaw.Shared;
using OpenClawTray.Helpers;
using OpenClawTray.Services;
using OpenClawTray.Services.Voice;
using System;
using System.Collections.ObjectModel;
using System.ComponentModel;
using System.Runtime.CompilerServices;
using System.Threading.Tasks;
using Windows.Graphics;
using WinUIEx;
namespace OpenClawTray.Windows;
public sealed partial class VoiceRepeaterWindow : WindowEx, IVoiceChatWindow
{
private const int MaxConversationItems = 24;
private const int DefaultWidth = 360;
private const int DefaultHeight = 170;
private const int DefaultMargin = 12;
private const double DefaultTextSize = 13;
private const double DefaultCaptionSize = 10;
private readonly SettingsManager _settings;
private readonly IVoiceRuntimeControlApi _voiceRuntimeControlApi;
private readonly ObservableCollection<ConversationItem> _conversationItems = [];
private readonly DispatcherQueueTimer? _refreshTimer;
private readonly DispatcherQueueTimer? _layoutSaveTimer;
private bool _controlActionInFlight;
private bool _suppressSettingsEvents;
private bool _suppressPlacementSave = true;
private bool _initialPlacementPending = true;
private bool _placementDirty;
private bool _autoScrollEnabled;
private double _messageFontSize = DefaultTextSize;
private double _captionFontSize = DefaultCaptionSize;
public bool IsClosed { get; private set; }
public event EventHandler? OpenVoiceStatusRequested;
/// <summary>
/// Creates the voice repeater window: applies the stored placement and icon, binds the
/// conversation list, starts the status refresh timer, prepares the one-shot
/// placement-save timer and shows the current status.
/// </summary>
/// <param name="settings">Settings holding the repeater window preferences.</param>
/// <param name="voiceRuntimeControlApi">Provides the voice runtime status and pause/resume/skip commands.</param>
public VoiceRepeaterWindow(
SettingsManager settings,
IVoiceRuntimeControlApi voiceRuntimeControlApi)
{
_settings = settings;
_voiceRuntimeControlApi = voiceRuntimeControlApi;
_autoScrollEnabled = _settings.VoiceRepeaterWindow.AutoScroll;
InitializeComponent();
Title = "Voice Mode";
ApplyStoredWindowPlacement();
this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
ConversationItemsControl.ItemsSource = _conversationItems;
Closed += OnWindowClosed;
Activated += OnWindowActivated;
var dispatcherQueue = DispatcherQueue.GetForCurrentThread();
if (dispatcherQueue != null)
{
// Polls the runtime status every 400 ms to keep the title and buttons current.
_refreshTimer = dispatcherQueue.CreateTimer();
_refreshTimer.Interval = TimeSpan.FromMilliseconds(400);
_refreshTimer.Tick += (_, _) => RefreshStatus();
_refreshTimer.Start();
// One-shot timer restarted by OnAppWindowChanged, so placement is saved 600 ms after the last move/resize.
_layoutSaveTimer = dispatcherQueue.CreateTimer();
_layoutSaveTimer.Interval = TimeSpan.FromMilliseconds(600);
_layoutSaveTimer.IsRepeating = false;
_layoutSaveTimer.Tick += (_, _) =>
{
_layoutSaveTimer.Stop();
SaveWindowPlacement();
};
}
if (AppWindow is not null)
{
AppWindow.Changed += OnAppWindowChanged;
}
ApplyViewSettings();
RefreshStatus();
UpdateConversationPlaceholder();
}
/// <summary>Reads the current voice runtime status and applies it to the window.</summary>
public void RefreshStatus() => ApplyStatus(_voiceRuntimeControlApi.CurrentStatus);
/// <summary>
/// Shows the in-progress transcript draft, or hides the draft panel when the draft
/// is cleared or blank.
/// </summary>
/// <param name="text">Draft text; null is treated as empty.</param>
/// <param name="clear">When true the draft is emptied regardless of <paramref name="text"/>.</param>
/// <returns>An already-completed task.</returns>
public Task UpdateVoiceTranscriptDraftAsync(string text, bool clear)
{
    var draft = string.Empty;
    if (!clear && text is not null)
    {
        draft = text;
    }

    DraftTextBlock.Text = draft;

    var hasDraft = !string.IsNullOrWhiteSpace(draft);
    DraftPanel.Visibility = hasDraft ? Visibility.Visible : Visibility.Collapsed;

    UpdateConversationPlaceholder();
    ScrollConversationToEnd();
    return Task.CompletedTask;
}
/// <summary>
/// Appends a conversation turn to the transcript, keeping at most
/// <c>MaxConversationItems</c> entries by dropping the oldest first.
/// </summary>
/// <param name="args">Turn to append; ignored when null or when its message is blank.</param>
/// <returns>An already-completed task.</returns>
public Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args)
{
    if (string.IsNullOrWhiteSpace(args?.Message))
    {
        return Task.CompletedTask;
    }

    var speaker = args.Direction == VoiceConversationDirection.Outgoing ? "You" : "Assistant";
    var stamp = DateTime.Now.ToString("HH:mm:ss");
    _conversationItems.Add(new ConversationItem(speaker, stamp, args.Message, _messageFontSize, _captionFontSize));

    // Trim from the front so only the most recent turns remain.
    for (var overflow = _conversationItems.Count - MaxConversationItems; overflow > 0; overflow--)
    {
        _conversationItems.RemoveAt(0);
    }

    UpdateConversationPlaceholder();
    ScrollConversationToEnd();
    return Task.CompletedTask;
}
/// <summary>
/// Toggles voice mode: resumes when the runtime is paused, pauses otherwise.
/// Further control clicks are ignored while a request is in flight.
/// </summary>
private async void OnPauseResumeClick(object sender, RoutedEventArgs e)
{
    if (_controlActionInFlight)
    {
        return;
    }

    _controlActionInFlight = true;
    // Re-apply the status now so the buttons disable while the request runs.
    ApplyStatus(_voiceRuntimeControlApi.CurrentStatus);
    try
    {
        var status = _voiceRuntimeControlApi.CurrentStatus;
        if (status.State == VoiceRuntimeState.Paused)
        {
            await _voiceRuntimeControlApi.ResumeAsync(new VoiceResumeArgs { Reason = "Voice repeater resume button" });
        }
        else
        {
            await _voiceRuntimeControlApi.PauseAsync(new VoicePauseArgs { Reason = "Voice repeater pause button" });
        }
    }
    catch (Exception ex)
    {
        // This is an async void handler: an exception leaving it has no awaiting caller
        // to observe it, so it is logged here instead of surfacing as an unhandled exception.
        Logger.Warn($"VoiceRepeaterWindow: Pause/resume request failed: {ex.Message}");
    }
    finally
    {
        _controlActionInFlight = false;
        RefreshStatus();
    }
}
/// <summary>
/// Asks the voice runtime to skip the current reply. Does nothing while another
/// control request is in flight or when the runtime reports nothing to skip.
/// </summary>
private async void OnSkipReplyClick(object sender, RoutedEventArgs e)
{
    if (_controlActionInFlight || !_voiceRuntimeControlApi.CurrentStatus.CanSkipReply)
    {
        return;
    }

    _controlActionInFlight = true;
    // Re-apply the status now so the buttons disable while the request runs.
    ApplyStatus(_voiceRuntimeControlApi.CurrentStatus);
    try
    {
        await _voiceRuntimeControlApi.SkipCurrentReplyAsync(new VoiceSkipArgs
        {
            Reason = "Voice repeater skip button"
        });
    }
    catch (Exception ex)
    {
        // This is an async void handler: an exception leaving it has no awaiting caller
        // to observe it, so it is logged here instead of surfacing as an unhandled exception.
        Logger.Warn($"VoiceRepeaterWindow: Skip reply request failed: {ex.Message}");
    }
    finally
    {
        _controlActionInFlight = false;
        RefreshStatus();
    }
}
/// <summary>
/// Stores the auto-scroll preference and, when it was just enabled, scrolls to the
/// end of the conversation. Ignored while settings are being applied programmatically.
/// </summary>
private void OnAutoScrollChanged(object sender, RoutedEventArgs e)
{
    if (_suppressSettingsEvents)
    {
        return;
    }

    var autoScroll = AutoScrollCheckBox.IsChecked is true;
    _autoScrollEnabled = autoScroll;
    _settings.VoiceRepeaterWindow.AutoScroll = autoScroll;
    _settings.Save(logSuccess: false);

    if (autoScroll)
    {
        ScrollConversationToEnd();
    }
}
/// <summary>
/// Stores the text size chosen in the combo box (parsed from the item's Tag),
/// re-applies the view settings and saves. Ignored while settings are being applied
/// programmatically, or when the selection has no parseable size.
/// </summary>
private void OnTextSizeSelectionChanged(object sender, SelectionChangedEventArgs e)
{
    if (_suppressSettingsEvents)
    {
        return;
    }

    if (TextSizeComboBox.SelectedItem is ComboBoxItem selected
        && double.TryParse(selected.Tag?.ToString(), out var points))
    {
        _settings.VoiceRepeaterWindow.TextSize = points;
        ApplyViewSettings();
        _settings.Save(logSuccess: false);
    }
}
/// <summary>
/// Stores the "float above other windows" preference, applies it to the window and
/// saves. Ignored while settings are being applied programmatically.
/// </summary>
private void OnFloatingEnabledChanged(object sender, RoutedEventArgs e)
{
    if (!_suppressSettingsEvents)
    {
        var floating = FloatingEnabledCheckBox.IsChecked is true;
        _settings.VoiceRepeaterWindow.FloatingEnabled = floating;
        IsAlwaysOnTop = floating;
        _settings.Save(logSuccess: false);
    }
}
/// <summary>Notifies subscribers that the user asked to open the voice status window.</summary>
private void OnOpenVoiceStatusClick(object sender, RoutedEventArgs e) =>
    OpenVoiceStatusRequested?.Invoke(this, EventArgs.Empty);
/// <summary>
/// Stops both timers, detaches the window event handlers, saves the window
/// placement and marks the window as closed.
/// </summary>
private void OnWindowClosed(object sender, WindowEventArgs e)
{
    _refreshTimer?.Stop();
    _layoutSaveTimer?.Stop();

    if (AppWindow is not null)
    {
        AppWindow.Changed -= OnAppWindowChanged;
    }

    Activated -= OnWindowActivated;

    // Flush before setting IsClosed: SaveWindowPlacement refuses to run once IsClosed is true.
    FlushWindowPlacement();
    IsClosed = true;
}
/// <summary>
/// On the first Activated event only, re-applies the stored placement and then
/// enables placement saving via a queued dispatcher callback.
/// </summary>
private void OnWindowActivated(object sender, WindowActivatedEventArgs args)
{
if (!_initialPlacementPending)
{
return;
}
_initialPlacementPending = false;
ApplyStoredWindowPlacement();
var dispatcherQueue = DispatcherQueue.GetForCurrentThread();
// Saving is re-enabled through the queue rather than inline.
// NOTE(review): presumably so change events raised by the placement above are still suppressed — confirm.
_ = dispatcherQueue?.TryEnqueue(() => _suppressPlacementSave = false);
}
/// <summary>
/// Marks the placement dirty and restarts the save timer whenever the window moves
/// or resizes, unless placement saving is currently suppressed.
/// </summary>
private void OnAppWindowChanged(AppWindow sender, AppWindowChangedEventArgs args)
{
    var movedOrResized = args.DidPositionChange || args.DidSizeChange;
    if (_suppressPlacementSave || !movedOrResized)
    {
        return;
    }

    _placementDirty = true;

    // Stop + Start restarts the countdown, so a burst of changes yields a single save.
    _layoutSaveTimer?.Stop();
    _layoutSaveTimer?.Start();
}
/// <summary>
/// Reflects a runtime status snapshot in the UI: window title, draft caption,
/// error text, and the state of the pause/resume and skip buttons.
/// </summary>
/// <param name="status">Status snapshot to display.</param>
private void ApplyStatus(VoiceStatusInfo status)
{
    Title = $"Voice Mode ({GetWindowStateLabel(status)})";

    var isRecording = status.State == VoiceRuntimeState.RecordingUtterance;
    DraftCaptionTextBlock.Text = isRecording ? "You (speaking)" : "You (draft)";

    var hasError = !string.IsNullOrWhiteSpace(status.LastError);
    if (hasError)
    {
        TroubleshootingTextBlock.Visibility = Visibility.Visible;
        TroubleshootingTextBlock.Text = status.LastError;
    }
    else
    {
        TroubleshootingTextBlock.Visibility = Visibility.Collapsed;
        TroubleshootingTextBlock.Text = string.Empty;
    }

    var isPaused = status.State == VoiceRuntimeState.Paused;
    var controlsIdle = !_controlActionInFlight;

    PauseResumeButton.IsEnabled = controlsIdle && status.Mode != VoiceActivationMode.Off;
    PauseResumeIcon.Symbol = isPaused ? Symbol.Play : Symbol.Pause;
    ToolTipService.SetToolTip(PauseResumeButton, isPaused ? "Resume voice mode" : "Pause voice mode");

    SkipReplyButton.IsEnabled = controlsIdle && status.CanSkipReply;
}
/// <summary>
/// Moves and resizes the window to the saved placement, or to the default anchor
/// rectangle when nothing usable is saved, and applies the always-on-top preference.
/// </summary>
private void ApplyStoredWindowPlacement()
{
if (AppWindow is null)
{
return;
}
var prefs = _settings.VoiceRepeaterWindow;
// The saved size is used only when a placement was stored; missing values fall back to the defaults.
var width = prefs.HasSavedPlacement
? prefs.Width.GetValueOrDefault(DefaultWidth)
: DefaultWidth;
var height = prefs.HasSavedPlacement
? prefs.Height.GetValueOrDefault(DefaultHeight)
: DefaultHeight;
// Never go below 320x150.
var clampedWidth = Math.Max(width, 320);
var clampedHeight = Math.Max(height, 150);
IsAlwaysOnTop = prefs.FloatingEnabled;
var targetRect = prefs.HasSavedPlacement && prefs.X.HasValue && prefs.Y.HasValue
? new RectInt32(prefs.X.Value, prefs.Y.Value, clampedWidth, clampedHeight)
: GetDefaultAnchorRect(clampedWidth, clampedHeight);
// A saved rectangle that fails the visibility check is replaced by the default anchor.
if (!IsPlacementVisible(targetRect))
{
targetRect = GetDefaultAnchorRect(clampedWidth, clampedHeight);
}
try
{
AppWindow.MoveAndResize(targetRect);
}
catch
{
// Fallback: size and move separately if the combined call throws.
this.SetWindowSize(targetRect.Width, targetRect.Height);
AppWindow.Move(new PointInt32(targetRect.X, targetRect.Y));
}
}
/// <summary>
/// Applies the stored view preferences (auto-scroll, text size, floating) to the
/// fields, the existing conversation items and the settings controls. Settings-change
/// handlers are suppressed for the duration so these programmatic updates are not saved back.
/// </summary>
private void ApplyViewSettings()
{
    _suppressSettingsEvents = true;
    try
    {
        _autoScrollEnabled = _settings.VoiceRepeaterWindow.AutoScroll;

        // A non-positive stored size means "use the default"; the result is limited to 11..15.
        var requestedSize = _settings.VoiceRepeaterWindow.TextSize > 0
            ? _settings.VoiceRepeaterWindow.TextSize
            : DefaultTextSize;
        _messageFontSize = Math.Clamp(requestedSize, 11, 15);
        // Captions are three sizes smaller than messages, but never below 9.
        _captionFontSize = Math.Max(9, _messageFontSize - 3);

        DraftTextBlock.FontSize = _messageFontSize;
        DraftCaptionTextBlock.FontSize = _captionFontSize;
        TroubleshootingTextBlock.FontSize = _captionFontSize;

        foreach (var turn in _conversationItems)
        {
            turn.MessageFontSize = _messageFontSize;
            turn.CaptionFontSize = _captionFontSize;
        }

        AutoScrollCheckBox.IsChecked = _autoScrollEnabled;
        FloatingEnabledCheckBox.IsChecked = _settings.VoiceRepeaterWindow.FloatingEnabled;
        SelectTextSizeItem(_messageFontSize);
    }
    finally
    {
        _suppressSettingsEvents = false;
    }
}
/// <summary>
/// Persists the window's current size and position. Nothing is saved when the
/// window is closed, has no AppWindow, placement saving is suppressed, or the
/// window is minimized.
/// </summary>
private void SaveWindowPlacement()
{
    if (IsClosed || AppWindow is null || _suppressPlacementSave)
    {
        return;
    }

    // A minimized window does not report its real on-screen rectangle; saving it
    // would overwrite the user's last usable placement.
    if (AppWindow.Presenter is OverlappedPresenter { State: OverlappedPresenterState.Minimized })
    {
        return;
    }

    var size = AppWindow.Size;
    var position = AppWindow.Position;
    _settings.VoiceRepeaterWindow.Width = size.Width;
    _settings.VoiceRepeaterWindow.Height = size.Height;
    _settings.VoiceRepeaterWindow.X = position.X;
    _settings.VoiceRepeaterWindow.Y = position.Y;
    _settings.VoiceRepeaterWindow.HasSavedPlacement = true;
    _settings.Save(logSuccess: false);
    _placementDirty = false;
}
/// <summary>
/// Saves the window placement when it is dirty or the window is not yet closed.
/// </summary>
private void FlushWindowPlacement()
{
// NOTE(review): OnWindowClosed calls this before IsClosed is set, so `!IsClosed` is true there
// and the save is attempted regardless of _placementDirty. Confirm whether `&&` was intended.
if (_placementDirty || !IsClosed)
{
SaveWindowPlacement();
}
}
/// <summary>
/// Computes the default placement: anchored to the bottom-left of the primary
/// display's work area, inset by <c>DefaultMargin</c>.
/// </summary>
/// <param name="width">Window width.</param>
/// <param name="height">Window height.</param>
/// <returns>The rectangle to place the window at.</returns>
private RectInt32 GetDefaultAnchorRect(int width, int height)
{
    var workArea = DisplayArea.Primary.WorkArea;

    var left = workArea.X + DefaultMargin;
    // Keep at least one margin from the top even when the window is taller than the work area.
    var topOffset = Math.Max(DefaultMargin, workArea.Height - height - DefaultMargin);
    var top = workArea.Y + topOffset;

    return new RectInt32(left, top, width, height);
}
/// <summary>
/// Checks whether a non-empty rectangle overlaps the work area of the display
/// nearest to it.
/// </summary>
/// <param name="rect">Candidate window rectangle.</param>
/// <returns>True when the rectangle overlaps the work area; false otherwise, or when the display lookup throws.</returns>
private static bool IsPlacementVisible(RectInt32 rect)
{
    try
    {
        var workArea = DisplayArea.GetFromRect(rect, DisplayAreaFallback.Nearest).WorkArea;

        if (rect.Width <= 0 || rect.Height <= 0)
        {
            return false;
        }

        var overlapsHorizontally = rect.X < workArea.X + workArea.Width
            && rect.X + rect.Width > workArea.X;
        var overlapsVertically = rect.Y < workArea.Y + workArea.Height
            && rect.Y + rect.Height > workArea.Y;

        return overlapsHorizontally && overlapsVertically;
    }
    catch
    {
        return false;
    }
}
/// <summary>
/// Selects the combo box item whose Tag equals the rounded <paramref name="size"/>;
/// falls back to the item at index 2 when none matches.
/// </summary>
/// <param name="size">Font size to select.</param>
private void SelectTextSizeItem(double size)
{
    var wantedTag = ((int)Math.Round(size)).ToString();

    foreach (var candidate in TextSizeComboBox.Items)
    {
        if (candidate is not ComboBoxItem option)
        {
            continue;
        }

        if (!string.Equals(option.Tag?.ToString(), wantedTag, StringComparison.Ordinal))
        {
            continue;
        }

        TextSizeComboBox.SelectedItem = option;
        return;
    }

    TextSizeComboBox.SelectedIndex = 2;
}
/// <summary>
/// Shows the empty-conversation placeholder only when there are no conversation
/// items and the draft panel is not visible.
/// </summary>
private void UpdateConversationPlaceholder()
{
    var nothingToShow = _conversationItems.Count == 0
        && DraftPanel.Visibility != Visibility.Visible;

    if (nothingToShow)
    {
        EmptyConversationTextBlock.Visibility = Visibility.Visible;
    }
    else
    {
        EmptyConversationTextBlock.Visibility = Visibility.Collapsed;
    }
}
/// <summary>
/// When auto-scroll is enabled, queues a scroll to the bottom of the conversation.
/// The scroll is issued twice: once right after a forced layout pass and once more
/// from a second queued callback.
/// </summary>
private void ScrollConversationToEnd()
{
    if (!_autoScrollEnabled)
    {
        return;
    }

    var queue = DispatcherQueue.GetForCurrentThread();
    if (queue is null)
    {
        return;
    }

    void ScrollToBottom() =>
        ConversationScrollViewer.ChangeView(null, ConversationScrollViewer.ScrollableHeight, null, true);

    _ = queue.TryEnqueue(() =>
    {
        ConversationScrollViewer.UpdateLayout();
        ScrollToBottom();
        _ = queue.TryEnqueue(() => ScrollToBottom());
    });
}
/// <summary>
/// Maps a runtime status to the short label shown in the window title.
/// </summary>
/// <param name="status">Status snapshot to describe.</param>
/// <returns>
/// A lower-case label for the runtime state. For states without a label of their
/// own it returns "off" when the activation mode is Off and "idle" otherwise.
/// </returns>
private static string GetWindowStateLabel(VoiceStatusInfo status)
{
    switch (status.State)
    {
        case VoiceRuntimeState.ListeningForVoiceWake:
        case VoiceRuntimeState.ListeningContinuously:
            return "listening";
        case VoiceRuntimeState.RecordingUtterance:
            return "hearing you";
        case VoiceRuntimeState.AwaitingResponse:
            return "waiting";
        case VoiceRuntimeState.PlayingResponse:
            return "speaking";
        case VoiceRuntimeState.Paused:
            return "paused";
        case VoiceRuntimeState.Arming:
            return "starting";
        case VoiceRuntimeState.Error:
            return "error";
        default:
            return status.Mode == VoiceActivationMode.Off ? "off" : "idle";
    }
}
/// <summary>
/// One transcript entry. Speaker, timestamp and message are fixed at construction;
/// the two font sizes can change later and raise <see cref="PropertyChanged"/>.
/// </summary>
private sealed class ConversationItem : INotifyPropertyChanged
{
    // Font-size changes smaller than this are treated as "no change".
    private const double SizeTolerance = 0.01;

    private double _messageFontSize;
    private double _captionFontSize;

    public ConversationItem(
        string speaker,
        string timestamp,
        string message,
        double messageFontSize,
        double captionFontSize)
    {
        Speaker = speaker;
        Timestamp = timestamp;
        Message = message;
        _messageFontSize = messageFontSize;
        _captionFontSize = captionFontSize;
    }

    public event PropertyChangedEventHandler? PropertyChanged;

    public string Speaker { get; }

    public string Timestamp { get; }

    public string Message { get; }

    /// <summary>Caption line combining speaker and timestamp.</summary>
    public string Caption => $"{Speaker} · {Timestamp}";

    /// <summary>Font size of the message text.</summary>
    public double MessageFontSize
    {
        get => _messageFontSize;
        set => UpdateSize(ref _messageFontSize, value, nameof(MessageFontSize));
    }

    /// <summary>Font size of the caption text.</summary>
    public double CaptionFontSize
    {
        get => _captionFontSize;
        set => UpdateSize(ref _captionFontSize, value, nameof(CaptionFontSize));
    }

    // Stores the new size and raises PropertyChanged, unless the change is within tolerance.
    private void UpdateSize(ref double field, double value, string propertyName)
    {
        if (Math.Abs(field - value) < SizeTolerance)
        {
            return;
        }

        field = value;
        PropertyChanged?.Invoke(this, new PropertyChangedEventArgs(propertyName));
    }
}
}

View File

@ -0,0 +1,15 @@
namespace OpenClawTray.Windows;
/// <summary>
/// Holds the voice transcript draft that is waiting to be pushed into the web chat page.
/// </summary>
internal sealed class WebChatVoiceDomState
{
    /// <summary>The pending draft text; never null, empty when there is no draft.</summary>
    public string PendingDraft { get; private set; } = string.Empty;

    /// <summary>Replaces the pending draft.</summary>
    /// <param name="text">New draft text; null is stored as empty.</param>
    /// <param name="clear">When true the draft is emptied regardless of <paramref name="text"/>.</param>
    public void SetDraft(string? text, bool clear)
    {
        if (clear || text is null)
        {
            PendingDraft = string.Empty;
            return;
        }

        PendingDraft = text;
    }
}

View File

@ -3,6 +3,7 @@ using Microsoft.Web.WebView2.Core;
using OpenClaw.Shared;
using OpenClawTray.Helpers;
using OpenClawTray.Services;
using OpenClawTray.Services.Voice;
using System;
using System.Diagnostics;
using System.IO;
@ -14,14 +15,16 @@ using Windows.Foundation;
namespace OpenClawTray.Windows;
public sealed partial class WebChatWindow : WindowEx
, IVoiceChatWindow
{
private readonly string _gatewayUrl;
private readonly string _token;
// Store event handlers for cleanup
private readonly WebChatVoiceDomState _voiceDomState;
private bool _voiceDomReady;
private TypedEventHandler<CoreWebView2, CoreWebView2NavigationCompletedEventArgs>? _navigationCompletedHandler;
private TypedEventHandler<CoreWebView2, CoreWebView2NavigationStartingEventArgs>? _navigationStartingHandler;
public bool IsClosed { get; private set; }
public WebChatWindow(string gatewayUrl, string token)
@ -29,18 +32,18 @@ public sealed partial class WebChatWindow : WindowEx
Logger.Info($"WebChatWindow: Constructor called, gateway={gatewayUrl}");
_gatewayUrl = gatewayUrl;
_token = token;
_voiceDomState = new WebChatVoiceDomState();
InitializeComponent();
// Window configuration
this.SetWindowSize(520, 750);
this.MinWidth = 380;
this.MinHeight = 450;
this.CenterOnScreen();
this.SetIcon(IconHelper.GetStatusIconPath(ConnectionStatus.Connected));
this.SetIcon(AppIconHelper.GetStatusIconPath(ConnectionStatus.Connected));
Closed += OnWindowClosed;
Logger.Info("WebChatWindow: Starting InitializeWebViewAsync");
_ = InitializeWebViewAsync();
}
@ -48,8 +51,8 @@ public sealed partial class WebChatWindow : WindowEx
private void OnWindowClosed(object sender, WindowEventArgs e)
{
IsClosed = true;
// Cleanup WebView2 event handlers
_voiceDomReady = false;
if (WebView.CoreWebView2 != null)
{
if (_navigationCompletedHandler != null)
@ -64,35 +67,39 @@ public sealed partial class WebChatWindow : WindowEx
try
{
Logger.Info("WebChatWindow: Initializing WebView2...");
// Set up user data folder for WebView2
var userDataFolder = Path.Combine(
Environment.GetFolderPath(Environment.SpecialFolder.LocalApplicationData),
"OpenClawTray", "WebView2");
Directory.CreateDirectory(userDataFolder);
Logger.Info($"WebChatWindow: User data folder: {userDataFolder}");
// Set environment variable for user data folder
Environment.SetEnvironmentVariable("WEBVIEW2_USER_DATA_FOLDER", userDataFolder);
Logger.Info("WebChatWindow: Calling EnsureCoreWebView2Async...");
await WebView.EnsureCoreWebView2Async();
Logger.Info("WebChatWindow: CoreWebView2 initialized successfully");
// Configure WebView2
WebView.CoreWebView2.Settings.IsStatusBarEnabled = false;
WebView.CoreWebView2.Settings.AreDefaultContextMenusEnabled = true;
WebView.CoreWebView2.Settings.IsZoomControlEnabled = true;
await WebView.CoreWebView2.AddScriptToExecuteOnDocumentCreatedAsync(WebChatVoiceDomBridge.DocumentCreatedScript);
_voiceDomReady = false;
// Handle navigation events (store for cleanup)
_navigationCompletedHandler = (s, e) =>
{
Logger.Info($"WebChatWindow: Navigation completed, success={e.IsSuccess}, status={e.WebErrorStatus}");
LoadingRing.IsActive = false;
LoadingRing.Visibility = Visibility.Collapsed;
// Show friendly error if connection failed
_voiceDomReady = e.IsSuccess;
if (e.IsSuccess)
{
_ = RefreshTrayVoiceDomStateAsync();
}
if (!e.IsSuccess && (e.WebErrorStatus == CoreWebView2WebErrorStatus.ConnectionAborted ||
e.WebErrorStatus == CoreWebView2WebErrorStatus.CannotConnect ||
e.WebErrorStatus == CoreWebView2WebErrorStatus.ConnectionReset ||
@ -115,15 +122,14 @@ public sealed partial class WebChatWindow : WindowEx
_navigationStartingHandler = (s, e) =>
{
// Strip query params to avoid logging tokens
var safeUri = e.Uri?.Split('?')[0] ?? "unknown";
Logger.Info($"WebChatWindow: Navigation starting to {safeUri}");
_voiceDomReady = false;
LoadingRing.IsActive = true;
LoadingRing.Visibility = Visibility.Visible;
};
WebView.CoreWebView2.NavigationStarting += _navigationStartingHandler;
// Navigate to chat
NavigateToChat();
}
catch (Exception ex)
@ -135,13 +141,12 @@ public sealed partial class WebChatWindow : WindowEx
Logger.Error($"WebView2 inner exception: {ex.InnerException.GetType().FullName}: {ex.InnerException.Message}");
}
Logger.Error($"WebView2 stack trace: {ex.StackTrace}");
// Show error in the dialog instead of falling back to browser
LoadingRing.IsActive = false;
LoadingRing.Visibility = Visibility.Collapsed;
WebView.Visibility = Visibility.Collapsed;
ErrorPanel.Visibility = Visibility.Visible;
var errorDetails = $"Exception: {ex.GetType().FullName}\n" +
$"HResult: 0x{ex.HResult:X8}\n" +
$"Message: {ex.Message}\n\n" +
@ -149,17 +154,16 @@ public sealed partial class WebChatWindow : WindowEx
$"Architecture: {RuntimeInformation.ProcessArchitecture}\n" +
$"OS: {RuntimeInformation.OSDescription}\n\n" +
$"Stack Trace:\n{ex.StackTrace}";
if (ex.InnerException != null)
{
errorDetails += $"\n\nInner Exception: {ex.InnerException.GetType().FullName}\n{ex.InnerException.Message}";
}
ErrorText.Text = errorDetails;
}
}
// Set to a test URL to bypass gateway (e.g., "https://www.bing.com"), or null for normal operation
private const string? DEBUG_TEST_URL = null;
private static bool IsLocalHost(Uri uri)
@ -208,12 +212,11 @@ public sealed partial class WebChatWindow : WindowEx
ErrorPanel.Visibility = Visibility.Visible;
ErrorText.Text = message;
}
private void NavigateToChat()
{
if (WebView.CoreWebView2 == null) return;
// If debug URL is set, use it instead of gateway
if (!string.IsNullOrEmpty(DEBUG_TEST_URL))
{
Logger.Info($"WebChatWindow: DEBUG MODE - Navigating to test URL: {DEBUG_TEST_URL}");
@ -251,7 +254,7 @@ public sealed partial class WebChatWindow : WindowEx
ShowErrorMessage(errorMessage);
return;
}
try
{
Process.Start(new ProcessStartInfo(url) { UseShellExecute = true });
@ -266,4 +269,34 @@ public sealed partial class WebChatWindow : WindowEx
{
WebView.CoreWebView2?.OpenDevToolsWindow();
}
/// <summary>
/// Records a new voice transcript draft on the window's voice DOM state and then
/// asks the hosted page to refresh from that state.
/// </summary>
/// <param name="text">Draft text forwarded to <c>_voiceDomState.SetDraft</c>.</param>
/// <param name="clear">Clear flag forwarded to <c>_voiceDomState.SetDraft</c>.</param>
/// <returns>A task that completes once the refresh attempt has finished.</returns>
public async Task UpdateVoiceTranscriptDraftAsync(string text, bool clear)
{
// Update the in-memory state first so the refresh below pushes the latest draft.
_voiceDomState.SetDraft(text, clear);
await RefreshTrayVoiceDomStateAsync();
}
/// <summary>
/// Currently a no-op: the supplied conversation turn is ignored and an already
/// completed task is returned.
/// </summary>
/// <param name="args">The conversation turn event data (unused).</param>
/// <returns>A completed task.</returns>
public Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args) => Task.CompletedTask;
/// <summary>
/// Applies the pending voice draft to the hosted page and then runs the
/// legacy-turn cleanup script.
/// </summary>
/// <remarks>
/// Returns immediately when the WebView2 core is missing, the voice DOM is not
/// flagged ready, or the window is closed. Script failures are logged as
/// warnings and are not rethrown.
/// </remarks>
private async Task RefreshTrayVoiceDomStateAsync()
{
    if (WebView.CoreWebView2 is null || !_voiceDomReady || IsClosed)
    {
        return;
    }

    try
    {
        // First push the current draft, then clear turns left behind by the legacy script.
        await WebView.CoreWebView2.ExecuteScriptAsync(
            WebChatVoiceDomBridge.BuildSetDraftScript(_voiceDomState.PendingDraft));

        await WebView.CoreWebView2.ExecuteScriptAsync(
            WebChatVoiceDomBridge.ClearLegacyTurnsScript);
    }
    catch (Exception scriptFailure)
    {
        Logger.Warn($"WebChatWindow: Failed to apply voice DOM state: {scriptFailure.Message}");
    }
}
}

View File

@ -965,3 +965,234 @@ public class CameraCapabilityTests
Assert.Contains("Camera access blocked", res.Error);
}
}
/// <summary>
/// Unit tests for <see cref="VoiceCapability"/>: which commands it recognises and
/// how each voice command is forwarded to the handler subscribed on the capability.
/// </summary>
public class VoiceCapabilityTests
{
    /// <summary>
    /// Parses JSON text and returns a cloned root element, so the result stays
    /// usable after the parsing document has been disposed.
    /// </summary>
    private static JsonElement Parse(string json)
    {
        using var document = JsonDocument.Parse(json);
        return document.RootElement.Clone();
    }

    /// <summary>
    /// Builds an invoke request for <paramref name="command"/> whose arguments are
    /// parsed from <paramref name="argsJson"/>.
    /// </summary>
    private static NodeInvokeRequest BuildRequest(string id, string command, string argsJson)
    {
        return new NodeInvokeRequest
        {
            Id = id,
            Command = command,
            Args = Parse(argsJson)
        };
    }

    [Fact]
    public void CanHandle_VoiceCommands()
    {
        var capability = new VoiceCapability(NullLogger.Instance);

        var supportedCommands = new[]
        {
            VoiceCommands.ListDevices,
            VoiceCommands.GetSettings,
            VoiceCommands.SetSettings,
            VoiceCommands.GetStatus,
            VoiceCommands.Start,
            VoiceCommands.Stop
        };

        foreach (var command in supportedCommands)
        {
            Assert.True(capability.CanHandle(command));
        }

        Assert.False(capability.CanHandle("voice.unknown"));
        Assert.Equal("voice", capability.Category);
    }

    [Fact]
    public async Task ListDevices_ReturnsArrayFromHandler()
    {
        // Arrange
        var capability = new VoiceCapability(NullLogger.Instance);
        capability.ListDevicesRequested += () =>
        {
            return Task.FromResult<VoiceAudioDeviceInfo[]>(
            [
                new VoiceAudioDeviceInfo
                {
                    DeviceId = "default-input",
                    Name = "System default microphone",
                    IsDefault = true,
                    IsInput = true
                }
            ]);
        };

        // Act
        var response = await capability.ExecuteAsync(
            BuildRequest("voice1", VoiceCommands.ListDevices, """{}"""));

        // Assert
        Assert.True(response.Ok);
        var payloadJson = JsonSerializer.Serialize(response.Payload);
        using var payload = JsonDocument.Parse(payloadJson);
        var root = payload.RootElement;
        Assert.Equal(JsonValueKind.Array, root.ValueKind);
        Assert.Equal("default-input", root[0].GetProperty("DeviceId").GetString());
    }

    [Fact]
    public async Task GetSettings_ReturnsSettingsFromHandler()
    {
        // Arrange
        var capability = new VoiceCapability(NullLogger.Instance);
        capability.SettingsRequested += () =>
        {
            return Task.FromResult(new VoiceSettings
            {
                Enabled = true,
                Mode = VoiceActivationMode.VoiceWake
            });
        };

        // Act
        var response = await capability.ExecuteAsync(
            BuildRequest("voice2", VoiceCommands.GetSettings, """{}"""));

        // Assert
        Assert.True(response.Ok);
        var payloadJson = JsonSerializer.Serialize(response.Payload);
        using var payload = JsonDocument.Parse(payloadJson);
        var root = payload.RootElement;
        Assert.True(root.GetProperty("Enabled").GetBoolean());
        Assert.Equal("VoiceWake", root.GetProperty("Mode").GetString());
    }

    [Fact]
    public async Task SetSettings_UsesUpdateEnvelope_WhenPresent()
    {
        // Arrange: remember what the capability hands to the handler.
        var capability = new VoiceCapability(NullLogger.Instance);
        VoiceSettingsUpdateArgs? forwarded = null;
        capability.SettingsUpdateRequested += update =>
        {
            forwarded = update;
            return Task.FromResult(update.Settings);
        };

        // Act
        var response = await capability.ExecuteAsync(BuildRequest(
            "voice3",
            VoiceCommands.SetSettings,
            """{"update":{"persist":false,"settings":{"enabled":true,"mode":"TalkMode"}}}"""));

        // Assert
        Assert.True(response.Ok);
        Assert.NotNull(forwarded);
        Assert.False(forwarded!.Persist);
        Assert.Equal(VoiceActivationMode.TalkMode, forwarded.Settings.Mode);
    }

    [Fact]
    public async Task GetStatus_ReturnsStatusFromHandler()
    {
        // Arrange
        var capability = new VoiceCapability(NullLogger.Instance);
        capability.StatusRequested += () =>
        {
            return Task.FromResult(new VoiceStatusInfo
            {
                Available = true,
                Running = true,
                Mode = VoiceActivationMode.TalkMode,
                State = VoiceRuntimeState.ListeningContinuously
            });
        };

        // Act
        var response = await capability.ExecuteAsync(
            BuildRequest("voice4", VoiceCommands.GetStatus, """{}"""));

        // Assert
        Assert.True(response.Ok);
        var payloadJson = JsonSerializer.Serialize(response.Payload);
        using var payload = JsonDocument.Parse(payloadJson);
        var root = payload.RootElement;
        Assert.True(root.GetProperty("Running").GetBoolean());
        Assert.Equal("ListeningContinuously", root.GetProperty("State").GetString());
    }

    [Fact]
    public async Task Start_PassesArgsToHandler()
    {
        // Arrange
        var capability = new VoiceCapability(NullLogger.Instance);
        VoiceStartArgs? forwarded = null;
        capability.StartRequested += args =>
        {
            forwarded = args;
            var status = new VoiceStatusInfo
            {
                Available = true,
                Running = true,
                Mode = args.Mode ?? VoiceActivationMode.Off,
                State = VoiceRuntimeState.ListeningForVoiceWake,
                SessionKey = args.SessionKey
            };
            return Task.FromResult(status);
        };

        // Act
        var response = await capability.ExecuteAsync(BuildRequest(
            "voice5",
            VoiceCommands.Start,
            """{"mode":"VoiceWake","sessionKey":"session-123"}"""));

        // Assert
        Assert.True(response.Ok);
        Assert.NotNull(forwarded);
        Assert.Equal(VoiceActivationMode.VoiceWake, forwarded!.Mode);
        Assert.Equal("session-123", forwarded.SessionKey);
    }

    [Fact]
    public async Task Stop_PassesReasonToHandler()
    {
        // Arrange
        var capability = new VoiceCapability(NullLogger.Instance);
        VoiceStopArgs? forwarded = null;
        capability.StopRequested += args =>
        {
            forwarded = args;
            var status = new VoiceStatusInfo
            {
                Available = true,
                Running = false,
                Mode = VoiceActivationMode.Off,
                State = VoiceRuntimeState.Stopped,
                LastError = args.Reason
            };
            return Task.FromResult(status);
        };

        // Act
        var response = await capability.ExecuteAsync(BuildRequest(
            "voice6",
            VoiceCommands.Stop,
            """{"reason":"user requested"}"""));

        // Assert
        Assert.True(response.Ok);
        Assert.NotNull(forwarded);
        Assert.Equal("user requested", forwarded!.Reason);
    }

    [Fact]
    public async Task Start_ReturnsError_WhenHandlerMissing()
    {
        // No StartRequested subscriber is attached on purpose.
        var capability = new VoiceCapability(NullLogger.Instance);

        var response = await capability.ExecuteAsync(
            BuildRequest("voice7", VoiceCommands.Start, """{}"""));

        Assert.False(response.Ok);
        Assert.Contains("not available", response.Error, StringComparison.OrdinalIgnoreCase);
    }

    [Fact]
    public async Task LegacyVoiceSkipCommand_RemainsAccepted()
    {
        // Arrange
        var capability = new VoiceCapability(NullLogger.Instance);
        VoiceSkipArgs? forwarded = null;
        capability.SkipRequested += args =>
        {
            forwarded = args;
            var status = new VoiceStatusInfo
            {
                Available = true,
                Running = true,
                Mode = VoiceActivationMode.TalkMode,
                State = VoiceRuntimeState.PlayingResponse
            };
            return Task.FromResult(status);
        };

        // Act: use the literal legacy command name rather than a VoiceCommands constant.
        var response = await capability.ExecuteAsync(BuildRequest(
            "voice8",
            "voice.skip",
            """{"reason":"legacy caller"}"""));

        // Assert
        Assert.True(response.Ok);
        Assert.NotNull(forwarded);
        Assert.Equal("legacy caller", forwarded!.Reason);
    }
}

View File

@ -588,7 +588,72 @@ public class SystemCapabilityExecApprovalsTests
var result = await cap.ExecuteAsync(request);
Assert.True(result.Ok);
}
/// <summary>
/// With a default-Allow policy and a Deny rule for <c>Remove-Item *</c>, a
/// <c>system.run</c> request that passes the command as the <c>-Command</c>
/// payload of a PowerShell wrapper must still be denied.
/// </summary>
[Fact]
public async Task SystemRun_WithDefaultAllow_DeniesDangerousPowerShellWrapperPayload()
{
    var tempDir = Path.Combine(Path.GetTempPath(), $"test-{Guid.NewGuid():N}");
    Directory.CreateDirectory(tempDir);
    try
    {
        var policy = new ExecApprovalPolicy(tempDir, _logger);
        policy.SetRules(
            new[]
            {
                new ExecApprovalRule { Pattern = "Remove-Item *", Action = ExecApprovalAction.Deny }
            },
            ExecApprovalAction.Allow);
        var cap = CreateCapability(policy);

        // JsonDocument rents pooled buffers, so dispose it. Clone() gives the request
        // an element that does not depend on the document's lifetime.
        using var argsDocument = JsonDocument.Parse("{\"command\":[\"powershell\",\"-Command\",\"Remove-Item -Recurse -Force C:\\\\important\"]}");
        var request = new NodeInvokeRequest
        {
            Command = "system.run",
            Args = argsDocument.RootElement.Clone()
        };

        var result = await cap.ExecuteAsync(request);

        Assert.False(result.Ok);
        Assert.Contains("denied", result.Error!, StringComparison.OrdinalIgnoreCase);
    }
    finally
    {
        // Best-effort cleanup: a leftover temp directory must not fail the test.
        try { Directory.Delete(tempDir, true); } catch { }
    }
}
/// <summary>
/// With a default-Deny policy that allows <c>echo *</c> and denies <c>del *</c>,
/// a chained cmd command (<c>echo ok &amp; del ...</c>) must be denied because one
/// of its segments matches the Deny rule.
/// </summary>
[Fact]
public async Task SystemRun_WithCommandChain_DeniesBlockedSegment()
{
    var tempDir = Path.Combine(Path.GetTempPath(), $"test-{Guid.NewGuid():N}");
    Directory.CreateDirectory(tempDir);
    try
    {
        var policy = new ExecApprovalPolicy(tempDir, _logger);
        policy.SetRules(
            new[]
            {
                new ExecApprovalRule { Pattern = "echo *", Action = ExecApprovalAction.Allow },
                new ExecApprovalRule { Pattern = "del *", Action = ExecApprovalAction.Deny }
            },
            ExecApprovalAction.Deny);
        var cap = CreateCapability(policy);

        // JsonDocument rents pooled buffers, so dispose it. Clone() gives the request
        // an element that does not depend on the document's lifetime.
        using var argsDocument = JsonDocument.Parse("{\"command\":\"echo ok & del /s /q C:\\\\important\\\\*\",\"shell\":\"cmd\"}");
        var request = new NodeInvokeRequest
        {
            Command = "system.run",
            Args = argsDocument.RootElement.Clone()
        };

        var result = await cap.ExecuteAsync(request);

        Assert.False(result.Ok);
        Assert.Contains("denied", result.Error!, StringComparison.OrdinalIgnoreCase);
    }
    finally
    {
        // Best-effort cleanup: a leftover temp directory must not fail the test.
        try { Directory.Delete(tempDir, true); } catch { }
    }
}
[Fact]
public async Task ExecApprovalsGet_ReturnsPolicy()
{

View File

@ -1,6 +1,8 @@
using System;
using System.Collections;
using System.Collections.Generic;
using System.Linq;
using System.Reflection;
using System.Text.Json;
using Xunit;
using OpenClaw.Shared;
@ -78,6 +80,54 @@ public class OpenClawGatewayClientTests
return _client.GetSessionList();
}
/// <summary>Reads the client's private <c>_defaultChatSessionKey</c> field.</summary>
public string GetDefaultChatSessionKey() =>
    GetPrivateField<string>("_defaultChatSessionKey");
/// <summary>
/// Parses <paramref name="payloadJson"/> and passes its root element to the client's
/// non-public instance method <c>UpdateDefaultChatSessionKeyFromHello</c> via reflection.
/// </summary>
public void UpdateDefaultChatSessionKeyFromHello(string payloadJson)
{
    using var payload = JsonDocument.Parse(payloadJson);

    const BindingFlags PrivateInstance = BindingFlags.NonPublic | BindingFlags.Instance;
    var target = typeof(OpenClawGatewayClient).GetMethod(
        "UpdateDefaultChatSessionKeyFromHello",
        PrivateInstance);

    // Clone so the element handed over does not depend on the disposable document.
    object[] arguments = { payload.RootElement.Clone() };
    target!.Invoke(_client, arguments);
}
/// <summary>
/// Builds chat-send parameters through the client's non-public
/// <c>BuildChatSendParameters</c> method and serializes them as a
/// <c>chat.send</c> request with the fixed id <c>request-123</c>.
/// </summary>
/// <returns>The string returned by the client's non-public static <c>SerializeRequest</c>.</returns>
public string SerializeChatSendRequest(string message, string sessionKey, string idempotencyKey)
{
    var clientType = typeof(OpenClawGatewayClient);

    var buildParameters = clientType.GetMethod(
        "BuildChatSendParameters",
        BindingFlags.NonPublic | BindingFlags.Instance);
    object[] buildArguments = { message, sessionKey, idempotencyKey };
    var parameters = buildParameters!.Invoke(_client, buildArguments);

    var serialize = clientType.GetMethod(
        "SerializeRequest",
        BindingFlags.NonPublic | BindingFlags.Static);
    object[] serializeArguments = { "request-123", "chat.send", parameters! };
    return (string)serialize!.Invoke(null, serializeArguments)!;
}
/// <summary>
/// Builds connect parameters through the client's non-public
/// <c>BuildConnectParameters</c> method and serializes them as a
/// <c>connect</c> request with the fixed id <c>request-456</c>.
/// </summary>
/// <returns>The string returned by the client's non-public static <c>SerializeRequest</c>.</returns>
public string SerializeConnectRequest()
{
    var clientType = typeof(OpenClawGatewayClient);

    var buildParameters = clientType.GetMethod(
        "BuildConnectParameters",
        BindingFlags.NonPublic | BindingFlags.Instance);
    var parameters = buildParameters!.Invoke(_client, Array.Empty<object>());

    var serialize = clientType.GetMethod(
        "SerializeRequest",
        BindingFlags.NonPublic | BindingFlags.Static);
    object[] serializeArguments = { "request-456", "connect", parameters! };
    return (string)serialize!.Invoke(null, serializeArguments)!;
}
/// <summary>
/// Invokes the client's non-public static <c>NormalizeChatSessionKey</c> via
/// reflection and returns its result.
/// </summary>
public string NormalizeChatSessionKey(string? sessionKey)
{
    const BindingFlags PrivateStatic = BindingFlags.NonPublic | BindingFlags.Static;
    var normalize = typeof(OpenClawGatewayClient).GetMethod("NormalizeChatSessionKey", PrivateStatic);

    object?[] arguments = { sessionKey };
    return (string)normalize!.Invoke(null, arguments)!;
}
public void SetUnsupportedMethodFlags(bool usageStatus, bool usageCost, bool sessionPreview, bool nodeList)
{
SetPrivateField("_usageStatusUnsupported", usageStatus);
@ -134,6 +184,70 @@ public class OpenClawGatewayClientTests
return parsed ?? new SessionsPreviewPayloadInfo();
}
/// <summary>
/// Feeds <paramref name="payloadJson"/> to the client's non-public
/// <c>HandleChatEvent</c> method and returns the last
/// <c>ChatMessageReceived</c> event raised while it ran.
/// </summary>
/// <returns>The captured event arguments, or <c>null</c> when no event was raised.</returns>
public ChatMessageEventArgs? HandleChatEventAndCaptureMessage(string payloadJson)
{
    ChatMessageEventArgs? lastMessage = null;
    EventHandler<ChatMessageEventArgs> recorder = (_, message) => lastMessage = message;

    _client.ChatMessageReceived += recorder;
    try
    {
        using var payload = JsonDocument.Parse(payloadJson);
        var handleChatEvent = typeof(OpenClawGatewayClient).GetMethod(
            "HandleChatEvent",
            BindingFlags.NonPublic | BindingFlags.Instance);
        object[] arguments = { payload.RootElement.Clone() };
        handleChatEvent!.Invoke(_client, arguments);
    }
    finally
    {
        // Always detach so the recorder cannot observe events from later calls.
        _client.ChatMessageReceived -= recorder;
    }

    return lastMessage;
}
/// <summary>
/// Returns the number of entries in the client's private
/// <c>_pendingChatPreviewSessionKeys</c> dictionary.
/// </summary>
public int GetPendingChatPreviewSessionCount() =>
    GetPrivateField<IDictionary>("_pendingChatPreviewSessionKeys").Count;
/// <summary>
/// Inserts (or replaces) an entry for <paramref name="sessionKey"/> in the client's
/// private <c>_pendingChatPreviewSessionKeys</c> dictionary, using a reflection-created
/// instance of the nested non-public <c>PendingChatPreviewState</c> type.
/// </summary>
/// <param name="sessionKey">Dictionary key for the pending entry.</param>
/// <param name="lastKnownAssistantText">Value assigned to the state's <c>LastKnownAssistantText</c> property.</param>
/// <param name="attemptCount">Value assigned to the state's <c>AttemptCount</c> property.</param>
public void AddPendingChatPreviewSession(string sessionKey, string? lastKnownAssistantText = null, int attemptCount = 0)
{
    var pendingSessions = GetPrivateField<IDictionary>("_pendingChatPreviewSessionKeys");

    var pendingStateType = typeof(OpenClawGatewayClient).GetNestedType(
        "PendingChatPreviewState",
        BindingFlags.NonPublic)!;
    var pendingState = Activator.CreateInstance(pendingStateType)!;

    AssignStateProperty("LastKnownAssistantText", lastKnownAssistantText);
    AssignStateProperty("AttemptCount", attemptCount);

    pendingSessions[sessionKey] = pendingState;

    // Sets one property on the freshly created state object.
    void AssignStateProperty(string propertyName, object? value) =>
        pendingStateType.GetProperty(propertyName)!.SetValue(pendingState, value);
}
/// <summary>
/// Stores <paramref name="text"/> under <paramref name="sessionKey"/> in the client's
/// private <c>_lastAssistantMessagesBySession</c> dictionary.
/// </summary>
public void SetLastAssistantMessage(string sessionKey, string text) =>
    GetPrivateField<IDictionary>("_lastAssistantMessagesBySession")[sessionKey] = text;
/// <summary>
/// Runs the client's <c>ParseSessionsPreview</c> parser on <paramref name="payloadJson"/>
/// and returns the last <c>ChatMessageReceived</c> event raised while it ran.
/// </summary>
/// <returns>The captured event arguments, or <c>null</c> when no event was raised.</returns>
public ChatMessageEventArgs? ParseSessionsPreviewPayloadAndCaptureMessage(string payloadJson)
{
    ChatMessageEventArgs? lastMessage = null;
    EventHandler<ChatMessageEventArgs> recorder = (_, message) => lastMessage = message;

    _client.ChatMessageReceived += recorder;
    try
    {
        InvokePrivatePayloadParser("ParseSessionsPreview", payloadJson);
    }
    finally
    {
        // Always detach so the recorder cannot observe events from later calls.
        _client.ChatMessageReceived -= recorder;
    }

    return lastMessage;
}
public GatewayNodeInfo[] ParseNodeListPayload(string payloadJson)
{
GatewayNodeInfo[] parsed = Array.Empty<GatewayNodeInfo>();
@ -834,6 +948,164 @@ public class OpenClawGatewayClientTests
Assert.Equal("degraded", channels[0].Status);
}
/// <summary>
/// A hello payload whose snapshot names <c>agent:main:main</c> as the main session
/// key leaves the default chat session key at <c>main</c>.
/// </summary>
[Fact]
public void UpdateDefaultChatSessionKeyFromHello_UsesSnapshotMainSessionKey()
{
    // Arrange
    var gateway = new GatewayClientTestHelper();
    const string helloPayload = """
        {
        "type": "hello-ok",
        "snapshot": {
        "sessionDefaults": {
        "mainSessionKey": "agent:main:main"
        }
        }
        }
        """;

    // Act
    gateway.UpdateDefaultChatSessionKeyFromHello(helloPayload);

    // Assert
    Assert.Equal("main", gateway.GetDefaultChatSessionKey());
}
/// <summary>
/// Parsing a sessions payload that flags <c>agent:main:main</c> as the main session
/// leaves the default chat session key at <c>main</c>.
/// </summary>
[Fact]
public void ParseSessions_MainSession_UpdatesDefaultChatSessionKey()
{
    // Arrange
    var gateway = new GatewayClientTestHelper();
    const string sessionsPayload = """
        {
        "agent:main:main": {
        "status": "active",
        "displayName": "Main",
        "isMain": true
        },
        "agent:other:test": {
        "status": "active"
        }
        }
        """;

    // Act
    gateway.ParseSessionsPayload(sessionsPayload);

    // Assert
    Assert.Equal("main", gateway.GetDefaultChatSessionKey());
}
/// <summary>
/// The serialized chat-send request carries the message, session key and
/// idempotency key under <c>params</c>.
/// </summary>
[Fact]
public void SerializeChatSendRequest_IncludesSessionKeyAndIdempotencyKey()
{
    // Arrange
    var gateway = new GatewayClientTestHelper();

    // Act
    var requestJson = gateway.SerializeChatSendRequest("hello", "main", "idem-123");

    // Assert
    using var request = JsonDocument.Parse(requestJson);
    var requestParams = request.RootElement.GetProperty("params");
    Assert.Equal("hello", requestParams.GetProperty("message").GetString());
    Assert.Equal("main", requestParams.GetProperty("sessionKey").GetString());
    Assert.Equal("idem-123", requestParams.GetProperty("idempotencyKey").GetString());
}
/// <summary>
/// <c>agent:main:main</c> and <c>main</c> both normalize to <c>main</c>, while a
/// non-main key is returned unchanged.
/// </summary>
[Fact]
public void NormalizeChatSessionKey_CollapsesExpandedMainKey()
{
    var gateway = new GatewayClientTestHelper();

    // Expanded and short main keys collapse to the same value.
    Assert.Equal("main", gateway.NormalizeChatSessionKey("agent:main:main"));
    Assert.Equal("main", gateway.NormalizeChatSessionKey("main"));

    // Other keys pass through untouched.
    Assert.Equal("agent:sub:test", gateway.NormalizeChatSessionKey("agent:sub:test"));
}
/// <summary>
/// A final chat event with no message raises no <c>ChatMessageReceived</c> event and
/// leaves exactly one pending chat-preview session.
/// </summary>
[Fact]
public void HandleChatEvent_FinalWithoutMessage_QueuesPreviewLookup()
{
    // Arrange
    var gateway = new GatewayClientTestHelper();
    const string finalEventWithoutMessage = """
        {
        "type": "event",
        "event": "chat",
        "payload": {
        "sessionKey": "agent:main:main",
        "state": "final"
        }
        }
        """;

    // Act
    var emitted = gateway.HandleChatEventAndCaptureMessage(finalEventWithoutMessage);

    // Assert
    Assert.Null(emitted);
    Assert.Equal(1, gateway.GetPendingChatPreviewSessionCount());
}
/// <summary>
/// With a pending preview queued for <c>main</c>, a sessions-preview payload for
/// <c>agent:main:main</c> emits the assistant text as a final message and clears
/// the pending entry.
/// </summary>
[Fact]
public void ParseSessionsPreview_EmitsAssistantMessage_ForQueuedFinalPreview()
{
    // Arrange
    var gateway = new GatewayClientTestHelper();
    gateway.AddPendingChatPreviewSession("main");
    const string previewPayload = """
        {
        "ts": 1739760000000,
        "previews": [
        {
        "key": "agent:main:main",
        "status": "ok",
        "items": [
        { "role": "user", "text": "hello" },
        { "role": "assistant", "text": "world" }
        ]
        }
        ]
        }
        """;

    // Act
    var emitted = gateway.ParseSessionsPreviewPayloadAndCaptureMessage(previewPayload);

    // Assert
    Assert.NotNull(emitted);
    Assert.Equal("main", emitted!.SessionKey);
    Assert.Equal("assistant", emitted.Role);
    Assert.Equal("world", emitted.Message);
    Assert.True(emitted.IsFinal);
    Assert.Equal(0, gateway.GetPendingChatPreviewSessionCount());
}
/// <summary>
/// When the preview's assistant text equals the text already recorded for the
/// session, no message is emitted and the pending entry is kept.
/// </summary>
[Fact]
public void ParseSessionsPreview_DoesNotEmitStaleAssistantMessage_ForQueuedFinalPreview()
{
    // Arrange: "world" is already known for the session before the preview arrives.
    var gateway = new GatewayClientTestHelper();
    gateway.SetUnsupportedMethodFlags(usageStatus: false, usageCost: false, sessionPreview: true, nodeList: false);
    gateway.SetLastAssistantMessage("main", "world");
    gateway.AddPendingChatPreviewSession("main", lastKnownAssistantText: "world");
    const string previewPayload = """
        {
        "ts": 1739760000000,
        "previews": [
        {
        "key": "agent:main:main",
        "status": "ok",
        "items": [
        { "role": "user", "text": "hello again" },
        { "role": "assistant", "text": "world" }
        ]
        }
        ]
        }
        """;

    // Act
    var emitted = gateway.ParseSessionsPreviewPayloadAndCaptureMessage(previewPayload);

    // Assert
    Assert.Null(emitted);
    Assert.Equal(1, gateway.GetPendingChatPreviewSessionCount());
}
/// <summary>
/// The serialized connect request reports client mode <c>cli</c> and includes the
/// <c>operator.read</c> and <c>operator.write</c> scopes.
/// </summary>
[Fact]
public void SerializeConnectRequest_UsesCliClientModeAndOperatorScopes()
{
    // Arrange
    var gateway = new GatewayClientTestHelper();

    // Act
    var requestJson = gateway.SerializeConnectRequest();

    // Assert
    using var request = JsonDocument.Parse(requestJson);
    var requestParams = request.RootElement.GetProperty("params");
    var clientInfo = requestParams.GetProperty("client");
    var requestedScopes = requestParams
        .GetProperty("scopes")
        .EnumerateArray()
        .Select(scope => scope.GetString())
        .ToArray();

    Assert.Equal("cli", clientInfo.GetProperty("mode").GetString());
    Assert.Contains("operator.read", requestedScopes);
    Assert.Contains("operator.write", requestedScopes);
}
// ── BuildMissingScopeFixCommands tests ─────────────────────────────────────
[Fact]

View File

@ -1,5 +1,6 @@
using System.Diagnostics;
using System.Runtime.InteropServices;
using System.Text;
using System.Text.Json;
using Xunit;
using OpenClaw.Shared;
@ -127,6 +128,46 @@ public class SystemRunTests
Assert.Equal("qux", runner.LastRequest.Env["BAZ"]);
}
/// <summary>
/// A <c>system.run</c> request that overrides <c>PATH</c> is rejected with an
/// "environment variable" error and never reaches the command runner.
/// </summary>
[Fact]
public async Task SystemRun_BlocksDangerousEnvOverride()
{
    // Arrange
    var fakeRunner = new FakeCommandRunner();
    var capability = new SystemCapability(NullLogger.Instance);
    capability.SetCommandRunner(fakeRunner);
    var request = new NodeInvokeRequest
    {
        Id = "r5b",
        Command = "system.run",
        Args = Parse("""{"command":"test","env":{"PATH":"C:\\evil","FOO":"bar"}}""")
    };

    // Act
    var response = await capability.ExecuteAsync(request);

    // Assert
    Assert.False(response.Ok);
    Assert.Contains("environment variable", response.Error!, StringComparison.OrdinalIgnoreCase);
    Assert.Null(fakeRunner.LastRequest);
}
/// <summary>
/// A <c>system.run</c> request whose env block uses the name <c>BAD NAME</c> is
/// rejected with an "environment variable" error and never reaches the command runner.
/// </summary>
[Fact]
public async Task SystemRun_BlocksInvalidEnvName()
{
    // Arrange
    var fakeRunner = new FakeCommandRunner();
    var capability = new SystemCapability(NullLogger.Instance);
    capability.SetCommandRunner(fakeRunner);
    var request = new NodeInvokeRequest
    {
        Id = "r5c",
        Command = "system.run",
        Args = Parse("""{"command":"test","env":{"BAD NAME":"value"}}""")
    };

    // Act
    var response = await capability.ExecuteAsync(request);

    // Assert
    Assert.False(response.Ok);
    Assert.Contains("environment variable", response.Error!, StringComparison.OrdinalIgnoreCase);
    Assert.Null(fakeRunner.LastRequest);
}
[Fact]
public async Task SystemRun_DefaultsTimeout_To30s()
{
@ -278,6 +319,45 @@ public class SystemRunTests
Assert.Equal("push", runner.LastRequest.Args[0]);
}
/// <summary>
/// With a default-Allow policy and a Deny rule for <c>Remove-Item *</c>, a PowerShell
/// <c>-EncodedCommand</c> payload (Base64 of the UTF-16 command text) must be denied.
/// </summary>
[Fact]
public async Task SystemRun_WithPolicy_DeniesEncodedPowerShellPayload()
{
    var policyDirectory = Path.Combine(Path.GetTempPath(), $"test-{Guid.NewGuid():N}");
    Directory.CreateDirectory(policyDirectory);
    try
    {
        // Arrange
        var testLogger = new ExecTestLogger();
        var approvalPolicy = new ExecApprovalPolicy(policyDirectory, testLogger);
        var denyRemoveItem = new ExecApprovalRule { Pattern = "Remove-Item *", Action = ExecApprovalAction.Deny };
        approvalPolicy.SetRules(new[] { denyRemoveItem }, ExecApprovalAction.Allow);

        var capability = new SystemCapability(testLogger);
        capability.SetCommandRunner(new FakeCommandRunner());
        capability.SetApprovalPolicy(approvalPolicy);

        var commandBytes = Encoding.Unicode.GetBytes("Remove-Item -Recurse -Force C:\\important");
        var encoded = Convert.ToBase64String(commandBytes);
        var request = new NodeInvokeRequest
        {
            Id = "r10",
            Command = "system.run",
            Args = Parse($$"""{"command":["powershell","-EncodedCommand","{{encoded}}"]}""")
        };

        // Act
        var response = await capability.ExecuteAsync(request);

        // Assert
        Assert.False(response.Ok);
        Assert.Contains("denied", response.Error!, StringComparison.OrdinalIgnoreCase);
    }
    finally
    {
        // Best-effort cleanup of the temporary policy directory.
        try { Directory.Delete(policyDirectory, true); } catch { }
    }
}
/// <summary>
/// Fake runner for unit testing — no actual process execution.
/// </summary>

View File

@ -0,0 +1,141 @@
using OpenClaw.Shared;
using System.Text.Json;
namespace OpenClaw.Shared.Tests;
/// <summary>Pins the contents and ordering of <c>VoiceCommands.All</c>.</summary>
public class VoiceCommandsTests
{
    [Fact]
    public void All_ContainsExpectedCommandsInStableOrder()
    {
        // Order matters: Assert.Equal compares the sequences element by element.
        Assert.Equal(
            [
                "voice.devices.list",
                "voice.settings.get",
                "voice.settings.set",
                "voice.status.get",
                "voice.start",
                "voice.stop",
                "voice.pause",
                "voice.resume",
                "voice.response.skip"
            ],
            VoiceCommands.All);
    }
}
/// <summary>
/// Pins the default values, identifier constants and legacy-credential migration of
/// the shared voice schema types.
/// </summary>
public class VoiceSchemaDefaultsTests
{
    [Fact]
    public void VoiceSettings_Defaults_AreConcreteAndProviderAgnostic()
    {
        var defaults = new VoiceSettings();

        // Top-level switches
        Assert.False(defaults.Enabled);
        Assert.Equal(VoiceActivationMode.Off, defaults.Mode);
        Assert.True(defaults.ShowRepeaterAtStartup);
        Assert.False(defaults.ShowConversationToasts);

        // Providers
        Assert.Equal(VoiceProviderIds.Windows, defaults.SpeechToTextProviderId);
        Assert.Equal(VoiceProviderIds.Windows, defaults.TextToSpeechProviderId);

        // Capture
        Assert.Equal(16000, defaults.SampleRateHz);
        Assert.Equal(80, defaults.CaptureChunkMs);
        Assert.True(defaults.BargeInEnabled);

        // Voice wake and talk mode
        Assert.Equal("NanoWakeWord", defaults.VoiceWake.Engine);
        Assert.Equal("hey_openclaw", defaults.VoiceWake.ModelId);
        Assert.Equal(0.65f, defaults.VoiceWake.TriggerThreshold);
        Assert.Equal(250, defaults.TalkMode.MinSpeechMs);
    }

    [Fact]
    public void VoiceStatusInfo_Defaults_ToStopped()
    {
        var defaults = new VoiceStatusInfo();

        Assert.False(defaults.Available);
        Assert.False(defaults.Running);
        Assert.Equal(VoiceActivationMode.Off, defaults.Mode);
        Assert.Equal(VoiceRuntimeState.Stopped, defaults.State);
        Assert.False(defaults.VoiceWakeLoaded);
        Assert.Equal(0, defaults.PendingReplyCount);
        Assert.False(defaults.CanSkipReply);
        Assert.Null(defaults.CurrentReplyPreview);
        Assert.Null(defaults.LastError);
    }

    [Fact]
    public void VoiceEnums_Serialize_AsStrings()
    {
        var startArgs = new VoiceStartArgs
        {
            Mode = VoiceActivationMode.VoiceWake
        };

        var serialized = JsonSerializer.Serialize(startArgs);

        // The enum must appear as its quoted name.
        Assert.Contains("\"VoiceWake\"", serialized);
    }

    [Fact]
    public void VoiceProviderCatalog_Defaults_ToEmptyLists()
    {
        var emptyCatalog = new VoiceProviderCatalog();

        Assert.Empty(emptyCatalog.SpeechToTextProviders);
        Assert.Empty(emptyCatalog.TextToSpeechProviders);
    }

    [Fact]
    public void VoiceProviderIds_ExposeRequiredBuiltInProviders()
    {
        // Provider identifiers
        Assert.Equal("windows", VoiceProviderIds.Windows);
        Assert.Equal("foundry-local", VoiceProviderIds.FoundryLocal);
        Assert.Equal("openai-whisper", VoiceProviderIds.OpenAiWhisper);
        Assert.Equal("elevenlabs-stt", VoiceProviderIds.ElevenLabsSpeechToText);
        Assert.Equal("azure-ai-speech", VoiceProviderIds.AzureAiSpeech);
        Assert.Equal("sherpa-onnx", VoiceProviderIds.SherpaOnnx);
        Assert.Equal("minimax", VoiceProviderIds.MiniMax);
        Assert.Equal("elevenlabs", VoiceProviderIds.ElevenLabs);

        // Setting keys
        Assert.Equal("endpoint", VoiceProviderSettingKeys.Endpoint);
        Assert.Equal("modelPath", VoiceProviderSettingKeys.ModelPath);
        Assert.Equal("voiceSettingsJson", VoiceProviderSettingKeys.VoiceSettingsJson);
    }

    [Fact]
    public void VoiceProviderOption_Defaults_ToVisibleAndSelectable()
    {
        var providerOption = new VoiceProviderOption { Name = "Provider" };

        Assert.True(providerOption.VisibleInSettings);
        Assert.True(providerOption.Selectable);
        Assert.Equal("Provider", providerOption.DisplayName);
        Assert.Equal(1.0, providerOption.DisplayOpacity);
    }

    [Fact]
    public void VoiceProviderConfigurationStore_Defaults_ToEmptyProviders()
    {
        var emptyStore = new VoiceProviderConfigurationStore();

        Assert.Empty(emptyStore.Providers);
    }

    [Fact]
    public void VoiceProviderConfigurationStore_MigratesLegacyProviderCredentials()
    {
        // Arrange
        var store = new VoiceProviderConfigurationStore();
        var legacyCredentials = new VoiceProviderCredentials
        {
            MiniMaxApiKey = "minimax-key",
            MiniMaxModel = "speech-2.8-turbo",
            MiniMaxVoiceId = "English_MatureBoss",
            ElevenLabsApiKey = "eleven-key",
            ElevenLabsModel = "eleven_multilingual_v2",
            ElevenLabsVoiceId = "voice-42"
        };

        // Act
        store.MigrateLegacyCredentials(legacyCredentials);

        // Assert: MiniMax values
        Assert.Equal("minimax-key", store.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey));
        Assert.Equal("speech-2.8-turbo", store.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model));
        Assert.Equal("English_MatureBoss", store.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId));

        // Assert: ElevenLabs values
        Assert.Equal("eleven-key", store.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey));
        Assert.Equal("eleven_multilingual_v2", store.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model));
        Assert.Equal("voice-42", store.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId));
    }
}

View File

@ -0,0 +1,148 @@
using OpenClaw.Shared;
namespace OpenClaw.Shared.Tests;
/// <summary>
/// Tests for the provider-configuration helpers: case-insensitive provider and
/// setting lookup, value trimming on write, and removal of blank or null values.
/// </summary>
public class VoiceProviderConfigurationStoreExtensionsTests
{
    [Fact]
    public void GetOrAddProvider_ReusesExistingProvider_CaseInsensitively()
    {
        // Arrange
        var configurationStore = new VoiceProviderConfigurationStore
        {
            Providers =
            [
                new VoiceProviderConfiguration { ProviderId = "MiniMax" }
            ]
        };

        // Act
        var resolved = configurationStore.GetOrAddProvider("minimax");

        // Assert: the existing instance is returned and nothing is appended.
        Assert.Same(configurationStore.Providers[0], resolved);
        Assert.Single(configurationStore.Providers);
    }

    [Fact]
    public void FindProvider_MatchesProviderId_CaseInsensitively()
    {
        // Arrange
        var configurationStore = new VoiceProviderConfigurationStore
        {
            Providers =
            [
                new VoiceProviderConfiguration { ProviderId = "ElevenLabs" }
            ]
        };

        // Act
        var found = configurationStore.FindProvider("elevenlabs");

        // Assert
        Assert.NotNull(found);
        Assert.Equal("ElevenLabs", found!.ProviderId);
    }

    [Fact]
    public void GetValue_MatchesSettingKey_CaseInsensitively()
    {
        // Arrange
        var providerConfiguration = new VoiceProviderConfiguration
        {
            Values = new Dictionary<string, string>
            {
                ["ApiKey"] = "secret"
            }
        };

        // Act
        var apiKey = providerConfiguration.GetValue("apikey");

        // Assert
        Assert.Equal("secret", apiKey);
    }

    [Fact]
    public void StoreGetValue_MatchesProviderAndSetting_CaseInsensitively()
    {
        // Arrange
        var configurationStore = new VoiceProviderConfigurationStore
        {
            Providers =
            [
                new VoiceProviderConfiguration
                {
                    ProviderId = "MiniMax",
                    Values = new Dictionary<string, string>
                    {
                        ["VoiceId"] = "English_MatureBoss"
                    }
                }
            ]
        };

        // Act
        var voiceId = configurationStore.GetValue("minimax", "voiceid");

        // Assert
        Assert.Equal("English_MatureBoss", voiceId);
    }

    [Fact]
    public void SetValue_AddsProviderAndTrimsStoredValue()
    {
        // Arrange
        var configurationStore = new VoiceProviderConfigurationStore();

        // Act
        configurationStore.SetValue("minimax", "apiKey", " secret-key ");

        // Assert: one provider was created and the surrounding spaces were dropped.
        var created = Assert.Single(configurationStore.Providers);
        Assert.Equal("minimax", created.ProviderId);
        Assert.Equal("secret-key", created.Values["apiKey"]);
    }

    [Fact]
    public void SetValue_UpdatesExistingEntry_CaseInsensitively()
    {
        // Arrange
        var providerConfiguration = new VoiceProviderConfiguration
        {
            Values = new Dictionary<string, string>
            {
                ["ApiKey"] = "old-value"
            }
        };

        // Act
        providerConfiguration.SetValue("apikey", " new-value ");

        // Assert: the original key casing is kept and no second entry appears.
        Assert.Single(providerConfiguration.Values);
        Assert.Equal("new-value", providerConfiguration.Values["ApiKey"]);
    }

    [Fact]
    public void SetValue_RemovesExistingEntry_WhenValueIsBlank()
    {
        // Arrange
        var providerConfiguration = new VoiceProviderConfiguration
        {
            Values = new Dictionary<string, string>
            {
                ["ApiKey"] = "secret"
            }
        };

        // Act
        providerConfiguration.SetValue("apikey", " ");

        // Assert
        Assert.Empty(providerConfiguration.Values);
    }

    [Fact]
    public void StoreSetValue_RemovesSetting_WhenValueIsNull()
    {
        // Arrange
        var configurationStore = new VoiceProviderConfigurationStore
        {
            Providers =
            [
                new VoiceProviderConfiguration
                {
                    ProviderId = "minimax",
                    Values = new Dictionary<string, string>
                    {
                        ["apiKey"] = "secret"
                    }
                }
            ]
        };

        // Act
        configurationStore.SetValue("MiniMax", "ApiKey", null);

        // Assert: the provider remains but its setting is gone.
        var remaining = Assert.Single(configurationStore.Providers);
        Assert.Empty(remaining.Values);
    }
}

View File

@ -1,7 +1,7 @@
<Project Sdk="Microsoft.NET.Sdk">
<PropertyGroup>
<TargetFramework>net10.0</TargetFramework>
<TargetFramework>net10.0-windows10.0.19041.0</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<IsPackable>false</IsPackable>
@ -20,6 +20,7 @@
<ItemGroup>
<ProjectReference Include="..\..\src\OpenClaw.Shared\OpenClaw.Shared.csproj" />
<ProjectReference Include="..\..\src\OpenClaw.Tray.Shared\OpenClaw.Tray.Shared.csproj" />
</ItemGroup>
</Project>

View File

@ -1,3 +1,4 @@
using System.Collections.Generic;
using System.Text.Json;
using OpenClaw.Shared;
@ -34,6 +35,62 @@ public class SettingsRoundTripTests
SkippedUpdateTag = "v1.2.3",
NotifyChatResponses = false,
PreferStructuredCategories = true,
Voice = new VoiceSettings
{
Enabled = true,
Mode = VoiceActivationMode.VoiceWake,
ShowRepeaterAtStartup = false,
ShowConversationToasts = true,
SpeechToTextProviderId = "windows",
TextToSpeechProviderId = "elevenlabs",
InputDeviceId = "mic-1",
OutputDeviceId = "spk-2",
SampleRateHz = 16000,
CaptureChunkMs = 80,
BargeInEnabled = false,
VoiceWake = new VoiceWakeSettings
{
Engine = "NanoWakeWord",
ModelId = "hey_openclaw",
TriggerThreshold = 0.72f,
TriggerCooldownMs = 2500,
PreRollMs = 1400,
EndSilenceMs = 1000
},
TalkMode = new TalkModeSettings
{
MinSpeechMs = 300,
EndSilenceMs = 1100,
MaxUtteranceMs = 18000
}
},
VoiceProviderConfiguration = new VoiceProviderConfigurationStore
{
Providers =
[
new VoiceProviderConfiguration
{
ProviderId = VoiceProviderIds.MiniMax,
Values = new Dictionary<string, string>
{
[VoiceProviderSettingKeys.ApiKey] = "minimax-key",
[VoiceProviderSettingKeys.Model] = "speech-2.8-turbo",
[VoiceProviderSettingKeys.VoiceId] = "English_MatureBoss",
[VoiceProviderSettingKeys.VoiceSettingsJson] = "{\"voice_id\":\"English_MatureBoss\",\"speed\":1.1}"
}
},
new VoiceProviderConfiguration
{
ProviderId = VoiceProviderIds.ElevenLabs,
Values = new Dictionary<string, string>
{
[VoiceProviderSettingKeys.ApiKey] = "eleven-key",
[VoiceProviderSettingKeys.Model] = "eleven_multilingual_v2",
[VoiceProviderSettingKeys.VoiceId] = "voice-42"
}
}
]
},
UserRules = new List<UserNotificationRule>
{
new() { Pattern = "build.*fail", IsRegex = true, Category = "urgent", Enabled = true }
@ -68,6 +125,27 @@ public class SettingsRoundTripTests
Assert.Equal(original.SkippedUpdateTag, restored.SkippedUpdateTag);
Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses);
Assert.Equal(original.PreferStructuredCategories, restored.PreferStructuredCategories);
Assert.NotNull(restored.Voice);
Assert.True(restored.Voice.Enabled);
Assert.Equal(VoiceActivationMode.VoiceWake, restored.Voice.Mode);
Assert.False(restored.Voice.ShowRepeaterAtStartup);
Assert.True(restored.Voice.ShowConversationToasts);
Assert.Equal("windows", restored.Voice.SpeechToTextProviderId);
Assert.Equal("elevenlabs", restored.Voice.TextToSpeechProviderId);
Assert.Equal("mic-1", restored.Voice.InputDeviceId);
Assert.Equal("spk-2", restored.Voice.OutputDeviceId);
Assert.Equal("NanoWakeWord", restored.Voice.VoiceWake.Engine);
Assert.Equal("hey_openclaw", restored.Voice.VoiceWake.ModelId);
Assert.Equal(0.72f, restored.Voice.VoiceWake.TriggerThreshold);
Assert.Equal(300, restored.Voice.TalkMode.MinSpeechMs);
Assert.NotNull(restored.VoiceProviderConfiguration);
Assert.Equal("minimax-key", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.ApiKey));
Assert.Equal("speech-2.8-turbo", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.Model));
Assert.Equal("English_MatureBoss", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceId));
Assert.Equal("{\"voice_id\":\"English_MatureBoss\",\"speed\":1.1}", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.MiniMax, VoiceProviderSettingKeys.VoiceSettingsJson));
Assert.Equal("eleven-key", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.ApiKey));
Assert.Equal("eleven_multilingual_v2", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.Model));
Assert.Equal("voice-42", restored.VoiceProviderConfiguration.GetValue(VoiceProviderIds.ElevenLabs, VoiceProviderSettingKeys.VoiceId));
Assert.NotNull(restored.UserRules);
Assert.Single(restored.UserRules);
Assert.Equal("build.*fail", restored.UserRules[0].Pattern);
@ -119,9 +197,42 @@ public class SettingsRoundTripTests
Assert.Null(settings.SkippedUpdateTag);
Assert.True(settings.NotifyChatResponses);
Assert.True(settings.PreferStructuredCategories);
Assert.NotNull(settings.Voice);
Assert.False(settings.Voice.Enabled);
Assert.Equal(VoiceActivationMode.Off, settings.Voice.Mode);
Assert.True(settings.Voice.ShowRepeaterAtStartup);
Assert.False(settings.Voice.ShowConversationToasts);
Assert.Equal(VoiceProviderIds.Windows, settings.Voice.SpeechToTextProviderId);
Assert.Equal(VoiceProviderIds.Windows, settings.Voice.TextToSpeechProviderId);
Assert.NotNull(settings.VoiceProviderConfiguration);
Assert.Empty(settings.VoiceProviderConfiguration.Providers);
Assert.Equal(16000, settings.Voice.SampleRateHz);
Assert.Equal("NanoWakeWord", settings.Voice.VoiceWake.Engine);
Assert.Null(settings.UserRules);
}
/// <summary>
/// Verifies that a settings document containing only a <c>VoiceProviderCredentials</c> section
/// is parsed by <c>SettingsData.FromJson</c> and that the three MiniMax values in that section
/// are exposed unchanged. Per the test name, this section is the legacy shape that is kept
/// readable for migration.
/// </summary>
[Fact]
public void LegacyVoiceProviderCredentials_Deserialize_ForMigration()
{
// Minimal payload: nothing except the legacy credentials section is present.
var json = """
{
"VoiceProviderCredentials": {
"MiniMaxApiKey": "minimax-key",
"MiniMaxModel": "speech-2.8-turbo",
"MiniMaxVoiceId": "English_MatureBoss"
}
}
""";
var settings = SettingsData.FromJson(json);
// Both the root object and the legacy section must be materialized before reading values.
Assert.NotNull(settings);
Assert.NotNull(settings.VoiceProviderCredentials);
// Each legacy value must round-trip exactly as written in the JSON above.
Assert.Equal("minimax-key", settings.VoiceProviderCredentials.MiniMaxApiKey);
Assert.Equal("speech-2.8-turbo", settings.VoiceProviderCredentials.MiniMaxModel);
Assert.Equal("English_MatureBoss", settings.VoiceProviderCredentials.MiniMaxVoiceId);
}
[Fact]
public void BackwardCompatibility_OldSettingsWithoutNewFields()
{
@ -161,6 +272,13 @@ public class SettingsRoundTripTests
Assert.False(settings.HasSeenActivityStreamTip);
Assert.Null(settings.SkippedUpdateTag);
Assert.True(settings.GlobalHotkeyEnabled);
Assert.NotNull(settings.Voice);
Assert.False(settings.Voice.Enabled);
Assert.Equal(VoiceActivationMode.Off, settings.Voice.Mode);
Assert.True(settings.Voice.ShowRepeaterAtStartup);
Assert.False(settings.Voice.ShowConversationToasts);
Assert.Equal(VoiceProviderIds.Windows, settings.Voice.SpeechToTextProviderId);
Assert.Equal(VoiceProviderIds.Windows, settings.Voice.TextToSpeechProviderId);
Assert.Null(settings.UserRules);
}

View File

@ -0,0 +1,221 @@
using OpenClaw.Shared;
using OpenClawTray.Services.Voice;
namespace OpenClaw.Tray.Tests;
/// <summary>
/// Exercises <see cref="VoiceChatCoordinator"/> against in-memory fakes. Transcript drafts and
/// conversation turns raised by the runtime are expected to reach attached chat windows,
/// including windows that attach only after the event was raised.
/// </summary>
public class VoiceChatCoordinatorTests
{
    /// <summary>Session key shared by every scenario in this class.</summary>
    private const string MainSession = "main";

    [Fact]
    public async Task AttachWindow_ReplaysBufferedDraft()
    {
        // Arrange: the draft is raised while no window is attached.
        var voiceRuntime = new FakeVoiceRuntime();
        using var sut = CreateCoordinator(voiceRuntime);
        voiceRuntime.RaiseDraft("hello world", MainSession, clear: false);

        // Act
        var chatWindow = new FakeVoiceChatWindow();
        sut.AttachWindow(chatWindow);
        await Task.Yield();

        // Assert
        Assert.Equal("hello world", chatWindow.LastDraftText);
        Assert.False(chatWindow.LastDraftClear);
    }

    [Fact]
    public async Task DraftClear_IsReplayedWhenWindowAttachesLater()
    {
        // Arrange: a draft followed by a clear, both before any window exists.
        var voiceRuntime = new FakeVoiceRuntime();
        using var sut = CreateCoordinator(voiceRuntime);
        voiceRuntime.RaiseDraft("temporary draft", MainSession, clear: false);
        voiceRuntime.RaiseDraft(string.Empty, MainSession, clear: true);
        await Task.Yield();

        // Act
        var chatWindow = new FakeVoiceChatWindow();
        sut.AttachWindow(chatWindow);
        await Task.Yield();

        // Assert: the late window sees the cleared state, not the earlier text.
        Assert.Equal(string.Empty, chatWindow.LastDraftText);
        Assert.True(chatWindow.LastDraftClear);
    }

    [Fact]
    public async Task DraftUpdates_AreIgnoredForClosedWindow()
    {
        // Arrange: the window reports itself as closed from the start.
        var voiceRuntime = new FakeVoiceRuntime();
        using var sut = CreateCoordinator(voiceRuntime);
        var closedWindow = new FakeVoiceChatWindow { IsClosed = true };
        sut.AttachWindow(closedWindow);
        var baselineUpdates = closedWindow.UpdateCallCount;

        // Act
        voiceRuntime.RaiseDraft("headless text", MainSession, clear: false);
        await Task.Yield();

        // Assert: no further update calls were made after attaching.
        Assert.Equal(baselineUpdates, closedWindow.UpdateCallCount);
    }

    [Fact]
    public async Task DetachWindow_StopsFurtherDraftMirroring()
    {
        // Arrange
        var voiceRuntime = new FakeVoiceRuntime();
        using var sut = CreateCoordinator(voiceRuntime);
        var chatWindow = new FakeVoiceChatWindow();
        sut.AttachWindow(chatWindow);
        sut.DetachWindow(chatWindow);

        // Act
        voiceRuntime.RaiseDraft("after detach", MainSession, clear: false);
        await Task.Yield();

        // Assert: exactly one update (an empty, clearing draft) was observed; the
        // draft raised after the detach never arrived.
        Assert.Equal(1, chatWindow.UpdateCallCount);
        Assert.Equal(string.Empty, chatWindow.LastDraftText);
        Assert.True(chatWindow.LastDraftClear);
    }

    [Fact]
    public void ConversationTurn_IsForwarded()
    {
        // Arrange: subscribe to the coordinator's own event.
        var voiceRuntime = new FakeVoiceRuntime();
        using var sut = CreateCoordinator(voiceRuntime);
        VoiceConversationTurnEventArgs? captured = null;
        sut.ConversationTurnAvailable += (_, turn) => captured = turn;

        // Act
        voiceRuntime.RaiseConversationTurn(
            CreateTurn(VoiceConversationDirection.Incoming, "reply"));

        // Assert
        Assert.NotNull(captured);
        Assert.Equal("reply", captured!.Message);
        Assert.Equal(VoiceConversationDirection.Incoming, captured.Direction);
    }

    [Fact]
    public async Task ConversationTurn_IsMirroredToAttachedWindow()
    {
        // Arrange
        var voiceRuntime = new FakeVoiceRuntime();
        using var sut = CreateCoordinator(voiceRuntime);
        var chatWindow = new FakeVoiceChatWindow();
        sut.AttachWindow(chatWindow);

        // Act
        voiceRuntime.RaiseConversationTurn(
            CreateTurn(VoiceConversationDirection.Outgoing, "hello from voice"));
        await Task.Yield();

        // Assert
        Assert.Equal("hello from voice", chatWindow.LastTurnMessage);
        Assert.Equal(VoiceConversationDirection.Outgoing, chatWindow.LastTurnDirection);
        Assert.Equal(1, chatWindow.TurnCallCount);
    }

    [Fact]
    public async Task AttachWindow_ReplaysBufferedConversationTurns()
    {
        // Arrange: the turn is raised before any window is attached.
        var voiceRuntime = new FakeVoiceRuntime();
        using var sut = CreateCoordinator(voiceRuntime);
        voiceRuntime.RaiseConversationTurn(
            CreateTurn(VoiceConversationDirection.Outgoing, "replay this"));
        await Task.Yield();

        // Act
        var chatWindow = new FakeVoiceChatWindow();
        sut.AttachWindow(chatWindow);
        await Task.Yield();

        // Assert: the turn is delivered once to the late window.
        Assert.Equal("replay this", chatWindow.LastTurnMessage);
        Assert.Equal(VoiceConversationDirection.Outgoing, chatWindow.LastTurnDirection);
        Assert.Equal(1, chatWindow.TurnCallCount);
    }

    [Fact]
    public async Task DraftAndTurns_AreBroadcastToAllAttachedWindows()
    {
        // Arrange: two windows attached to the same coordinator.
        var voiceRuntime = new FakeVoiceRuntime();
        using var sut = CreateCoordinator(voiceRuntime);
        var windowA = new FakeVoiceChatWindow();
        var windowB = new FakeVoiceChatWindow();
        sut.AttachWindow(windowA);
        sut.AttachWindow(windowB);

        // Act
        voiceRuntime.RaiseDraft("shared draft", MainSession, clear: false);
        voiceRuntime.RaiseConversationTurn(
            CreateTurn(VoiceConversationDirection.Incoming, "shared reply"));
        await Task.Yield();

        // Assert: both windows received both the draft and the turn.
        Assert.Equal("shared draft", windowA.LastDraftText);
        Assert.Equal("shared draft", windowB.LastDraftText);
        Assert.Equal("shared reply", windowA.LastTurnMessage);
        Assert.Equal("shared reply", windowB.LastTurnMessage);
    }

    /// <summary>Builds a coordinator wired to the given fake runtime and a synchronous dispatcher.</summary>
    private static VoiceChatCoordinator CreateCoordinator(FakeVoiceRuntime voiceRuntime) =>
        new(voiceRuntime, new ImmediateDispatcher());

    /// <summary>Builds a conversation turn for the shared session.</summary>
    private static VoiceConversationTurnEventArgs CreateTurn(
        VoiceConversationDirection direction,
        string message) =>
        new()
        {
            Direction = direction,
            Message = message,
            SessionKey = MainSession
        };

    /// <summary>Dispatcher that runs every callback inline on the calling thread.</summary>
    private sealed class ImmediateDispatcher : IUiDispatcher
    {
        public bool TryEnqueue(Action callback)
        {
            callback();
            return true;
        }
    }

    /// <summary>Runtime stand-in whose events are raised on demand by the test.</summary>
    private sealed class FakeVoiceRuntime : IVoiceRuntime
    {
        public event EventHandler<VoiceConversationTurnEventArgs>? ConversationTurnAvailable;

        public event EventHandler<VoiceTranscriptDraftEventArgs>? TranscriptDraftUpdated;

        /// <summary>Raises a transcript draft; a null session key falls back to "main".</summary>
        public void RaiseDraft(string text, string? sessionKey, bool clear)
        {
            var draft = new VoiceTranscriptDraftEventArgs
            {
                Text = text,
                SessionKey = sessionKey ?? "main",
                Clear = clear
            };
            TranscriptDraftUpdated?.Invoke(this, draft);
        }

        /// <summary>Raises a conversation turn with the supplied arguments.</summary>
        public void RaiseConversationTurn(VoiceConversationTurnEventArgs args) =>
            ConversationTurnAvailable?.Invoke(this, args);
    }

    /// <summary>Window stand-in that records the last draft/turn it received and how often.</summary>
    private sealed class FakeVoiceChatWindow : IVoiceChatWindow
    {
        public bool IsClosed { get; set; }

        public string LastDraftText { get; private set; } = string.Empty;

        public bool LastDraftClear { get; private set; }

        public int UpdateCallCount { get; private set; }

        public string LastTurnMessage { get; private set; } = string.Empty;

        public VoiceConversationDirection? LastTurnDirection { get; private set; }

        public int TurnCallCount { get; private set; }

        public Task UpdateVoiceTranscriptDraftAsync(string text, bool clear)
        {
            UpdateCallCount++;
            LastDraftText = text;
            LastDraftClear = clear;
            return Task.CompletedTask;
        }

        public Task AppendVoiceConversationTurnAsync(VoiceConversationTurnEventArgs args)
        {
            TurnCallCount++;
            LastTurnMessage = args.Message ?? string.Empty;
            LastTurnDirection = args.Direction;
            return Task.CompletedTask;
        }
    }
}

View File

@ -0,0 +1,75 @@
using System;
using System.Reflection;
using System.Threading;
using System.Threading.Tasks;
using OpenClaw.Shared;
using OpenClawTray.Services.Voice;
namespace OpenClaw.Tray.Tests;
/// <summary>
/// Tests for <see cref="VoiceCloudTextToSpeechClient"/>: cancellation handling in
/// <c>SynthesizeAsync</c>, and the non-public static <c>DecodeAudioBytes</c> helper, which is
/// reached through reflection.
/// </summary>
public class VoiceCloudTextToSpeechClientTests
{
    /// <summary>Name of the non-public static helper looked up via reflection.</summary>
    private const string DecodeAudioBytesMethodName = "DecodeAudioBytes";

    [Fact]
    public async Task SynthesizeAsync_ThrowsOperationCanceled_WhenCallerTokenIsPreCancelled()
    {
        var client = new VoiceCloudTextToSpeechClient();
        var provider = new VoiceProviderOption
        {
            Id = "test-ws",
            Name = "Test WS",
            Settings =
            [
                new VoiceProviderSettingDefinition { Key = "apiKey", Secret = true }
            ],
            TextToSpeechWebSocket = new VoiceTextToSpeechWebSocketContract
            {
                // Loopback address on port 0: the test expects cancellation to be
                // reported rather than any connection outcome.
                EndpointTemplate = "wss://127.0.0.1:0/tts"
            }
        };
        var store = new VoiceProviderConfigurationStore();
        store.SetValue("test-ws", "apiKey", "test-key");

        // The token is cancelled before the call is made.
        using var cts = new CancellationTokenSource();
        cts.Cancel();

        await Assert.ThrowsAnyAsync<OperationCanceledException>(
            () => client.SynthesizeAsync("hello", provider, store, cancellationToken: cts.Token));
    }

    [Fact]
    public void DecodeAudioBytes_DecodesHexString()
    {
        var result = InvokeDecodeAudioBytes("hexJsonString", "48656c6c6f", "TestProvider");

        Assert.Equal([72, 101, 108, 108, 111], result); // "Hello"
    }

    [Fact]
    public void DecodeAudioBytes_DecodesBase64String()
    {
        var result = InvokeDecodeAudioBytes("base64JsonString", "SGVsbG8=", "TestProvider");

        Assert.Equal([72, 101, 108, 108, 111], result); // "Hello"
    }

    [Fact]
    public void DecodeAudioBytes_ThrowsForUnsupportedMode()
    {
        // Resolve the method outside the assertion so a failed lookup is reported
        // on its own instead of being mistaken for the exception under test.
        var method = GetDecodeAudioBytesMethod();

        // Reflection wraps the callee's exception in TargetInvocationException.
        var ex = Assert.Throws<TargetInvocationException>(
            () => method.Invoke(null, ["unsupported", "data", "TestProvider"]));

        Assert.IsType<InvalidOperationException>(ex.InnerException);
    }

    /// <summary>
    /// Invokes the non-public static decode helper and returns the decoded bytes.
    /// </summary>
    /// <param name="mode">Response audio mode string passed as the first argument.</param>
    /// <param name="value">Encoded payload passed as the second argument.</param>
    /// <param name="providerName">Provider name passed as the third argument.</param>
    /// <returns>The byte array returned by the helper.</returns>
    /// <exception cref="TargetInvocationException">The helper itself threw.</exception>
    private static byte[] InvokeDecodeAudioBytes(string mode, string value, string providerName)
    {
        return (byte[])GetDecodeAudioBytesMethod().Invoke(null, [mode, value, providerName])!;
    }

    /// <summary>
    /// Resolves the non-public static <c>DecodeAudioBytes</c> method.
    /// </summary>
    /// <returns>The reflected method.</returns>
    /// <exception cref="MissingMethodException">
    /// No non-public static method with that name exists on
    /// <see cref="VoiceCloudTextToSpeechClient"/>. Previously a failed lookup was hidden by the
    /// null-forgiving operator and surfaced later as a <see cref="NullReferenceException"/>.
    /// </exception>
    private static MethodInfo GetDecodeAudioBytesMethod() =>
        typeof(VoiceCloudTextToSpeechClient).GetMethod(
            DecodeAudioBytesMethodName,
            BindingFlags.NonPublic | BindingFlags.Static)
        ?? throw new MissingMethodException(
            $"Non-public static method '{DecodeAudioBytesMethodName}' was not found on " +
            $"{nameof(VoiceCloudTextToSpeechClient)}. Update these tests if it was renamed or its accessibility changed.");
}

View File

@ -0,0 +1,131 @@
using System;
using System.IO;
using OpenClaw.Shared;
using OpenClawTray.Helpers;
using OpenClawTray.Services.Voice;
using System.Linq;
namespace OpenClaw.Tray.Tests;
/// <summary>
/// Tests for <see cref="VoiceProviderCatalogService"/> (catalog file location, provider
/// filtering, runtime support flags, built-in cloud TTS contracts) plus two checks of
/// <see cref="VoiceTrayIconHelper"/> icon path resolution.
/// </summary>
public class VoiceProviderCatalogServiceTests
{
    [Fact]
    public void GetVoiceTrayIconPath_ReturnsBundledAppIconForOff()
    {
        var offIconPath = VoiceTrayIconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Off);

        // The Off state maps to the base application icon (compared case-insensitively).
        Assert.Equal(VoiceTrayIconHelper.GetBaseAppIconPath(), offIconPath, ignoreCase: true);
    }

    [Fact]
    public void GetVoiceTrayIconPath_GeneratesListeningVariant()
    {
        var listeningIconPath = VoiceTrayIconHelper.GetVoiceTrayIconPath(VoiceTrayIconState.Listening);

        // The variant must exist on disk, be an .ico file, and differ from the base icon.
        Assert.True(File.Exists(listeningIconPath));
        Assert.EndsWith(".ico", listeningIconPath, StringComparison.OrdinalIgnoreCase);
        Assert.NotEqual(VoiceTrayIconHelper.GetBaseAppIconPath(), listeningIconPath, StringComparer.OrdinalIgnoreCase);
    }

    [Fact]
    public void CatalogFilePath_ResolvesToExistingBundledAsset()
    {
        Assert.EndsWith("voice-providers.json", VoiceProviderCatalogService.CatalogFilePath, StringComparison.OrdinalIgnoreCase);
        Assert.True(File.Exists(VoiceProviderCatalogService.CatalogFilePath));
    }

    [Fact]
    public void LoadCatalog_IncludesOnlySelectableAndVisibleSpeechProviders()
    {
        var catalog = VoiceProviderCatalogService.LoadCatalog();

        // Speech-to-text: two providers listed, four expected to be absent.
        Assert.Contains(catalog.SpeechToTextProviders, provider => provider.Id == VoiceProviderIds.Windows);
        Assert.Contains(catalog.SpeechToTextProviders, provider => provider.Id == VoiceProviderIds.SherpaOnnx);
        Assert.DoesNotContain(catalog.SpeechToTextProviders, provider => provider.Id == VoiceProviderIds.FoundryLocal);
        Assert.DoesNotContain(catalog.SpeechToTextProviders, provider => provider.Id == VoiceProviderIds.OpenAiWhisper);
        Assert.DoesNotContain(catalog.SpeechToTextProviders, provider => provider.Id == VoiceProviderIds.ElevenLabsSpeechToText);
        Assert.DoesNotContain(catalog.SpeechToTextProviders, provider => provider.Id == VoiceProviderIds.AzureAiSpeech);

        // Text-to-speech: all three built-in providers are listed.
        Assert.Contains(catalog.TextToSpeechProviders, provider => provider.Id == VoiceProviderIds.Windows);
        Assert.Contains(catalog.TextToSpeechProviders, provider => provider.Id == VoiceProviderIds.MiniMax);
        Assert.Contains(catalog.TextToSpeechProviders, provider => provider.Id == VoiceProviderIds.ElevenLabs);
    }

    [Fact]
    public void SupportsSpeechToTextRuntime_ReportsWindowsRouteSupportForConfiguredSpeechProviders()
    {
        Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.Windows));
        Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.FoundryLocal));
        Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.OpenAiWhisper));
        Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.ElevenLabsSpeechToText));
        Assert.True(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.AzureAiSpeech));

        // SherpaOnnx is the only provider checked here that reports no runtime support.
        Assert.False(VoiceProviderCatalogService.SupportsSpeechToTextRuntime(VoiceProviderIds.SherpaOnnx));
    }

    [Fact]
    public void SupportsTextToSpeechRuntime_ReturnsTrueForImplementedProviders()
    {
        Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.Windows));
        Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.MiniMax));
        Assert.True(VoiceProviderCatalogService.SupportsTextToSpeechRuntime(VoiceProviderIds.ElevenLabs));
    }

    [Fact]
    public void LoadCatalog_ExposesBuiltInCloudTtsContracts()
    {
        var catalog = VoiceProviderCatalogService.LoadCatalog();

        // --- SherpaOnnx (speech-to-text): visible in settings but neither enabled nor selectable.
        var sherpa = Assert.Single(catalog.SpeechToTextProviders, provider => provider.Id == VoiceProviderIds.SherpaOnnx);
        Assert.Equal(VoiceProviderRuntimeIds.Embedded, sherpa.Runtime);
        Assert.False(sherpa.Enabled);
        Assert.True(sherpa.VisibleInSettings);
        Assert.False(sherpa.Selectable);
        var sherpaModelPath = sherpa.Settings.Single(setting => setting.Key == VoiceProviderSettingKeys.ModelPath);
        Assert.Equal(string.Empty, sherpaModelPath.DefaultValue);

        // --- MiniMax (text-to-speech): WebSocket contract.
        var miniMax = Assert.Single(catalog.TextToSpeechProviders, provider => provider.Id == VoiceProviderIds.MiniMax);
        Assert.Equal("MiniMax", miniMax.Name);
        Assert.NotNull(miniMax.TextToSpeechWebSocket);
        Assert.Equal("wss://api.minimax.io/ws/v1/t2a_v2", miniMax.TextToSpeechWebSocket!.EndpointTemplate);
        Assert.Equal("Authorization", miniMax.TextToSpeechWebSocket.AuthenticationHeaderName);
        Assert.Equal(VoiceTextToSpeechResponseModes.HexJsonString, miniMax.TextToSpeechWebSocket.ResponseAudioMode);
        Assert.Contains("\"event\": \"task_start\"", miniMax.TextToSpeechWebSocket.StartMessageTemplate);
        Assert.Contains("\"event\": \"task_continue\"", miniMax.TextToSpeechWebSocket.ContinueMessageTemplate);

        // --- MiniMax: settings metadata.
        var miniMaxModel = miniMax.Settings.Single(setting => setting.Key == VoiceProviderSettingKeys.Model);
        Assert.Equal("speech-2.8-turbo", miniMaxModel.DefaultValue);
        Assert.Contains("speech-2.8-turbo", miniMaxModel.Options);
        Assert.Contains("speech-2.5-turbo-preview", miniMaxModel.Options);
        var miniMaxVoiceId = miniMax.Settings.Single(setting => setting.Key == VoiceProviderSettingKeys.VoiceId);
        Assert.Equal("English_MatureBoss", miniMaxVoiceId.DefaultValue);
        var miniMaxVoiceJson = miniMax.Settings.Single(setting => setting.Key == VoiceProviderSettingKeys.VoiceSettingsJson);
        Assert.False(miniMaxVoiceJson.Required);
        Assert.True(miniMaxVoiceJson.JsonValue);
        Assert.Contains("\"voice_setting\":", miniMaxVoiceJson.Placeholder);
        Assert.Contains("{{voiceId}}", miniMaxVoiceJson.DefaultValue);

        // --- ElevenLabs (text-to-speech): WebSocket contract.
        var eleven = Assert.Single(catalog.TextToSpeechProviders, provider => provider.Id == VoiceProviderIds.ElevenLabs);
        Assert.Equal("ElevenLabs", eleven.Name);
        Assert.NotNull(eleven.TextToSpeechWebSocket);
        Assert.Equal(
            "wss://api.elevenlabs.io/v1/text-to-speech/{{voiceId}}/stream-input?model_id={{model}}&output_format=mp3_44100_128&auto_mode=true",
            eleven.TextToSpeechWebSocket!.EndpointTemplate);
        Assert.Equal("xi-api-key", eleven.TextToSpeechWebSocket.AuthenticationHeaderName);
        Assert.Equal(string.Empty, eleven.TextToSpeechWebSocket.AuthenticationScheme);
        Assert.Equal(string.Empty, eleven.TextToSpeechWebSocket.ConnectSuccessEventName);
        Assert.Equal(string.Empty, eleven.TextToSpeechWebSocket.StartSuccessEventName);
        Assert.Contains("\"xi_api_key\": {{apiKey}}", eleven.TextToSpeechWebSocket.StartMessageTemplate);
        Assert.Contains("\"try_trigger_generation\": true", eleven.TextToSpeechWebSocket.ContinueMessageTemplate);
        Assert.Contains("{{textWithTrailingSpace}}", eleven.TextToSpeechWebSocket.ContinueMessageTemplate);
        Assert.Equal("{ \"text\": \"\" }", eleven.TextToSpeechWebSocket.FinishMessageTemplate);
        Assert.Equal(VoiceTextToSpeechResponseModes.Base64JsonString, eleven.TextToSpeechWebSocket.ResponseAudioMode);
        Assert.Equal("audio", eleven.TextToSpeechWebSocket.ResponseAudioJsonPath);
        Assert.Equal("isFinal", eleven.TextToSpeechWebSocket.FinalFlagJsonPath);

        // --- ElevenLabs: settings metadata.
        var elevenModel = eleven.Settings.Single(setting => setting.Key == VoiceProviderSettingKeys.Model);
        Assert.Equal("eleven_multilingual_v2", elevenModel.DefaultValue);
        Assert.Contains("eleven_flash_v2_5", elevenModel.Options);
        Assert.Contains("eleven_turbo_v2_5", elevenModel.Options);
        var elevenVoiceId = eleven.Settings.Single(setting => setting.Key == VoiceProviderSettingKeys.VoiceId);
        Assert.Equal("6aDn1KB0hjpdcocrUkmq", elevenVoiceId.DefaultValue);
        var elevenVoiceJson = eleven.Settings.Single(setting => setting.Key == VoiceProviderSettingKeys.VoiceSettingsJson);
        Assert.False(elevenVoiceJson.Required);
        Assert.True(elevenVoiceJson.JsonValue);
        Assert.Contains("\"voice_settings\":", elevenVoiceJson.DefaultValue);
        Assert.Contains("\"speed\": 0.9", elevenVoiceJson.DefaultValue);
    }
}

View File

@ -0,0 +1,360 @@
using OpenClaw.Shared;
using Windows.Media.Devices;
using Windows.Media.SpeechRecognition;
using OpenClawTray.Services.Voice;
namespace OpenClaw.Tray.Tests;
/// <summary>
/// Table-driven tests for the pure decision helpers in <see cref="VoiceServiceTransportLogic"/>
/// and the numeric helpers in <see cref="VoiceCaptureMath"/>. Each theory row lists the inputs
/// followed by the expected output.
/// </summary>
public class VoiceServiceTransportTests
{
    [Fact]
    public void GetOrCreateTransportReadySource_ReusesExistingTaskWhileConnecting()
    {
        var pending = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

        var returned = VoiceServiceTransportLogic.GetOrCreateTransportReadySource(
            ConnectionStatus.Connecting,
            pending,
            out var shouldStartConnection);

        // While connecting, the same source is handed back and no new connection is requested.
        Assert.Same(pending, returned);
        Assert.False(shouldStartConnection);
    }

    [Fact]
    public void GetOrCreateTransportReadySource_CreatesFreshTaskWhenDisconnected()
    {
        var stale = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

        var returned = VoiceServiceTransportLogic.GetOrCreateTransportReadySource(
            ConnectionStatus.Disconnected,
            stale,
            out var shouldStartConnection);

        Assert.NotSame(stale, returned);
        Assert.True(shouldStartConnection);
    }

    [Fact]
    public void GetOrCreateTransportReadySource_CreatesFreshTaskAfterError()
    {
        var stale = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);

        var returned = VoiceServiceTransportLogic.GetOrCreateTransportReadySource(
            ConnectionStatus.Error,
            stale,
            out var shouldStartConnection);

        Assert.NotSame(stale, returned);
        Assert.True(shouldStartConnection);
    }

    [Fact]
    public void UsesCloudTextToSpeechRuntime_ReturnsTrueForWebSocketProviders()
    {
        var webSocketContract = new VoiceTextToSpeechWebSocketContract
        {
            EndpointTemplate = "wss://example.test/tts"
        };
        var provider = new VoiceProviderOption
        {
            Id = VoiceProviderIds.MiniMax,
            TextToSpeechWebSocket = webSocketContract
        };

        Assert.True(VoiceServiceTransportLogic.UsesCloudTextToSpeechRuntime(provider));
    }

    [Theory]
    [InlineData(true, false, 0, false, true)]
    [InlineData(false, true, 0, false, true)]
    [InlineData(false, false, 1, false, true)]
    [InlineData(false, false, 0, true, true)]
    [InlineData(false, false, 0, false, false)]
    public void ShouldAcceptAssistantReply_MatchesPlaybackAndAwaitingState(
        bool awaitingReply,
        bool isSpeaking,
        int queuedReplyCount,
        bool acceptedViaLateReplyGrace,
        bool expected)
    {
        var accepted = VoiceServiceTransportLogic.ShouldAcceptAssistantReply(
            awaitingReply,
            isSpeaking,
            queuedReplyCount,
            acceptedViaLateReplyGrace);

        Assert.Equal(expected, accepted);
    }

    [Theory]
    [InlineData(false, false, 0, "main", "main", 30, true)]
    [InlineData(false, false, 0, "main", "main", 121, false)]
    [InlineData(true, false, 0, "main", "main", 30, false)]
    [InlineData(false, true, 0, "main", "main", 30, false)]
    [InlineData(false, false, 1, "main", "main", 30, false)]
    [InlineData(false, false, 0, "main", "other", 30, false)]
    public void ShouldAcceptLateAssistantReply_OnlyMatchesBoundedGraceWindow(
        bool awaitingReply,
        bool isSpeaking,
        int queuedReplyCount,
        string lateReplySessionKey,
        string incomingSessionKey,
        int secondsAfterTimeout,
        bool expected)
    {
        // The grace window ends two minutes after the timeout; the reply arrives
        // secondsAfterTimeout seconds after that same timeout.
        var timedOutAtUtc = new DateTime(2026, 3, 25, 0, 0, 0, DateTimeKind.Utc);
        var graceEndsAtUtc = timedOutAtUtc.AddMinutes(2);
        var replyArrivedAtUtc = timedOutAtUtc.AddSeconds(secondsAfterTimeout);

        var accepted = VoiceServiceTransportLogic.ShouldAcceptLateAssistantReply(
            awaitingReply,
            isSpeaking,
            queuedReplyCount,
            lateReplySessionKey,
            graceEndsAtUtc,
            incomingSessionKey,
            replyArrivedAtUtc);

        Assert.Equal(expected, accepted);
    }

    [Theory]
    [InlineData(true, false, false)]
    [InlineData(false, true, false)]
    [InlineData(false, false, true)]
    public void ShouldRestartRecognitionAfterCompletion_SuppressesControlledRecycle(
        bool restartInProgress,
        bool awaitingReply,
        bool expected)
    {
        // Fixed inputs: first argument true, talk mode, last argument false.
        const bool firstFlag = true;
        const bool lastFlag = false;

        var shouldRestart = VoiceServiceTransportLogic.ShouldRestartRecognitionAfterCompletion(
            firstFlag,
            VoiceActivationMode.TalkMode,
            restartInProgress,
            awaitingReply,
            lastFlag);

        Assert.Equal(expected, shouldRestart);
    }

    [Theory]
    [InlineData(true, VoiceActivationMode.TalkMode, false, false, false, "eligible")]
    [InlineData(true, VoiceActivationMode.VoiceWake, false, false, false, "mode=VoiceWake")]
    [InlineData(false, VoiceActivationMode.TalkMode, false, false, false, "runtime-not-running")]
    [InlineData(true, VoiceActivationMode.TalkMode, true, false, false, "controlled-restart-in-progress")]
    [InlineData(true, VoiceActivationMode.TalkMode, false, true, false, "awaiting-reply")]
    [InlineData(true, VoiceActivationMode.TalkMode, false, false, true, "speaking")]
    public void DescribeRecognitionCompletionRestartDecision_ExplainsWhyRestartIsBlocked(
        bool running,
        VoiceActivationMode mode,
        bool restartInProgress,
        bool awaitingReply,
        bool isSpeaking,
        string expected)
    {
        var description = VoiceServiceTransportLogic.DescribeRecognitionCompletionRestartDecision(
            running,
            mode,
            restartInProgress,
            awaitingReply,
            isSpeaking);

        Assert.Equal(expected, description);
    }

    [Theory]
    [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, false, true)]
    [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, false, false, false, false, false)]
    [InlineData(SpeechRecognitionResultStatus.Success, false, false, false, false, false, false)]
    [InlineData(SpeechRecognitionResultStatus.Success, false, true, false, false, false, false)]
    [InlineData(SpeechRecognitionResultStatus.UserCanceled, true, false, false, false, false, false)]
    [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, true, false, false, false)]
    [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, true, false, false)]
    [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, true, false)]
    public void ShouldRebuildRecognitionAfterCompletion_RebuildsOnlyForUserCanceledWithoutActivity(
        SpeechRecognitionResultStatus status,
        bool sessionHadActivity,
        bool sessionHadCaptureSignal,
        bool restartInProgress,
        bool awaitingReply,
        bool isSpeaking,
        bool expected)
    {
        var shouldRebuild = VoiceServiceTransportLogic.ShouldRebuildRecognitionAfterCompletion(
            status,
            sessionHadActivity,
            sessionHadCaptureSignal,
            restartInProgress,
            awaitingReply,
            isSpeaking);

        Assert.Equal(expected, shouldRebuild);
    }

    [Theory]
    [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, false, false, "capture-signal-without-recognition")]
    [InlineData(SpeechRecognitionResultStatus.UserCanceled, false, false, false, false, false, "user-canceled-without-activity")]
    [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, false, false, false, false, "disabled-official-session-restart-only (status=TimeoutExceeded)")]
    [InlineData(SpeechRecognitionResultStatus.Success, false, false, false, false, false, "disabled-official-session-restart-only (status=Success)")]
    [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, true, true, false, false, false, "session-had-activity")]
    [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, true, false, false, "controlled-restart-in-progress")]
    [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, true, false, "awaiting-reply")]
    [InlineData(SpeechRecognitionResultStatus.TimeoutExceeded, false, true, false, false, true, "speaking")]
    public void DescribeRecognitionCompletionRebuildDecision_ExplainsWhyRebuildIsBlocked(
        SpeechRecognitionResultStatus status,
        bool sessionHadActivity,
        bool sessionHadCaptureSignal,
        bool restartInProgress,
        bool awaitingReply,
        bool isSpeaking,
        string expected)
    {
        var description = VoiceServiceTransportLogic.DescribeRecognitionCompletionRebuildDecision(
            status,
            sessionHadActivity,
            sessionHadCaptureSignal,
            restartInProgress,
            awaitingReply,
            isSpeaking);

        Assert.Equal(expected, description);
    }

    [Theory]
    [InlineData(16000, 80, 1280)]
    [InlineData(16000, 0, 1280)]
    [InlineData(0, 80, 1280)]
    [InlineData(48000, 20, 960)]
    public void ResolveDesiredSamplesPerQuantum_UsesSpeechFriendlyDefaults(
        int sampleRateHz,
        int chunkMs,
        uint expected)
    {
        var samplesPerQuantum = VoiceCaptureMath.ResolveDesiredSamplesPerQuantum(sampleRateHz, chunkMs);

        Assert.Equal(expected, samplesPerQuantum);
    }

    /// <summary>
    /// Rows for <see cref="ComputePeakLevel_FindsLargestAbsoluteFloatSample"/>: a raw byte buffer
    /// and the expected peak level.
    /// </summary>
    public static IEnumerable<object[]> PeakLevelCases()
    {
        yield return new object[] { new byte[] { 0, 0, 0, 0 }, 0f };
        yield return new object[] { new byte[] { 0, 0, 0, 63 }, 0.5f };
        yield return new object[] { new byte[] { 0, 0, 128, 63, 0, 0, 0, 191 }, 1f };
    }

    [Theory]
    [MemberData(nameof(PeakLevelCases))]
    public void ComputePeakLevel_FindsLargestAbsoluteFloatSample(byte[] data, float expected)
    {
        var peak = VoiceCaptureMath.ComputePeakLevel(data);

        // Compared to three decimal places.
        Assert.Equal(expected, peak, 3);
    }

    [Theory]
    [InlineData("Now again testing", "again testing", 1, true, "Now again testing")]
    [InlineData("again testing", "again testing", 1, false, "again testing")]
    [InlineData("Now again testing", "again testing", 3, false, "again testing")]
    [InlineData("This is different", "again testing", 1, false, "again testing")]
    public void SelectRecognizedText_PromotesRecentLongerHypothesisWhenFinalLooksTruncated(
        string hypothesis,
        string recognized,
        int hypothesisAgeSeconds,
        bool expectedPromoted,
        string expected)
    {
        var nowUtc = new DateTime(2026, 3, 25, 16, 45, 30, DateTimeKind.Utc);
        var hypothesisSeenAtUtc = nowUtc.AddSeconds(-hypothesisAgeSeconds);

        var selected = VoiceServiceTransportLogic.SelectRecognizedText(
            recognized,
            hypothesis,
            hypothesisSeenAtUtc,
            nowUtc,
            out var promotedHypothesis);

        Assert.Equal(expected, selected);
        Assert.Equal(expectedPromoted, promotedHypothesis);
    }

    [Theory]
    [InlineData(true, "Now again testing", 1, "Now again testing")]
    [InlineData(true, "Now again testing", 3, null)]
    [InlineData(false, "Now again testing", 1, null)]
    [InlineData(true, "", 1, null)]
    public void SelectCompletionFallbackText_PromotesRecentHypothesisWhenSessionHadActivity(
        bool sessionHadActivity,
        string hypothesis,
        int hypothesisAgeSeconds,
        string? expected)
    {
        var nowUtc = new DateTime(2026, 3, 25, 21, 36, 35, DateTimeKind.Utc);
        var hypothesisSeenAtUtc = nowUtc.AddSeconds(-hypothesisAgeSeconds);

        var fallback = VoiceServiceTransportLogic.SelectCompletionFallbackText(
            sessionHadActivity,
            hypothesis,
            hypothesisSeenAtUtc,
            nowUtc);

        Assert.Equal(expected, fallback);
    }

    [Theory]
    [InlineData(false, false, false, true)]
    [InlineData(true, false, false, false)]
    [InlineData(false, true, false, false)]
    [InlineData(false, false, true, false)]
    public void ShouldClearTranscriptDraftAfterCompletion_ClearsOnlyWhenNoReplyOrFallbackInFlight(
        bool awaitingReply,
        bool isSpeaking,
        bool usedFallbackTranscript,
        bool expected)
    {
        var shouldClear = VoiceServiceTransportLogic.ShouldClearTranscriptDraftAfterCompletion(
            awaitingReply,
            isSpeaking,
            usedFallbackTranscript);

        Assert.Equal(expected, shouldClear);
    }

    [Theory]
    [InlineData(true, false, false, false, true)]
    [InlineData(false, false, false, false, false)]
    [InlineData(true, true, false, false, false)]
    [InlineData(true, false, true, false, false)]
    [InlineData(true, false, false, true, false)]
    public void ShouldRepromptAfterIncompleteRecognition_OnlyPromptsWhenSpeechWasHeardButNothingUsableSurvived(
        bool sessionHadActivity,
        bool awaitingReply,
        bool isSpeaking,
        bool usedFallbackTranscript,
        bool expected)
    {
        var shouldReprompt = VoiceServiceTransportLogic.ShouldRepromptAfterIncompleteRecognition(
            sessionHadActivity,
            awaitingReply,
            isSpeaking,
            usedFallbackTranscript);

        Assert.Equal(expected, shouldReprompt);
    }

    [Theory]
    [InlineData(true, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Default, true)]
    [InlineData(true, VoiceActivationMode.TalkMode, "", AudioDeviceRole.Default, true)]
    [InlineData(true, VoiceActivationMode.TalkMode, "device-1", AudioDeviceRole.Default, false)]
    [InlineData(true, VoiceActivationMode.VoiceWake, null, AudioDeviceRole.Default, false)]
    [InlineData(false, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Default, false)]
    [InlineData(true, VoiceActivationMode.TalkMode, null, AudioDeviceRole.Communications, false)]
    public void ShouldRefreshRecognitionForDefaultCaptureDeviceChange_OnlyRefreshesTalkModeUsingSystemDefaultMic(
        bool running,
        VoiceActivationMode mode,
        string? configuredInputDeviceId,
        AudioDeviceRole role,
        bool expected)
    {
        var shouldRefresh = VoiceServiceTransportLogic.ShouldRefreshRecognitionForDefaultCaptureDeviceChange(
            running,
            mode,
            configuredInputDeviceId,
            role);

        Assert.Equal(expected, shouldRefresh);
    }
}

View File

@ -0,0 +1,31 @@
using OpenClawTray.Windows;
namespace OpenClaw.Tray.Tests;
/// <summary>
/// Checks the JavaScript snippets produced by <see cref="WebChatVoiceDomBridge"/>.
/// </summary>
public class WebChatWindowDomBridgeTests
{
    [Fact]
    public void BuildSetDraftScript_ClearsWhenDraftIsBlank()
    {
        // An empty draft yields the clear-draft call, not a set-draft call.
        const string expectedScript = "window.__openClawTrayVoice?.clearDraft?.();";

        var actualScript = WebChatVoiceDomBridge.BuildSetDraftScript(string.Empty);

        Assert.Equal(expectedScript, actualScript);
    }

    [Fact]
    public void BuildSetDraftScript_SerializesDraftText()
    {
        var actualScript = WebChatVoiceDomBridge.BuildSetDraftScript("hello from voice");

        // The draft text appears in the script as a double-quoted literal.
        Assert.Contains("setDraft", actualScript);
        Assert.Contains("\"hello from voice\"", actualScript);
    }

    [Fact]
    public void DocumentCreatedScript_ClearsLegacyTurnsHost()
    {
        var documentCreatedScript = WebChatVoiceDomBridge.DocumentCreatedScript;

        Assert.Contains("openclaw-tray-voice-turns", documentCreatedScript);
        Assert.Contains("clearLegacyTurnsHost", documentCreatedScript);
        Assert.Equal("window.__openClawTrayVoice?.setTurns?.([]);", WebChatVoiceDomBridge.ClearLegacyTurnsScript);
    }
}