From e0c40985a718453904dd36120c5e712c51667bce Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9gis=20Brid?= <36547063+RBrid@users.noreply.github.com> Date: Fri, 1 May 2026 11:31:58 -0700 Subject: [PATCH] feat: add Windows node text-to-speech (#253) Adds a focused Windows node text-to-speech capability as the first stable voice-support primitive. - adds the shared `tts.speak` capability and MCP/gateway documentation - wires Windows and ElevenLabs TTS behind opt-in tray settings - protects the ElevenLabs API key with DPAPI - adds shared and tray tests for capability behavior, settings, and ElevenLabs requests This lands the focused TTS foundation from the broader Voice Mode discussion in #120 so remaining voice UX/STT/repeater work can build on top in smaller follow-up PRs. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 33 +-- docs/MCP_MODE.md | 2 +- docs/WINDOWS_NODE_TESTING.md | 2 + docs/gateway-node-integration.md | 2 + .../Capabilities/TtsCapability.cs | 108 +++++++++ src/OpenClaw.Shared/Mcp/McpToolBridge.cs | 4 + src/OpenClaw.Shared/Models.cs | 7 +- src/OpenClaw.Shared/SettingsData.cs | 5 + .../OpenClaw.Tray.WinUI.csproj | 1 + .../Services/NodeService.cs | 21 ++ .../Services/SettingsManager.cs | 73 ++++++ .../ElevenLabsTextToSpeechClient.cs | 116 ++++++++++ .../TextToSpeech/TextToSpeechService.cs | 208 ++++++++++++++++++ .../Windows/SettingsWindow.xaml | 26 +++ .../Windows/SettingsWindow.xaml.cs | 53 +++++ .../OpenClaw.Shared.Tests/CapabilityTests.cs | 172 +++++++++++++++ .../McpToolBridgeTests.cs | 2 + tests/OpenClaw.Shared.Tests/ModelsTests.cs | 2 + .../ElevenLabsTextToSpeechClientTests.cs | 137 ++++++++++++ .../OpenClaw.Tray.Tests.csproj | 4 +- .../SettingsRoundTripTests.cs | 38 ++++ .../TrayMenuWindowMarkupTests.cs | 4 + 22 files changed, 1002 insertions(+), 18 deletions(-) create mode 100644 src/OpenClaw.Shared/Capabilities/TtsCapability.cs create mode 100644 src/OpenClaw.Tray.WinUI/Services/TextToSpeech/ElevenLabsTextToSpeechClient.cs create mode 100644 src/OpenClaw.Tray.WinUI/Services/TextToSpeech/TextToSpeechService.cs create mode 100644 tests/OpenClaw.Tray.Tests/ElevenLabsTextToSpeechClientTests.cs diff --git a/README.md b/README.md index 0a455ca..536b491 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,7 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t | **Camera** | `camera.list`, `camera.snap`, `camera.clip` | Enumerate cameras and capture still photos or short video clips | | **Location** | `location.get` | Return Windows geolocation when permission is available | | **Device** | `device.info`, `device.status` | Return Windows host/app metadata and lightweight status | +| **Text-to-speech** | `tts.speak` | Speak text aloud through Windows speech synthesis, or ElevenLabs when configured | #### Node Setup @@ -205,23 +206,24 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t "canvas.hide", "canvas.navigate", "canvas.eval", - "canvas.snapshot", - "canvas.a2ui.push", - "canvas.a2ui.pushJSONL", - "canvas.a2ui.reset", - "screen.snapshot", - "camera.list", - "camera.snap", - "camera.clip", - "location.get", - "device.info", - "device.status" + "canvas.snapshot", + "canvas.a2ui.push", + "canvas.a2ui.pushJSONL", + "canvas.a2ui.reset", + "screen.snapshot", + "camera.list", + "camera.snap", + "camera.clip", + "location.get", + "device.info", + "device.status", + "tts.speak" ] - } - } + } + } } ``` - > ⚠️ **Important**: The gateway has a server-side allowlist. Commands must be listed explicitly - wildcards like `canvas.*` don't work! Privacy-sensitive commands such as `screen.record` should only be added to `allowCommands` when you explicitly want to allow them. + > ⚠️ **Important**: The gateway has a server-side allowlist. Commands must be listed explicitly - wildcards like `canvas.*` don't work! Privacy-sensitive commands such as `screen.record` and agent-driven audio playback via `tts.speak` should only be added to `allowCommands` when you explicitly want to allow them. 5. **Test it** from your Mac/gateway: ```bash @@ -249,6 +251,9 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t # Take a photo (NV12/MediaCapture fallback) openclaw nodes invoke --node --command camera.snap --params '{"deviceId":"","format":"jpeg","quality":80}' + # Speak text aloud on the Windows node (requires TTS enabled in Settings and tts.speak allowed on the gateway) + openclaw nodes invoke --node --command tts.speak --params '{"text":"Hello from OpenClaw","provider":"windows"}' + # Execute a command on the Windows node openclaw nodes invoke --node --command system.run --params '{"command":"Get-Process | Select -First 5","shell":"powershell","timeoutMs":10000}' diff --git a/docs/MCP_MODE.md b/docs/MCP_MODE.md index 52bfedf..3491a0c 100644 --- a/docs/MCP_MODE.md +++ b/docs/MCP_MODE.md @@ -4,7 +4,7 @@ ## Summary -The Windows tray app now ships a **local Model Context Protocol (MCP) server** alongside its existing OpenClaw gateway client. The same node capabilities the agent reaches over the OpenClaw gateway WebSocket — `system.run`, `screen.snapshot`, `canvas.*`, `camera.list`, `camera.snap`, `camera.clip`, `location.get`, `system.notify`, `system.execApprovals.*` — are advertised, on the same machine, as MCP tools over `http://127.0.0.1:8765/`. +The Windows tray app now ships a **local Model Context Protocol (MCP) server** alongside its existing OpenClaw gateway client. The same node capabilities the agent reaches over the OpenClaw gateway WebSocket — `system.run`, `screen.snapshot`, `canvas.*`, `camera.list`, `camera.snap`, `camera.clip`, `location.get`, `tts.speak`, `system.notify`, `system.execApprovals.*` — are advertised, on the same machine, as MCP tools over `http://127.0.0.1:8765/`. This means any local MCP client (Claude Desktop, Claude Code, Cursor, an MCP-aware CLI, a custom dev script) can reach into the running tray and drive Windows-native capabilities directly, without an OpenClaw gateway in the loop. The tray app can run in **MCP-only mode** with no gateway connection at all. diff --git a/docs/WINDOWS_NODE_TESTING.md b/docs/WINDOWS_NODE_TESTING.md index 2f60127..2c20c29 100644 --- a/docs/WINDOWS_NODE_TESTING.md +++ b/docs/WINDOWS_NODE_TESTING.md @@ -61,6 +61,7 @@ These features need the gateway to send `node.invoke` commands: | `location.get` | Get Windows location | Uses Windows location permission/settings | | `device.info` / `device.status` | Device metadata/status | Returns host/app/locale plus battery/storage/network/uptime payloads | | `browser.proxy` | Proxy browser-control host requests | Requires Browser proxy bridge enabled, a compatible browser-control host listening on gateway port + 2, and matching browser-control auth | +| `tts.speak` | Speak text aloud | Requires Text-to-speech playback enabled in Settings; gateway mode also requires `tts.speak` in `gateway.nodes.allowCommands` | ## Capabilities Advertised @@ -72,6 +73,7 @@ When the node connects, it advertises these capabilities: - `location` - Windows.Devices.Geolocation - `device` - Host/app metadata and lightweight status - `browser` - Local `browser.proxy` bridge to a browser-control host on gateway port + 2, when enabled in Settings +- `tts` - Windows speech synthesis or ElevenLabs playback, when enabled in Settings ## Security Features diff --git a/docs/gateway-node-integration.md b/docs/gateway-node-integration.md index ee62390..17fe3a9 100644 --- a/docs/gateway-node-integration.md +++ b/docs/gateway-node-integration.md @@ -79,6 +79,8 @@ Add ALL needed commands to `gateway.nodes.allowCommands` in `~/.openclaw/opencla // Device metadata/status "device.info", "device.status", + // Text-to-speech playback (enable only when agent-driven audio is desired) + "tts.speak", // System (already in Windows defaults, but listed for completeness) // "system.run", // "system.run.prepare", diff --git a/src/OpenClaw.Shared/Capabilities/TtsCapability.cs b/src/OpenClaw.Shared/Capabilities/TtsCapability.cs new file mode 100644 index 0000000..c640782 --- /dev/null +++ b/src/OpenClaw.Shared/Capabilities/TtsCapability.cs @@ -0,0 +1,108 @@ +using System; +using System.Collections.Generic; +using System.Threading; +using System.Threading.Tasks; + +namespace OpenClaw.Shared.Capabilities; + +public sealed class TtsCapability : NodeCapabilityBase +{ + public const string SpeakCommand = "tts.speak"; + public const string WindowsProvider = "windows"; + public const string ElevenLabsProvider = "elevenlabs"; + public const int MaxTextLength = 5000; + + private static readonly string[] _commands = [SpeakCommand]; + + public override string Category => "tts"; + public override IReadOnlyList Commands => _commands; + + public event Func>? SpeakRequested; + + public TtsCapability(IOpenClawLogger logger) : base(logger) + { + } + + public static string ResolveProvider(string? requestedProvider, string? configuredProvider) + { + var provider = string.IsNullOrWhiteSpace(requestedProvider) + ? configuredProvider + : requestedProvider; + + return string.IsNullOrWhiteSpace(provider) + ? WindowsProvider + : provider.Trim().ToLowerInvariant(); + } + + public override Task ExecuteAsync(NodeInvokeRequest request) + => ExecuteAsync(request, CancellationToken.None); + + public override async Task ExecuteAsync( + NodeInvokeRequest request, + CancellationToken cancellationToken) + { + if (!string.Equals(request.Command, SpeakCommand, StringComparison.Ordinal)) + return Error($"Unknown command: {request.Command}"); + + var text = GetStringArg(request.Args, "text")?.Trim(); + if (string.IsNullOrWhiteSpace(text)) + return Error("Missing required text"); + if (text.Length > MaxTextLength) + return Error($"TTS text exceeds {MaxTextLength} characters."); + + if (SpeakRequested == null) + return Error("TTS speak not available"); + + var args = new TtsSpeakArgs + { + Text = text, + Provider = NormalizeOptional(GetStringArg(request.Args, "provider")), + VoiceId = NormalizeOptional(GetStringArg(request.Args, "voiceId")), + Model = NormalizeOptional(GetStringArg(request.Args, "model")), + Interrupt = GetBoolArg(request.Args, "interrupt") + }; + + Logger.Info($"tts.speak: provider={args.Provider ?? "(default)"}, chars={args.Text.Length}, interrupt={args.Interrupt}"); + + try + { + var result = await SpeakRequested(args, cancellationToken).ConfigureAwait(false); + return Success(new + { + spoken = result.Spoken, + provider = result.Provider, + contentType = result.ContentType, + durationMs = result.DurationMs + }); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) + { + return Error("Speak canceled"); + } + catch (Exception ex) + { + Logger.Error("TTS speak failed", ex); + return Error($"Speak failed: {ex.Message}"); + } + } + + private static string? NormalizeOptional(string? value) + => string.IsNullOrWhiteSpace(value) ? null : value.Trim(); +} + +public sealed class TtsSpeakArgs +{ + public string Text { get; set; } = ""; + public string? Provider { get; set; } + public string? VoiceId { get; set; } + public string? Model { get; set; } + public bool Interrupt { get; set; } +} + +public sealed class TtsSpeakResult +{ + public bool Spoken { get; set; } = true; + public string Provider { get; set; } = TtsCapability.WindowsProvider; + public string? ContentType { get; set; } + public int? DurationMs { get; set; } +} diff --git a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs index 3957c8b..4dba2ca 100644 --- a/src/OpenClaw.Shared/Mcp/McpToolBridge.cs +++ b/src/OpenClaw.Shared/Mcp/McpToolBridge.cs @@ -235,6 +235,10 @@ public class McpToolBridge "Capture a still photo from a camera. Args: deviceId (string, optional — defaults to system default camera), format ('jpeg'|'png', default 'jpeg'), maxWidth (int, default 1280), quality (int 1-100, default 80). Returns { format, width, height, base64 }.", ["camera.clip"] = "Record a short clip from a camera. Args: deviceId (string, optional), durationMs (int, required, max 60000), format ('mp4'|'webm', default 'mp4'), maxWidth (int, default 1280). Returns { format, durationMs, base64 }.", + + // tts.* + ["tts.speak"] = + "Speak text aloud on the Windows node. Args: text (string, required), provider ('windows'|'elevenlabs', optional), voiceId (string, optional), model (string, optional), interrupt (bool, default false). Returns { spoken, provider, contentType, durationMs }.", }; private async Task HandleToolsCallAsync(JsonElement parameters, CancellationToken cancellationToken) diff --git a/src/OpenClaw.Shared/Models.cs b/src/OpenClaw.Shared/Models.cs index 0d7888a..c71fac0 100644 --- a/src/OpenClaw.Shared/Models.cs +++ b/src/OpenClaw.Shared/Models.cs @@ -1023,7 +1023,8 @@ public static class CommandCenterCommandGroups [ "camera.snap", "camera.clip", - "screen.record" + "screen.record", + "tts.speak" ]; public static readonly FrozenSet DangerousCommandSet = @@ -1046,7 +1047,9 @@ public static class CommandCenterCommandGroups public static readonly string[] MacNodeParityCommands = [ .. SafeCompanionCommands, - .. DangerousCommands, + "camera.snap", + "camera.clip", + "screen.record", "system.notify", "system.run", "system.which", diff --git a/src/OpenClaw.Shared/SettingsData.cs b/src/OpenClaw.Shared/SettingsData.cs index 6f3fe49..a98aaa2 100644 --- a/src/OpenClaw.Shared/SettingsData.cs +++ b/src/OpenClaw.Shared/SettingsData.cs @@ -34,6 +34,11 @@ public class SettingsData public bool NodeCameraEnabled { get; set; } = true; public bool NodeLocationEnabled { get; set; } = true; public bool NodeBrowserProxyEnabled { get; set; } = true; + public bool NodeTtsEnabled { get; set; } = false; + public string TtsProvider { get; set; } = "windows"; + public string? TtsElevenLabsApiKey { get; set; } + public string? TtsElevenLabsModel { get; set; } + public string? TtsElevenLabsVoiceId { get; set; } /// Run the local MCP HTTP server. Independent of EnableNodeMode. public bool EnableMcpServer { get; set; } = false; /// diff --git a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj index 6f2f15f..96dc8ff 100644 --- a/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj +++ b/src/OpenClaw.Tray.WinUI/OpenClaw.Tray.WinUI.csproj @@ -57,6 +57,7 @@ + diff --git a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs index 2d9a6ac..f6abf77 100644 --- a/src/OpenClaw.Tray.WinUI/Services/NodeService.cs +++ b/src/OpenClaw.Tray.WinUI/Services/NodeService.cs @@ -69,6 +69,8 @@ public sealed class NodeService : IDisposable private LocationCapability? _locationCapability; private DeviceCapability? _deviceCapability; private BrowserProxyCapability? _browserProxyCapability; + private TtsCapability? _ttsCapability; + private TextToSpeechService? _textToSpeechService; private readonly string _dataPath; private string? _token; @@ -282,6 +284,14 @@ public sealed class NodeService : IDisposable Register(_locationCapability); } + if (_settings?.NodeTtsEnabled == true) + { + _textToSpeechService ??= new TextToSpeechService(_logger, _settings); + _ttsCapability = new TtsCapability(_logger); + _ttsCapability.SpeakRequested += OnTtsSpeakAsync; + Register(_ttsCapability); + } + // Device metadata/status capability _deviceCapability = new DeviceCapability(_logger); Register(_deviceCapability); @@ -447,6 +457,8 @@ public sealed class NodeService : IDisposable disabled.AddRange(CommandCenterCommandGroups.SafeCompanionCommands.Where(command => command.StartsWith("location.", StringComparison.OrdinalIgnoreCase))); if (_settings?.NodeBrowserProxyEnabled == false) disabled.Add("browser.proxy"); + if (_settings?.NodeTtsEnabled != true) + disabled.AddRange(CommandCenterCommandGroups.DangerousCommands.Where(command => command.StartsWith("tts.", StringComparison.OrdinalIgnoreCase))); return disabled; } @@ -1265,6 +1277,14 @@ public sealed class NodeService : IDisposable TimestampMs = position.Coordinate.Timestamp.ToUnixTimeMilliseconds() }; } + + private Task OnTtsSpeakAsync(TtsSpeakArgs args, CancellationToken cancellationToken) + { + if (_textToSpeechService == null) + throw new InvalidOperationException("Text-to-speech service not available"); + + return _textToSpeechService.SpeakAsync(args, cancellationToken); + } #endregion @@ -1278,6 +1298,7 @@ public sealed class NodeService : IDisposable try { _cameraCaptureService?.Dispose(); } catch { /* ignore */ } try { _screenRecordingService?.Dispose(); } catch { /* ignore */ } + try { _textToSpeechService?.Dispose(); } catch { /* ignore */ } // MediaResolver owns SocketsHttpHandler + HttpClient (disposeHandler:true); // without disposal the connection pool survives node teardown/recreate. try { _mediaResolver?.Dispose(); } catch { /* ignore */ } diff --git a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs index 701fe63..7d7f277 100644 --- a/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs +++ b/src/OpenClaw.Tray.WinUI/Services/SettingsManager.cs @@ -1,5 +1,7 @@ using System; using System.IO; +using System.Security.Cryptography; +using System.Text; using System.Text.Json; using OpenClaw.Shared; @@ -17,6 +19,8 @@ public class SettingsManager private readonly string _settingsDirectory; private readonly string _settingsFilePath; + private const string ProtectedSecretPrefix = "dpapi:"; + private static readonly byte[] ProtectedSecretEntropy = Encoding.UTF8.GetBytes("OpenClawTray.Settings.v1"); public static string SettingsDirectoryPath => DefaultSettingsDirectory; public static string SettingsPath => DefaultSettingsFilePath; @@ -61,6 +65,11 @@ public class SettingsManager public bool NodeCameraEnabled { get; set; } = true; public bool NodeLocationEnabled { get; set; } = true; public bool NodeBrowserProxyEnabled { get; set; } = true; + public bool NodeTtsEnabled { get; set; } = false; + public string TtsProvider { get; set; } = "windows"; + public string TtsElevenLabsApiKey { get; set; } = ""; + public string TtsElevenLabsModel { get; set; } = ""; + public string TtsElevenLabsVoiceId { get; set; } = ""; // Local MCP HTTP server (independent of EnableNodeMode) public bool EnableMcpServer { get; set; } = false; /// @@ -132,6 +141,11 @@ public class SettingsManager NodeCameraEnabled = loaded.NodeCameraEnabled; NodeLocationEnabled = loaded.NodeLocationEnabled; NodeBrowserProxyEnabled = loaded.NodeBrowserProxyEnabled; + NodeTtsEnabled = loaded.NodeTtsEnabled; + TtsProvider = string.IsNullOrWhiteSpace(loaded.TtsProvider) ? TtsProvider : loaded.TtsProvider; + TtsElevenLabsApiKey = UnprotectSettingSecret(loaded.TtsElevenLabsApiKey) ?? TtsElevenLabsApiKey; + TtsElevenLabsModel = loaded.TtsElevenLabsModel ?? TtsElevenLabsModel; + TtsElevenLabsVoiceId = loaded.TtsElevenLabsVoiceId ?? TtsElevenLabsVoiceId; EnableMcpServer = loaded.EnableMcpServer; A2UIImageHosts = loaded.A2UIImageHosts ?? new List(); // Legacy McpOnlyMode migration: @@ -200,6 +214,11 @@ public class SettingsManager NodeCameraEnabled = NodeCameraEnabled, NodeLocationEnabled = NodeLocationEnabled, NodeBrowserProxyEnabled = NodeBrowserProxyEnabled, + NodeTtsEnabled = NodeTtsEnabled, + TtsProvider = TtsProvider, + TtsElevenLabsApiKey = ProtectSettingSecret(TtsElevenLabsApiKey), + TtsElevenLabsModel = string.IsNullOrWhiteSpace(TtsElevenLabsModel) ? null : TtsElevenLabsModel, + TtsElevenLabsVoiceId = string.IsNullOrWhiteSpace(TtsElevenLabsVoiceId) ? null : TtsElevenLabsVoiceId, EnableMcpServer = EnableMcpServer, A2UIImageHosts = A2UIImageHosts.Count == 0 ? null : new List(A2UIImageHosts), // McpOnlyMode is legacy — never written; remains null in serialized output. @@ -221,6 +240,60 @@ public class SettingsManager } } + internal static string? ProtectSettingSecret(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + return null; + + if (!OperatingSystem.IsWindows()) + throw new PlatformNotSupportedException("Windows Data Protection API is required for protected settings secrets."); + + var bytes = Encoding.UTF8.GetBytes(value); + var protectedBytes = ProtectedData.Protect(bytes, ProtectedSecretEntropy, DataProtectionScope.CurrentUser); + return ProtectedSecretPrefix + Convert.ToBase64String(protectedBytes); + } + + internal static string? UnprotectSettingSecret(string? value) + { + if (string.IsNullOrWhiteSpace(value)) + return value; + if (!value.StartsWith(ProtectedSecretPrefix, StringComparison.Ordinal)) + return value; + + if (!OperatingSystem.IsWindows()) + { + Logger.Warn("Failed to decrypt protected settings secret: Windows Data Protection API is unavailable."); + return null; + } + + try + { + var protectedBytes = Convert.FromBase64String(value[ProtectedSecretPrefix.Length..]); + var bytes = ProtectedData.Unprotect(protectedBytes, ProtectedSecretEntropy, DataProtectionScope.CurrentUser); + return Encoding.UTF8.GetString(bytes); + } + catch (FormatException ex) + { + Logger.Warn($"Failed to decode protected settings secret: {ex.Message}"); + return null; + } + catch (CryptographicException ex) + { + Logger.Warn($"Failed to decrypt protected settings secret: {ex.Message}"); + return null; + } + catch (NotSupportedException ex) + { + Logger.Warn($"Failed to decrypt protected settings secret: {ex.Message}"); + return null; + } + catch (ArgumentException ex) + { + Logger.Warn($"Failed to decrypt protected settings secret: {ex.Message}"); + return null; + } + } + public string GetEffectiveGatewayUrl() { if (!UseSshTunnel) diff --git a/src/OpenClaw.Tray.WinUI/Services/TextToSpeech/ElevenLabsTextToSpeechClient.cs b/src/OpenClaw.Tray.WinUI/Services/TextToSpeech/ElevenLabsTextToSpeechClient.cs new file mode 100644 index 0000000..864a6cc --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Services/TextToSpeech/ElevenLabsTextToSpeechClient.cs @@ -0,0 +1,116 @@ +using System; +using System.Net; +using System.Net.Http; +using System.Net.Http.Headers; +using System.Text; +using System.Text.Json; +using System.Threading; +using System.Threading.Tasks; +using OpenClaw.Shared.Capabilities; + +namespace OpenClawTray.Services; + +public sealed class ElevenLabsSynthesisRequest +{ + public string ApiKey { get; set; } = ""; + public string VoiceId { get; set; } = ""; + public string Text { get; set; } = ""; + public string? ModelId { get; set; } +} + +public sealed class ElevenLabsSynthesisResult +{ + public byte[] AudioBytes { get; set; } = []; + public string ContentType { get; set; } = "audio/mpeg"; +} + +public sealed class ElevenLabsTextToSpeechClient : IDisposable +{ + private const string DefaultBaseUrl = "https://api.elevenlabs.io"; + public const int MaxTextLength = TtsCapability.MaxTextLength; + internal static readonly TimeSpan DefaultTimeout = TimeSpan.FromSeconds(30); + private readonly HttpClient _httpClient; + private readonly bool _ownsHttpClient; + private readonly Uri _baseUri; + + internal TimeSpan Timeout => _httpClient.Timeout; + + public ElevenLabsTextToSpeechClient() + : this(new HttpClient(), ownsHttpClient: true, baseUrl: DefaultBaseUrl) + { + } + + public ElevenLabsTextToSpeechClient(HttpMessageHandler handler, string baseUrl = DefaultBaseUrl) + : this(new HttpClient(handler), ownsHttpClient: true, baseUrl) + { + } + + private ElevenLabsTextToSpeechClient(HttpClient httpClient, bool ownsHttpClient, string baseUrl) + { + _httpClient = httpClient; + _httpClient.Timeout = DefaultTimeout; + _ownsHttpClient = ownsHttpClient; + _baseUri = new Uri(baseUrl.TrimEnd('/') + "/", UriKind.Absolute); + } + + public async Task SynthesizeAsync( + ElevenLabsSynthesisRequest request, + CancellationToken cancellationToken = default) + { + if (string.IsNullOrWhiteSpace(request.ApiKey)) + throw new InvalidOperationException("ElevenLabs API key is required."); + if (string.IsNullOrWhiteSpace(request.VoiceId)) + throw new InvalidOperationException("ElevenLabs voice ID is required."); + if (string.IsNullOrWhiteSpace(request.Text)) + throw new InvalidOperationException("Text is required."); + if (request.Text.Length > MaxTextLength) + throw new InvalidOperationException($"ElevenLabs TTS text exceeds {MaxTextLength} characters."); + + var path = $"v1/text-to-speech/{Uri.EscapeDataString(request.VoiceId.Trim())}"; + using var httpRequest = new HttpRequestMessage(HttpMethod.Post, new Uri(_baseUri, path)); + httpRequest.Headers.Add("xi-api-key", request.ApiKey.Trim()); + httpRequest.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("audio/mpeg")); + + var body = JsonSerializer.Serialize(new + { + text = request.Text, + model_id = string.IsNullOrWhiteSpace(request.ModelId) ? null : request.ModelId.Trim() + }); + httpRequest.Content = new StringContent(body, Encoding.UTF8, "application/json"); + + using var response = await _httpClient.SendAsync( + httpRequest, + HttpCompletionOption.ResponseHeadersRead, + cancellationToken).ConfigureAwait(false); + var bytes = await response.Content.ReadAsByteArrayAsync(cancellationToken).ConfigureAwait(false); + + if (!response.IsSuccessStatusCode) + throw new InvalidOperationException(BuildFailureMessage(response.StatusCode, bytes)); + if (bytes.Length == 0) + throw new InvalidOperationException("ElevenLabs returned an empty audio response."); + + return new ElevenLabsSynthesisResult + { + AudioBytes = bytes, + ContentType = response.Content.Headers.ContentType?.MediaType ?? "audio/mpeg" + }; + } + + internal static string BuildFailureMessage(HttpStatusCode statusCode, byte[] bodyBytes) + { + var body = Encoding.UTF8.GetString(bodyBytes); + if (body.Length > 300) + body = body[..300]; + body = body.Trim(); + + return string.IsNullOrEmpty(body) + ? $"ElevenLabs TTS failed with HTTP {(int)statusCode} ({statusCode})." + : $"ElevenLabs TTS failed with HTTP {(int)statusCode} ({statusCode}): {body}"; + } + + public void Dispose() + { + if (_ownsHttpClient) + _httpClient.Dispose(); + } +} diff --git a/src/OpenClaw.Tray.WinUI/Services/TextToSpeech/TextToSpeechService.cs b/src/OpenClaw.Tray.WinUI/Services/TextToSpeech/TextToSpeechService.cs new file mode 100644 index 0000000..13be3bf --- /dev/null +++ b/src/OpenClaw.Tray.WinUI/Services/TextToSpeech/TextToSpeechService.cs @@ -0,0 +1,208 @@ +using System; +using System.Diagnostics; +using System.Linq; +using System.Threading; +using System.Threading.Tasks; +using OpenClaw.Shared; +using OpenClaw.Shared.Capabilities; +using Windows.Media.Core; +using Windows.Media.Playback; +using Windows.Media.SpeechSynthesis; +using Windows.Storage.Streams; + +namespace OpenClawTray.Services; + +public sealed class TextToSpeechService : IDisposable +{ + private readonly IOpenClawLogger _logger; + private readonly SettingsManager _settings; + private readonly ElevenLabsTextToSpeechClient _elevenLabsClient; + private readonly SemaphoreSlim _playbackGate = new(1, 1); + private readonly object _activeLock = new(); + private MediaPlayer? _activePlayer; + private TaskCompletionSource? _activeCompletion; + + public TextToSpeechService(IOpenClawLogger logger, SettingsManager settings) + : this(logger, settings, new ElevenLabsTextToSpeechClient()) + { + } + + internal TextToSpeechService( + IOpenClawLogger logger, + SettingsManager settings, + ElevenLabsTextToSpeechClient elevenLabsClient) + { + _logger = logger; + _settings = settings; + _elevenLabsClient = elevenLabsClient; + } + + public async Task SpeakAsync(TtsSpeakArgs args, CancellationToken cancellationToken = default) + { + var provider = TtsCapability.ResolveProvider(args.Provider, _settings.TtsProvider); + var stopwatch = Stopwatch.StartNew(); + + if (string.Equals(provider, TtsCapability.WindowsProvider, StringComparison.OrdinalIgnoreCase)) + { + await SpeakWithWindowsAsync(args, cancellationToken).ConfigureAwait(false); + } + else if (string.Equals(provider, TtsCapability.ElevenLabsProvider, StringComparison.OrdinalIgnoreCase)) + { + await SpeakWithElevenLabsAsync(args, cancellationToken).ConfigureAwait(false); + } + else + { + throw new InvalidOperationException($"Unsupported TTS provider '{provider}'."); + } + + stopwatch.Stop(); + return new TtsSpeakResult + { + Provider = provider, + ContentType = string.Equals(provider, TtsCapability.ElevenLabsProvider, StringComparison.OrdinalIgnoreCase) + ? "audio/mpeg" + : "audio/wav", + DurationMs = (int)Math.Min(stopwatch.ElapsedMilliseconds, int.MaxValue) + }; + } + + private async Task SpeakWithWindowsAsync(TtsSpeakArgs args, CancellationToken cancellationToken) + { + using var synthesizer = new SpeechSynthesizer(); + if (!string.IsNullOrWhiteSpace(args.VoiceId)) + { + var requestedVoice = args.VoiceId.Trim(); + var voice = SpeechSynthesizer.AllVoices.FirstOrDefault(v => + string.Equals(v.Id, requestedVoice, StringComparison.OrdinalIgnoreCase) || + string.Equals(v.DisplayName, requestedVoice, StringComparison.OrdinalIgnoreCase)); + if (voice == null) + throw new InvalidOperationException($"Windows TTS voice '{requestedVoice}' was not found."); + + synthesizer.Voice = voice; + } + + using var stream = await synthesizer + .SynthesizeTextToStreamAsync(args.Text) + .AsTask(cancellationToken) + .ConfigureAwait(false); + await PlayStreamAsync(stream, stream.ContentType, args.Interrupt, cancellationToken).ConfigureAwait(false); + } + + private async Task SpeakWithElevenLabsAsync(TtsSpeakArgs args, CancellationToken cancellationToken) + { + var apiKey = _settings.TtsElevenLabsApiKey; + if (string.IsNullOrWhiteSpace(apiKey)) + throw new InvalidOperationException("ElevenLabs API key is required in Settings."); + + var voiceId = string.IsNullOrWhiteSpace(args.VoiceId) + ? _settings.TtsElevenLabsVoiceId + : args.VoiceId; + if (string.IsNullOrWhiteSpace(voiceId)) + throw new InvalidOperationException("ElevenLabs voice ID is required in Settings or the tts.speak voiceId argument."); + + var model = string.IsNullOrWhiteSpace(args.Model) + ? _settings.TtsElevenLabsModel + : args.Model; + + var audio = await _elevenLabsClient.SynthesizeAsync(new ElevenLabsSynthesisRequest + { + ApiKey = apiKey, + VoiceId = voiceId, + Text = args.Text, + ModelId = model + }, cancellationToken).ConfigureAwait(false); + + using var stream = await CreateStreamAsync(audio.AudioBytes, cancellationToken).ConfigureAwait(false); + await PlayStreamAsync(stream, audio.ContentType, args.Interrupt, cancellationToken).ConfigureAwait(false); + } + + private static async Task CreateStreamAsync(byte[] bytes, CancellationToken cancellationToken) + { + var stream = new InMemoryRandomAccessStream(); + using var writer = new DataWriter(stream); + writer.WriteBytes(bytes); + await writer.StoreAsync().AsTask(cancellationToken).ConfigureAwait(false); + await writer.FlushAsync().AsTask(cancellationToken).ConfigureAwait(false); + writer.DetachStream(); + stream.Seek(0); + return stream; + } + + private async Task PlayStreamAsync( + IRandomAccessStream stream, + string contentType, + bool interrupt, + CancellationToken cancellationToken) + { + if (interrupt) + InterruptActivePlayback(); + + await _playbackGate.WaitAsync(cancellationToken).ConfigureAwait(false); + + MediaPlayer? player = null; + var completion = new TaskCompletionSource(TaskCreationOptions.RunContinuationsAsynchronously); + try + { + player = new MediaPlayer(); + player.MediaEnded += (_, _) => completion.TrySetResult(true); + player.MediaFailed += (_, e) => + completion.TrySetException(new InvalidOperationException($"TTS playback failed: {e.ErrorMessage}")); + player.Source = MediaSource.CreateFromStream(stream, contentType); + + lock (_activeLock) + { + _activePlayer = player; + _activeCompletion = completion; + } + + player.Play(); + + using var cancellationRegistration = cancellationToken.Register( + static state => ((TaskCompletionSource)state!).TrySetCanceled(), + completion); + await completion.Task.ConfigureAwait(false); + } + finally + { + lock (_activeLock) + { + if (ReferenceEquals(_activePlayer, player)) + { + _activePlayer = null; + _activeCompletion = null; + } + } + + if (player != null) + { + player.Pause(); + player.Source = null; + player.Dispose(); + } + + _playbackGate.Release(); + } + } + + private void InterruptActivePlayback() + { + TaskCompletionSource? completion; + lock (_activeLock) + { + completion = _activeCompletion; + } + + if (completion != null) + { + _logger.Info("Interrupting active TTS playback"); + completion.TrySetException(new InvalidOperationException("TTS playback was interrupted.")); + } + } + + public void Dispose() + { + InterruptActivePlayback(); + // Playback may still release the gate after an interrupt during shutdown. + _elevenLabsClient.Dispose(); + } +} diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml index e601254..3977df8 100644 --- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml +++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml @@ -200,6 +200,32 @@ + + + + + + + + + + + diff --git a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs index 4ad6ad6..96fecbf 100644 --- a/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs +++ b/src/OpenClaw.Tray.WinUI/Windows/SettingsWindow.xaml.cs @@ -95,6 +95,12 @@ public sealed partial class SettingsWindow : WindowEx NodeCameraToggle.IsOn = _settings.NodeCameraEnabled; NodeLocationToggle.IsOn = _settings.NodeLocationEnabled; NodeBrowserProxyToggle.IsOn = _settings.NodeBrowserProxyEnabled; + NodeTtsToggle.IsOn = _settings.NodeTtsEnabled; + SelectTtsProvider(_settings.TtsProvider); + TtsElevenLabsApiKeyPasswordBox.Password = _settings.TtsElevenLabsApiKey; + TtsElevenLabsVoiceIdTextBox.Text = _settings.TtsElevenLabsVoiceId; + TtsElevenLabsModelTextBox.Text = _settings.TtsElevenLabsModel; + UpdateTtsProviderUiState(); UpdateSshTunnelPreviewText(); McpServerToggle.IsOn = _settings.EnableMcpServer; McpUrlTextBox.Text = NodeService.McpServerUrl; @@ -387,6 +393,11 @@ public sealed partial class SettingsWindow : WindowEx _settings.NodeCameraEnabled = NodeCameraToggle.IsOn; _settings.NodeLocationEnabled = NodeLocationToggle.IsOn; _settings.NodeBrowserProxyEnabled = NodeBrowserProxyToggle.IsOn; + _settings.NodeTtsEnabled = NodeTtsToggle.IsOn; + _settings.TtsProvider = GetSelectedTtsProvider(); + _settings.TtsElevenLabsApiKey = TtsElevenLabsApiKeyPasswordBox.Password.Trim(); + _settings.TtsElevenLabsVoiceId = TtsElevenLabsVoiceIdTextBox.Text.Trim(); + _settings.TtsElevenLabsModel = TtsElevenLabsModelTextBox.Text.Trim(); _settings.EnableMcpServer = McpServerToggle.IsOn; _settings.Save(); @@ -631,6 +642,48 @@ public sealed partial class SettingsWindow : WindowEx UpdateSshTunnelPreviewText(); } + private void OnTtsProviderSelectionChanged(object sender, Microsoft.UI.Xaml.Controls.SelectionChangedEventArgs e) + { + UpdateTtsProviderUiState(); + } + + private void SelectTtsProvider(string provider) + { + for (int i = 0; i < TtsProviderComboBox.Items.Count; i++) + { + if (TtsProviderComboBox.Items[i] is Microsoft.UI.Xaml.Controls.ComboBoxItem item && + string.Equals(item.Tag?.ToString(), provider, StringComparison.OrdinalIgnoreCase)) + { + TtsProviderComboBox.SelectedIndex = i; + return; + } + } + + TtsProviderComboBox.SelectedIndex = 0; + } + + private string GetSelectedTtsProvider() + { + if (TtsProviderComboBox.SelectedItem is Microsoft.UI.Xaml.Controls.ComboBoxItem item && + item.Tag is not null) + { + return item.Tag.ToString() ?? "windows"; + } + + return "windows"; + } + + private void UpdateTtsProviderUiState() + { + if (TtsElevenLabsSettingsPanel == null) + return; + + TtsElevenLabsSettingsPanel.Visibility = + string.Equals(GetSelectedTtsProvider(), "elevenlabs", StringComparison.OrdinalIgnoreCase) + ? Visibility.Visible + : Visibility.Collapsed; + } + private void OnUseLocalGateway(object sender, RoutedEventArgs e) { UseSshTunnelToggle.IsOn = false; diff --git a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs index 472c883..161a3f7 100644 --- a/tests/OpenClaw.Shared.Tests/CapabilityTests.cs +++ b/tests/OpenClaw.Shared.Tests/CapabilityTests.cs @@ -2093,6 +2093,178 @@ public class CameraCapabilityTests } } +public class TtsCapabilityTests +{ + private static JsonElement Parse(string json) + { + using var doc = JsonDocument.Parse(json); + return doc.RootElement.Clone(); + } + + [Fact] + public void CanHandle_TtsSpeak() + { + var cap = new TtsCapability(NullLogger.Instance); + + Assert.True(cap.CanHandle("tts.speak")); + Assert.False(cap.CanHandle("tts.stop")); + Assert.Equal("tts", cap.Category); + } + + [Theory] + [InlineData("elevenlabs", "windows", "elevenlabs")] + [InlineData(" ELEVENLABS ", "windows", "elevenlabs")] + [InlineData(null, "elevenlabs", "elevenlabs")] + [InlineData(" ", "elevenlabs", "elevenlabs")] + [InlineData(null, "", "windows")] + [InlineData(null, " ", "windows")] + public void ResolveProvider_NormalizesRequestedAndConfiguredValues( + string? requestedProvider, + string? configuredProvider, + string expected) + { + Assert.Equal(expected, TtsCapability.ResolveProvider(requestedProvider, configuredProvider)); + } + + [Fact] + public async Task Speak_ReturnsError_WhenTextMissing() + { + var cap = new TtsCapability(NullLogger.Instance); + var handlerCalled = false; + cap.SpeakRequested += (_, _) => + { + handlerCalled = true; + return Task.FromResult(new TtsSpeakResult()); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "tts-missing", + Command = "tts.speak", + Args = Parse("""{"text":" "}""") + }); + + Assert.False(res.Ok); + Assert.False(handlerCalled); + Assert.Contains("text", res.Error, StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task Speak_ReturnsError_WhenNoHandler() + { + var cap = new TtsCapability(NullLogger.Instance); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "tts-unavailable", + Command = "tts.speak", + Args = Parse("""{"text":"hello"}""") + }); + + Assert.False(res.Ok); + Assert.Contains("not available", res.Error, StringComparison.OrdinalIgnoreCase); + } + + [Fact] + public async Task Speak_ReturnsError_WhenTextTooLong() + { + var cap = new TtsCapability(NullLogger.Instance); + var handlerCalled = false; + cap.SpeakRequested += (_, _) => + { + handlerCalled = true; + return Task.FromResult(new TtsSpeakResult()); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "tts-too-long", + Command = "tts.speak", + Args = Parse(JsonSerializer.Serialize(new + { + text = new string('x', TtsCapability.MaxTextLength + 1) + })) + }); + + Assert.False(res.Ok); + Assert.False(handlerCalled); + Assert.Contains(TtsCapability.MaxTextLength.ToString(), res.Error); + } + + [Fact] + public async Task Speak_RaisesEvent_WithArgs() + { + var cap = new TtsCapability(NullLogger.Instance); + TtsSpeakArgs? received = null; + cap.SpeakRequested += (args, _) => + { + received = args; + return Task.FromResult(new TtsSpeakResult + { + Provider = TtsCapability.ElevenLabsProvider, + ContentType = "audio/mpeg", + DurationMs = 123 + }); + }; + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "tts-args", + Command = "tts.speak", + Args = Parse("""{"text":" hello world ","provider":"elevenlabs","voiceId":"voice-1","model":"model-1","interrupt":true}""") + }); + + Assert.True(res.Ok); + Assert.NotNull(received); + Assert.Equal("hello world", received!.Text); + Assert.Equal("elevenlabs", received.Provider); + Assert.Equal("voice-1", received.VoiceId); + Assert.Equal("model-1", received.Model); + Assert.True(received.Interrupt); + + var json = JsonSerializer.Serialize(res.Payload); + using var doc = JsonDocument.Parse(json); + var root = doc.RootElement; + Assert.True(root.GetProperty("spoken").GetBoolean()); + Assert.Equal("elevenlabs", root.GetProperty("provider").GetString()); + Assert.Equal("audio/mpeg", root.GetProperty("contentType").GetString()); + Assert.Equal(123, root.GetProperty("durationMs").GetInt32()); + } + + [Fact] + public async Task Speak_ReturnsError_WhenHandlerThrows() + { + var cap = new TtsCapability(NullLogger.Instance); + cap.SpeakRequested += (_, _) => throw new InvalidOperationException("Audio device unavailable"); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "tts-fail", + Command = "tts.speak", + Args = Parse("""{"text":"hello"}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Audio device unavailable", res.Error); + } + + [Fact] + public async Task UnknownCommand_ReturnsError() + { + var cap = new TtsCapability(NullLogger.Instance); + + var res = await cap.ExecuteAsync(new NodeInvokeRequest + { + Id = "tts-unknown", + Command = "tts.stop", + Args = Parse("""{}""") + }); + + Assert.False(res.Ok); + Assert.Contains("Unknown command", res.Error); + } +} + public class LocationCapabilityTests { private static JsonElement Parse(string json) diff --git a/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs b/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs index 164957d..043b821 100644 --- a/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs +++ b/tests/OpenClaw.Shared.Tests/McpToolBridgeTests.cs @@ -78,6 +78,7 @@ public class McpToolBridgeTests new FakeCapability("canvas", "canvas.a2ui.push"), new FakeCapability("screen", "screen.snapshot"), new FakeCapability("camera", "camera.snap"), + new FakeCapability("tts", "tts.speak"), new FakeCapability("custom", "custom.unknown"), }; var bridge = CreateBridge(caps); @@ -95,6 +96,7 @@ public class McpToolBridgeTests Assert.Contains("A2UI v0.8", byName["canvas.a2ui.push"]); Assert.Contains("screenshot", byName["screen.snapshot"]); Assert.Contains("camera", byName["camera.snap"], System.StringComparison.OrdinalIgnoreCase); + Assert.Contains("Speak text", byName["tts.speak"]); // Unknown commands keep the generic fallback so newly-added capabilities still render. Assert.Equal("custom capability: custom.unknown", byName["custom.unknown"]); diff --git a/tests/OpenClaw.Shared.Tests/ModelsTests.cs b/tests/OpenClaw.Shared.Tests/ModelsTests.cs index c6b3d39..b046f77 100644 --- a/tests/OpenClaw.Shared.Tests/ModelsTests.cs +++ b/tests/OpenClaw.Shared.Tests/ModelsTests.cs @@ -937,6 +937,8 @@ public class CommandCenterModelTests Assert.Contains("device.info", CommandCenterCommandGroups.SafeCompanionCommands); Assert.Contains("device.status", CommandCenterCommandGroups.SafeCompanionCommands); Assert.Contains("screen.record", CommandCenterCommandGroups.DangerousCommands); + Assert.Contains("tts.speak", CommandCenterCommandGroups.DangerousCommands); + Assert.DoesNotContain("tts.speak", CommandCenterCommandGroups.MacNodeParityCommands); Assert.Contains("browser.proxy", CommandCenterCommandGroups.BrowserCommands); Assert.Contains("browser.proxy", CommandCenterCommandGroups.MacNodeParityCommands); } diff --git a/tests/OpenClaw.Tray.Tests/ElevenLabsTextToSpeechClientTests.cs b/tests/OpenClaw.Tray.Tests/ElevenLabsTextToSpeechClientTests.cs new file mode 100644 index 0000000..bf0aed7 --- /dev/null +++ b/tests/OpenClaw.Tray.Tests/ElevenLabsTextToSpeechClientTests.cs @@ -0,0 +1,137 @@ +using System.Net; +using System.Net.Http; +using System.Text.Json; +using OpenClawTray.Services; + +namespace OpenClaw.Tray.Tests; + +public class ElevenLabsTextToSpeechClientTests +{ + [Fact] + public async Task SynthesizeAsync_PostsExpectedRequest() + { + var handler = new CapturingHandler(new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new ByteArrayContent([1, 2, 3]) + { + Headers = { ContentType = new("audio/mpeg") } + } + }); + var client = new ElevenLabsTextToSpeechClient(handler, "https://example.test"); + + var result = await client.SynthesizeAsync(new ElevenLabsSynthesisRequest + { + ApiKey = "key-123", + VoiceId = "voice/with slash", + Text = "Hello", + ModelId = "model-1" + }); + + Assert.Equal([1, 2, 3], result.AudioBytes); + Assert.Equal("audio/mpeg", result.ContentType); + Assert.NotNull(handler.LastRequest); + Assert.Equal(HttpMethod.Post, handler.LastRequest!.Method); + Assert.Equal("https://example.test/v1/text-to-speech/voice%2Fwith%20slash", handler.LastRequest.RequestUri!.AbsoluteUri); + Assert.True(handler.LastRequest.Headers.TryGetValues("xi-api-key", out var keyValues)); + Assert.Contains("key-123", keyValues); + + using var doc = JsonDocument.Parse(handler.LastBody!); + Assert.Equal("Hello", doc.RootElement.GetProperty("text").GetString()); + Assert.Equal("model-1", doc.RootElement.GetProperty("model_id").GetString()); + } + + [Fact] + public async Task SynthesizeAsync_ReturnsErrorMessageForProviderFailure() + { + var handler = new CapturingHandler(new HttpResponseMessage(HttpStatusCode.Unauthorized) + { + Content = new StringContent("""{"detail":"bad key"}""") + }); + var client = new ElevenLabsTextToSpeechClient(handler, "https://example.test"); + + var ex = await Assert.ThrowsAsync(() => client.SynthesizeAsync(new ElevenLabsSynthesisRequest + { + ApiKey = "bad", + VoiceId = "voice-1", + Text = "Hello" + })); + + Assert.Contains("401", ex.Message); + Assert.Contains("bad key", ex.Message); + } + + [Fact] + public async Task SynthesizeAsync_ValidatesRequiredFieldsBeforeNetwork() + { + var handler = new CapturingHandler(new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new ByteArrayContent([1]) + }); + var client = new ElevenLabsTextToSpeechClient(handler, "https://example.test"); + + await Assert.ThrowsAsync(() => client.SynthesizeAsync(new ElevenLabsSynthesisRequest + { + ApiKey = "", + VoiceId = "voice-1", + Text = "Hello" + })); + Assert.Null(handler.LastRequest); + } + + [Fact] + public async Task SynthesizeAsync_RejectsOversizedTextBeforeNetwork() + { + var handler = new CapturingHandler(new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new ByteArrayContent([1]) + }); + var client = new ElevenLabsTextToSpeechClient(handler, "https://example.test"); + + var ex = await Assert.ThrowsAsync(() => client.SynthesizeAsync(new ElevenLabsSynthesisRequest + { + ApiKey = "key-123", + VoiceId = "voice-1", + Text = new string('x', ElevenLabsTextToSpeechClient.MaxTextLength + 1) + })); + + Assert.Contains(ElevenLabsTextToSpeechClient.MaxTextLength.ToString(), ex.Message); + Assert.Null(handler.LastRequest); + } + + [Fact] + public void Constructor_SetsRequestTimeout() + { + var handler = new CapturingHandler(new HttpResponseMessage(HttpStatusCode.OK) + { + Content = new ByteArrayContent([1]) + }); + + using var client = new ElevenLabsTextToSpeechClient(handler, "https://example.test"); + + Assert.Equal(ElevenLabsTextToSpeechClient.DefaultTimeout, client.Timeout); + } + + private sealed class CapturingHandler : HttpMessageHandler + { + private readonly HttpResponseMessage _response; + + public HttpRequestMessage? LastRequest { get; private set; } + public string? LastBody { get; private set; } + + public CapturingHandler(HttpResponseMessage response) + { + _response = response; + } + + protected override async Task SendAsync( + HttpRequestMessage request, + CancellationToken cancellationToken) + { + LastRequest = request; + LastBody = request.Content is null + ? null + : await request.Content.ReadAsStringAsync(cancellationToken); + return _response; + } + } +} diff --git a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj index 2c01a75..134bd24 100644 --- a/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj +++ b/tests/OpenClaw.Tray.Tests/OpenClaw.Tray.Tests.csproj @@ -16,12 +16,15 @@ + + + @@ -29,7 +32,6 @@ - diff --git a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs index 231fba4..6181bb3 100644 --- a/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs +++ b/tests/OpenClaw.Tray.Tests/SettingsRoundTripTests.cs @@ -1,5 +1,6 @@ using System.Text.Json; using OpenClaw.Shared; +using OpenClawTray.Services; namespace OpenClaw.Tray.Tests; @@ -36,6 +37,11 @@ public class SettingsRoundTripTests NodeCameraEnabled = false, NodeLocationEnabled = true, NodeBrowserProxyEnabled = false, + NodeTtsEnabled = true, + TtsProvider = "elevenlabs", + TtsElevenLabsApiKey = "elevenlabs-key", + TtsElevenLabsModel = "eleven_multilingual_v2", + TtsElevenLabsVoiceId = "voice-123", HasSeenActivityStreamTip = true, SkippedUpdateTag = "v1.2.3", NotifyChatResponses = false, @@ -76,6 +82,11 @@ public class SettingsRoundTripTests Assert.Equal(original.NodeCameraEnabled, restored.NodeCameraEnabled); Assert.Equal(original.NodeLocationEnabled, restored.NodeLocationEnabled); Assert.Equal(original.NodeBrowserProxyEnabled, restored.NodeBrowserProxyEnabled); + Assert.Equal(original.NodeTtsEnabled, restored.NodeTtsEnabled); + Assert.Equal(original.TtsProvider, restored.TtsProvider); + Assert.Equal(original.TtsElevenLabsApiKey, restored.TtsElevenLabsApiKey); + Assert.Equal(original.TtsElevenLabsModel, restored.TtsElevenLabsModel); + Assert.Equal(original.TtsElevenLabsVoiceId, restored.TtsElevenLabsVoiceId); Assert.Equal(original.HasSeenActivityStreamTip, restored.HasSeenActivityStreamTip); Assert.Equal(original.SkippedUpdateTag, restored.SkippedUpdateTag); Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses); @@ -133,6 +144,11 @@ public class SettingsRoundTripTests Assert.True(settings.NodeCameraEnabled); Assert.True(settings.NodeLocationEnabled); Assert.True(settings.NodeBrowserProxyEnabled); + Assert.False(settings.NodeTtsEnabled); + Assert.Equal("windows", settings.TtsProvider); + Assert.Null(settings.TtsElevenLabsApiKey); + Assert.Null(settings.TtsElevenLabsModel); + Assert.Null(settings.TtsElevenLabsVoiceId); Assert.False(settings.HasSeenActivityStreamTip); Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.NotifyChatResponses); @@ -182,6 +198,11 @@ public class SettingsRoundTripTests Assert.True(settings.NodeCameraEnabled); Assert.True(settings.NodeLocationEnabled); Assert.True(settings.NodeBrowserProxyEnabled); + Assert.False(settings.NodeTtsEnabled); + Assert.Equal("windows", settings.TtsProvider); + Assert.Null(settings.TtsElevenLabsApiKey); + Assert.Null(settings.TtsElevenLabsModel); + Assert.Null(settings.TtsElevenLabsVoiceId); Assert.False(settings.HasSeenActivityStreamTip); Assert.Null(settings.SkippedUpdateTag); Assert.True(settings.GlobalHotkeyEnabled); @@ -194,6 +215,23 @@ public class SettingsRoundTripTests Assert.Null(SettingsData.FromJson("not json at all")); } + [Fact] + public void SettingsManager_ProtectsElevenLabsApiKeyForStorage() + { + var protectedValue = SettingsManager.ProtectSettingSecret("elevenlabs-key"); + + Assert.NotNull(protectedValue); + Assert.StartsWith("dpapi:", protectedValue); + Assert.DoesNotContain("elevenlabs-key", protectedValue); + Assert.Equal("elevenlabs-key", SettingsManager.UnprotectSettingSecret(protectedValue)); + } + + [Fact] + public void SettingsManager_ReturnsNullForCorruptedProtectedSecret() + { + Assert.Null(SettingsManager.UnprotectSettingSecret("dpapi:not-base64")); + } + [Theory] [InlineData(null)] [InlineData("")] diff --git a/tests/OpenClaw.Tray.Tests/TrayMenuWindowMarkupTests.cs b/tests/OpenClaw.Tray.Tests/TrayMenuWindowMarkupTests.cs index 498eeb3..2a6f473 100644 --- a/tests/OpenClaw.Tray.Tests/TrayMenuWindowMarkupTests.cs +++ b/tests/OpenClaw.Tray.Tests/TrayMenuWindowMarkupTests.cs @@ -346,6 +346,10 @@ public class TrayMenuWindowMarkupTests Assert.Contains(@"AutomationProperties.AutomationId=""NodeCameraToggle""", xaml); Assert.Contains(@"AutomationProperties.AutomationId=""NodeLocationToggle""", xaml); Assert.Contains(@"AutomationProperties.AutomationId=""NodeBrowserProxyToggle""", xaml); + Assert.Contains(@"AutomationProperties.AutomationId=""NodeTtsToggle""", xaml); + Assert.Contains(@"AutomationProperties.AutomationId=""TtsProviderComboBox""", xaml); + Assert.Contains(@"AutomationProperties.AutomationId=""TtsElevenLabsSettingsPanel""", xaml); + Assert.Contains(@"AutomationProperties.AutomationId=""TtsElevenLabsApiKeyPasswordBox""", xaml); } [Fact]