feat: add Windows node text-to-speech (#253)
Adds a focused Windows node text-to-speech capability as the first stable voice-support primitive.

- adds the shared `tts.speak` capability and MCP/gateway documentation
- wires Windows and ElevenLabs TTS behind opt-in tray settings
- protects the ElevenLabs API key with DPAPI
- adds shared and tray tests for capability behavior, settings, and ElevenLabs requests

This lands the focused TTS foundation from the broader Voice Mode discussion in #120 so remaining voice UX/STT/repeater work can build on top in smaller follow-up PRs.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
This commit is contained in:
parent
758c881f9d
commit
e0c40985a7
33
README.md
33
README.md
@ -179,6 +179,7 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t
|
||||
| **Camera** | `camera.list`, `camera.snap`, `camera.clip` | Enumerate cameras and capture still photos or short video clips |
|
||||
| **Location** | `location.get` | Return Windows geolocation when permission is available |
|
||||
| **Device** | `device.info`, `device.status` | Return Windows host/app metadata and lightweight status |
|
||||
| **Text-to-speech** | `tts.speak` | Speak text aloud through Windows speech synthesis, or ElevenLabs when configured |
|
||||
|
||||
#### Node Setup
|
||||
|
||||
@ -205,23 +206,24 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t
|
||||
"canvas.hide",
|
||||
"canvas.navigate",
|
||||
"canvas.eval",
|
||||
"canvas.snapshot",
|
||||
"canvas.a2ui.push",
|
||||
"canvas.a2ui.pushJSONL",
|
||||
"canvas.a2ui.reset",
|
||||
"screen.snapshot",
|
||||
"camera.list",
|
||||
"camera.snap",
|
||||
"camera.clip",
|
||||
"location.get",
|
||||
"device.info",
|
||||
"device.status"
|
||||
"canvas.snapshot",
|
||||
"canvas.a2ui.push",
|
||||
"canvas.a2ui.pushJSONL",
|
||||
"canvas.a2ui.reset",
|
||||
"screen.snapshot",
|
||||
"camera.list",
|
||||
"camera.snap",
|
||||
"camera.clip",
|
||||
"location.get",
|
||||
"device.info",
|
||||
"device.status",
|
||||
"tts.speak"
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
> ⚠️ **Important**: The gateway has a server-side allowlist. Commands must be listed explicitly - wildcards like `canvas.*` don't work! Privacy-sensitive commands such as `screen.record` should only be added to `allowCommands` when you explicitly want to allow them.
|
||||
> ⚠️ **Important**: The gateway has a server-side allowlist. Commands must be listed explicitly - wildcards like `canvas.*` don't work! Privacy-sensitive commands such as `screen.record` and agent-driven audio playback via `tts.speak` should only be added to `allowCommands` when you explicitly want to allow them.
|
||||
|
||||
5. **Test it** from your Mac/gateway:
|
||||
```bash
|
||||
@ -249,6 +251,9 @@ When Node Mode is enabled in Settings, your Windows PC becomes a **node** that t
|
||||
# Take a photo (NV12/MediaCapture fallback)
|
||||
openclaw nodes invoke --node <id> --command camera.snap --params '{"deviceId":"<device-id>","format":"jpeg","quality":80}'
|
||||
|
||||
# Speak text aloud on the Windows node (requires TTS enabled in Settings and tts.speak allowed on the gateway)
|
||||
openclaw nodes invoke --node <id> --command tts.speak --params '{"text":"Hello from OpenClaw","provider":"windows"}'
|
||||
|
||||
# Execute a command on the Windows node
|
||||
openclaw nodes invoke --node <id> --command system.run --params '{"command":"Get-Process | Select -First 5","shell":"powershell","timeoutMs":10000}'
|
||||
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
|
||||
## Summary
|
||||
|
||||
The Windows tray app now ships a **local Model Context Protocol (MCP) server** alongside its existing OpenClaw gateway client. The same node capabilities the agent reaches over the OpenClaw gateway WebSocket — `system.run`, `screen.snapshot`, `canvas.*`, `camera.list`, `camera.snap`, `camera.clip`, `location.get`, `system.notify`, `system.execApprovals.*` — are advertised, on the same machine, as MCP tools over `http://127.0.0.1:8765/`.
|
||||
The Windows tray app now ships a **local Model Context Protocol (MCP) server** alongside its existing OpenClaw gateway client. The same node capabilities the agent reaches over the OpenClaw gateway WebSocket — `system.run`, `screen.snapshot`, `canvas.*`, `camera.list`, `camera.snap`, `camera.clip`, `location.get`, `tts.speak`, `system.notify`, `system.execApprovals.*` — are advertised, on the same machine, as MCP tools over `http://127.0.0.1:8765/`.
|
||||
|
||||
This means any local MCP client (Claude Desktop, Claude Code, Cursor, an MCP-aware CLI, a custom dev script) can reach into the running tray and drive Windows-native capabilities directly, without an OpenClaw gateway in the loop. The tray app can run in **MCP-only mode** with no gateway connection at all.
|
||||
|
||||
|
||||
@ -61,6 +61,7 @@ These features need the gateway to send `node.invoke` commands:
|
||||
| `location.get` | Get Windows location | Uses Windows location permission/settings |
|
||||
| `device.info` / `device.status` | Device metadata/status | Returns host/app/locale plus battery/storage/network/uptime payloads |
|
||||
| `browser.proxy` | Proxy browser-control host requests | Requires Browser proxy bridge enabled, a compatible browser-control host listening on gateway port + 2, and matching browser-control auth |
|
||||
| `tts.speak` | Speak text aloud | Requires Text-to-speech playback enabled in Settings; gateway mode also requires `tts.speak` in `gateway.nodes.allowCommands` |
|
||||
|
||||
## Capabilities Advertised
|
||||
|
||||
@ -72,6 +73,7 @@ When the node connects, it advertises these capabilities:
|
||||
- `location` - Windows.Devices.Geolocation
|
||||
- `device` - Host/app metadata and lightweight status
|
||||
- `browser` - Local `browser.proxy` bridge to a browser-control host on gateway port + 2, when enabled in Settings
|
||||
- `tts` - Windows speech synthesis or ElevenLabs playback, when enabled in Settings
|
||||
|
||||
## Security Features
|
||||
|
||||
|
||||
@ -79,6 +79,8 @@ Add ALL needed commands to `gateway.nodes.allowCommands` in `~/.openclaw/opencla
|
||||
// Device metadata/status
|
||||
"device.info",
|
||||
"device.status",
|
||||
// Text-to-speech playback (enable only when agent-driven audio is desired)
|
||||
"tts.speak",
|
||||
// System (already in Windows defaults, but listed for completeness)
|
||||
// "system.run",
|
||||
// "system.run.prepare",
|
||||
|
||||
108
src/OpenClaw.Shared/Capabilities/TtsCapability.cs
Normal file
108
src/OpenClaw.Shared/Capabilities/TtsCapability.cs
Normal file
@ -0,0 +1,108 @@
|
||||
using System;
|
||||
using System.Collections.Generic;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
|
||||
namespace OpenClaw.Shared.Capabilities;
|
||||
|
||||
/// <summary>
/// Node capability that handles the <c>tts.speak</c> command. It validates the request
/// arguments and hands the work to whichever handler is subscribed to
/// <see cref="SpeakRequested"/>; it performs no synthesis or playback itself.
/// </summary>
public sealed class TtsCapability : NodeCapabilityBase
{
    /// <summary>The only command this capability accepts.</summary>
    public const string SpeakCommand = "tts.speak";

    /// <summary>Provider name for Windows speech synthesis; also the fallback in <see cref="ResolveProvider"/>.</summary>
    public const string WindowsProvider = "windows";

    /// <summary>Provider name for ElevenLabs.</summary>
    public const string ElevenLabsProvider = "elevenlabs";

    /// <summary>Maximum number of characters accepted in the trimmed <c>text</c> argument.</summary>
    public const int MaxTextLength = 5000;

    private static readonly string[] _commands = [SpeakCommand];

    public override string Category => "tts";

    public override IReadOnlyList<string> Commands => _commands;

    /// <summary>
    /// Handler that performs the speech request. When nothing is subscribed,
    /// <c>tts.speak</c> fails with "TTS speak not available".
    /// </summary>
    public event Func<TtsSpeakArgs, CancellationToken, Task<TtsSpeakResult>>? SpeakRequested;

    public TtsCapability(IOpenClawLogger logger) : base(logger)
    {
    }

    /// <summary>
    /// Picks the provider to use: the requested one when it is not blank, otherwise the
    /// configured one, otherwise <see cref="WindowsProvider"/>.
    /// </summary>
    /// <param name="requestedProvider">Provider named in the request, if any.</param>
    /// <param name="configuredProvider">Provider from settings, if any.</param>
    /// <returns>The chosen provider, trimmed and lower-cased with the invariant culture.</returns>
    public static string ResolveProvider(string? requestedProvider, string? configuredProvider)
    {
        var provider = string.IsNullOrWhiteSpace(requestedProvider)
            ? configuredProvider
            : requestedProvider;

        return string.IsNullOrWhiteSpace(provider)
            ? WindowsProvider
            : provider.Trim().ToLowerInvariant();
    }

    /// <summary>Runs the request without a cancellation token.</summary>
    public override Task<NodeInvokeResponse> ExecuteAsync(NodeInvokeRequest request)
        => ExecuteAsync(request, CancellationToken.None);

    /// <summary>
    /// Validates a <c>tts.speak</c> request and forwards it to the <see cref="SpeakRequested"/> handler.
    /// </summary>
    /// <param name="request">Invoke request; reads the <c>text</c>, <c>provider</c>, <c>voiceId</c>, <c>model</c> and <c>interrupt</c> arguments.</param>
    /// <param name="cancellationToken">Token passed through to the handler.</param>
    /// <returns>
    /// A success response carrying <c>spoken</c>, <c>provider</c>, <c>contentType</c> and <c>durationMs</c>,
    /// or an error response for an unknown command, missing or over-long text, a missing handler,
    /// cancellation, or any exception thrown by the handler.
    /// </returns>
    public override async Task<NodeInvokeResponse> ExecuteAsync(
        NodeInvokeRequest request,
        CancellationToken cancellationToken)
    {
        if (!string.Equals(request.Command, SpeakCommand, StringComparison.Ordinal))
            return Error($"Unknown command: {request.Command}");

        var text = GetStringArg(request.Args, "text")?.Trim();
        if (string.IsNullOrWhiteSpace(text))
            return Error("Missing required text");
        if (text.Length > MaxTextLength)
            return Error($"TTS text exceeds {MaxTextLength} characters.");

        // Snapshot the delegate once: reading the event twice (check, then invoke) lets an
        // unsubscribe in between turn the invoke into a NullReferenceException.
        var handler = SpeakRequested;
        if (handler == null)
            return Error("TTS speak not available");

        var args = new TtsSpeakArgs
        {
            Text = text,
            Provider = NormalizeOptional(GetStringArg(request.Args, "provider")),
            VoiceId = NormalizeOptional(GetStringArg(request.Args, "voiceId")),
            Model = NormalizeOptional(GetStringArg(request.Args, "model")),
            Interrupt = GetBoolArg(request.Args, "interrupt")
        };

        // Only the length of the text is logged, never the text itself.
        Logger.Info($"tts.speak: provider={args.Provider ?? "(default)"}, chars={args.Text.Length}, interrupt={args.Interrupt}");

        try
        {
            var result = await handler(args, cancellationToken).ConfigureAwait(false);
            return Success(new
            {
                spoken = result.Spoken,
                provider = result.Provider,
                contentType = result.ContentType,
                durationMs = result.DurationMs
            });
        }
        catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested)
        {
            // Only cancellations requested through our token count as "canceled";
            // any other OperationCanceledException falls through to the generic failure path.
            return Error("Speak canceled");
        }
        catch (Exception ex)
        {
            Logger.Error("TTS speak failed", ex);
            return Error($"Speak failed: {ex.Message}");
        }
    }

    /// <summary>Returns the trimmed value, or <c>null</c> when it is null, empty, or whitespace.</summary>
    private static string? NormalizeOptional(string? value)
        => string.IsNullOrWhiteSpace(value) ? null : value.Trim();
}
|
||||
|
||||
/// <summary>Arguments of a single <c>tts.speak</c> request.</summary>
public sealed class TtsSpeakArgs
{
    /// <summary>Text to speak. Defaults to the empty string.</summary>
    public string Text { get; set; } = string.Empty;

    /// <summary>Provider requested for this call, or <c>null</c> when the request names none.</summary>
    public string? Provider { get; set; }

    /// <summary>Voice requested for this call, or <c>null</c> when the request names none.</summary>
    public string? VoiceId { get; set; }

    /// <summary>Model requested for this call, or <c>null</c> when the request names none.</summary>
    public string? Model { get; set; }

    /// <summary>The request's <c>interrupt</c> flag. Defaults to <c>false</c>.</summary>
    public bool Interrupt { get; set; }
}
|
||||
|
||||
/// <summary>Result of a speech request, as returned by the <c>SpeakRequested</c> handler.</summary>
public sealed class TtsSpeakResult
{
    /// <summary>Reported back as <c>spoken</c>. Defaults to <c>true</c>.</summary>
    public bool Spoken { get; set; } = true;

    /// <summary>Provider that handled the request. Defaults to <see cref="TtsCapability.WindowsProvider"/>.</summary>
    public string Provider { get; set; } = TtsCapability.WindowsProvider;

    /// <summary>Content type reported for the audio, if any.</summary>
    public string? ContentType { get; set; }

    /// <summary>Duration in milliseconds reported for the request, if any.</summary>
    public int? DurationMs { get; set; }
}
|
||||
@ -235,6 +235,10 @@ public class McpToolBridge
|
||||
"Capture a still photo from a camera. Args: deviceId (string, optional — defaults to system default camera), format ('jpeg'|'png', default 'jpeg'), maxWidth (int, default 1280), quality (int 1-100, default 80). Returns { format, width, height, base64 }.",
|
||||
["camera.clip"] =
|
||||
"Record a short clip from a camera. Args: deviceId (string, optional), durationMs (int, required, max 60000), format ('mp4'|'webm', default 'mp4'), maxWidth (int, default 1280). Returns { format, durationMs, base64 }.",
|
||||
|
||||
// tts.*
|
||||
["tts.speak"] =
|
||||
"Speak text aloud on the Windows node. Args: text (string, required), provider ('windows'|'elevenlabs', optional), voiceId (string, optional), model (string, optional), interrupt (bool, default false). Returns { spoken, provider, contentType, durationMs }.",
|
||||
};
|
||||
|
||||
private async Task<object> HandleToolsCallAsync(JsonElement parameters, CancellationToken cancellationToken)
|
||||
|
||||
@ -1023,7 +1023,8 @@ public static class CommandCenterCommandGroups
|
||||
[
|
||||
"camera.snap",
|
||||
"camera.clip",
|
||||
"screen.record"
|
||||
"screen.record",
|
||||
"tts.speak"
|
||||
];
|
||||
|
||||
public static readonly FrozenSet<string> DangerousCommandSet =
|
||||
@ -1046,7 +1047,9 @@ public static class CommandCenterCommandGroups
|
||||
public static readonly string[] MacNodeParityCommands =
|
||||
[
|
||||
.. SafeCompanionCommands,
|
||||
.. DangerousCommands,
|
||||
"camera.snap",
|
||||
"camera.clip",
|
||||
"screen.record",
|
||||
"system.notify",
|
||||
"system.run",
|
||||
"system.which",
|
||||
|
||||
@ -34,6 +34,11 @@ public class SettingsData
|
||||
public bool NodeCameraEnabled { get; set; } = true;
|
||||
public bool NodeLocationEnabled { get; set; } = true;
|
||||
public bool NodeBrowserProxyEnabled { get; set; } = true;
|
||||
public bool NodeTtsEnabled { get; set; } = false;
|
||||
public string TtsProvider { get; set; } = "windows";
|
||||
public string? TtsElevenLabsApiKey { get; set; }
|
||||
public string? TtsElevenLabsModel { get; set; }
|
||||
public string? TtsElevenLabsVoiceId { get; set; }
|
||||
/// <summary>Run the local MCP HTTP server. Independent of EnableNodeMode.</summary>
|
||||
public bool EnableMcpServer { get; set; } = false;
|
||||
/// <summary>
|
||||
|
||||
@ -57,6 +57,7 @@
|
||||
<PackageReference Include="WinUIEx" Version="2.9.0" />
|
||||
<PackageReference Include="Microsoft.Toolkit.Uwp.Notifications" Version="7.1.3" />
|
||||
<PackageReference Include="System.Drawing.Common" Version="10.0.7" />
|
||||
<PackageReference Include="System.Security.Cryptography.ProtectedData" Version="10.0.0" />
|
||||
<PackageReference Include="Updatum" Version="1.3.4" />
|
||||
<PackageReference Include="ZXing.Net" Version="0.16.10" />
|
||||
</ItemGroup>
|
||||
|
||||
@ -69,6 +69,8 @@ public sealed class NodeService : IDisposable
|
||||
private LocationCapability? _locationCapability;
|
||||
private DeviceCapability? _deviceCapability;
|
||||
private BrowserProxyCapability? _browserProxyCapability;
|
||||
private TtsCapability? _ttsCapability;
|
||||
private TextToSpeechService? _textToSpeechService;
|
||||
private readonly string _dataPath;
|
||||
private string? _token;
|
||||
|
||||
@ -282,6 +284,14 @@ public sealed class NodeService : IDisposable
|
||||
Register(_locationCapability);
|
||||
}
|
||||
|
||||
if (_settings?.NodeTtsEnabled == true)
|
||||
{
|
||||
_textToSpeechService ??= new TextToSpeechService(_logger, _settings);
|
||||
_ttsCapability = new TtsCapability(_logger);
|
||||
_ttsCapability.SpeakRequested += OnTtsSpeakAsync;
|
||||
Register(_ttsCapability);
|
||||
}
|
||||
|
||||
// Device metadata/status capability
|
||||
_deviceCapability = new DeviceCapability(_logger);
|
||||
Register(_deviceCapability);
|
||||
@ -447,6 +457,8 @@ public sealed class NodeService : IDisposable
|
||||
disabled.AddRange(CommandCenterCommandGroups.SafeCompanionCommands.Where(command => command.StartsWith("location.", StringComparison.OrdinalIgnoreCase)));
|
||||
if (_settings?.NodeBrowserProxyEnabled == false)
|
||||
disabled.Add("browser.proxy");
|
||||
if (_settings?.NodeTtsEnabled != true)
|
||||
disabled.AddRange(CommandCenterCommandGroups.DangerousCommands.Where(command => command.StartsWith("tts.", StringComparison.OrdinalIgnoreCase)));
|
||||
return disabled;
|
||||
}
|
||||
|
||||
@ -1265,6 +1277,14 @@ public sealed class NodeService : IDisposable
|
||||
TimestampMs = position.Coordinate.Timestamp.ToUnixTimeMilliseconds()
|
||||
};
|
||||
}
|
||||
|
||||
/// <summary>
/// Forwards a speech request to the text-to-speech service.
/// </summary>
/// <exception cref="InvalidOperationException">The text-to-speech service has not been created.</exception>
private Task<TtsSpeakResult> OnTtsSpeakAsync(TtsSpeakArgs args, CancellationToken cancellationToken)
{
    var speechService = _textToSpeechService
        ?? throw new InvalidOperationException("Text-to-speech service not available");

    return speechService.SpeakAsync(args, cancellationToken);
}
|
||||
|
||||
#endregion
|
||||
|
||||
@ -1278,6 +1298,7 @@ public sealed class NodeService : IDisposable
|
||||
|
||||
try { _cameraCaptureService?.Dispose(); } catch { /* ignore */ }
|
||||
try { _screenRecordingService?.Dispose(); } catch { /* ignore */ }
|
||||
try { _textToSpeechService?.Dispose(); } catch { /* ignore */ }
|
||||
// MediaResolver owns SocketsHttpHandler + HttpClient (disposeHandler:true);
|
||||
// without disposal the connection pool survives node teardown/recreate.
|
||||
try { _mediaResolver?.Dispose(); } catch { /* ignore */ }
|
||||
|
||||
@ -1,5 +1,7 @@
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Security.Cryptography;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using OpenClaw.Shared;
|
||||
|
||||
@ -17,6 +19,8 @@ public class SettingsManager
|
||||
|
||||
private readonly string _settingsDirectory;
|
||||
private readonly string _settingsFilePath;
|
||||
private const string ProtectedSecretPrefix = "dpapi:";
|
||||
private static readonly byte[] ProtectedSecretEntropy = Encoding.UTF8.GetBytes("OpenClawTray.Settings.v1");
|
||||
|
||||
public static string SettingsDirectoryPath => DefaultSettingsDirectory;
|
||||
public static string SettingsPath => DefaultSettingsFilePath;
|
||||
@ -61,6 +65,11 @@ public class SettingsManager
|
||||
public bool NodeCameraEnabled { get; set; } = true;
|
||||
public bool NodeLocationEnabled { get; set; } = true;
|
||||
public bool NodeBrowserProxyEnabled { get; set; } = true;
|
||||
public bool NodeTtsEnabled { get; set; } = false;
|
||||
public string TtsProvider { get; set; } = "windows";
|
||||
public string TtsElevenLabsApiKey { get; set; } = "";
|
||||
public string TtsElevenLabsModel { get; set; } = "";
|
||||
public string TtsElevenLabsVoiceId { get; set; } = "";
|
||||
// Local MCP HTTP server (independent of EnableNodeMode)
|
||||
public bool EnableMcpServer { get; set; } = false;
|
||||
/// <summary>
|
||||
@ -132,6 +141,11 @@ public class SettingsManager
|
||||
NodeCameraEnabled = loaded.NodeCameraEnabled;
|
||||
NodeLocationEnabled = loaded.NodeLocationEnabled;
|
||||
NodeBrowserProxyEnabled = loaded.NodeBrowserProxyEnabled;
|
||||
NodeTtsEnabled = loaded.NodeTtsEnabled;
|
||||
TtsProvider = string.IsNullOrWhiteSpace(loaded.TtsProvider) ? TtsProvider : loaded.TtsProvider;
|
||||
TtsElevenLabsApiKey = UnprotectSettingSecret(loaded.TtsElevenLabsApiKey) ?? TtsElevenLabsApiKey;
|
||||
TtsElevenLabsModel = loaded.TtsElevenLabsModel ?? TtsElevenLabsModel;
|
||||
TtsElevenLabsVoiceId = loaded.TtsElevenLabsVoiceId ?? TtsElevenLabsVoiceId;
|
||||
EnableMcpServer = loaded.EnableMcpServer;
|
||||
A2UIImageHosts = loaded.A2UIImageHosts ?? new List<string>();
|
||||
// Legacy McpOnlyMode migration:
|
||||
@ -200,6 +214,11 @@ public class SettingsManager
|
||||
NodeCameraEnabled = NodeCameraEnabled,
|
||||
NodeLocationEnabled = NodeLocationEnabled,
|
||||
NodeBrowserProxyEnabled = NodeBrowserProxyEnabled,
|
||||
NodeTtsEnabled = NodeTtsEnabled,
|
||||
TtsProvider = TtsProvider,
|
||||
TtsElevenLabsApiKey = ProtectSettingSecret(TtsElevenLabsApiKey),
|
||||
TtsElevenLabsModel = string.IsNullOrWhiteSpace(TtsElevenLabsModel) ? null : TtsElevenLabsModel,
|
||||
TtsElevenLabsVoiceId = string.IsNullOrWhiteSpace(TtsElevenLabsVoiceId) ? null : TtsElevenLabsVoiceId,
|
||||
EnableMcpServer = EnableMcpServer,
|
||||
A2UIImageHosts = A2UIImageHosts.Count == 0 ? null : new List<string>(A2UIImageHosts),
|
||||
// McpOnlyMode is legacy — never written; remains null in serialized output.
|
||||
@ -221,6 +240,60 @@ public class SettingsManager
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Encrypts a settings secret with DPAPI (current-user scope) for storage.
/// </summary>
/// <param name="value">Plain-text secret.</param>
/// <returns>
/// <c>null</c> when <paramref name="value"/> is null, empty, or whitespace; otherwise the
/// protected-secret prefix followed by the Base64 of the encrypted UTF-8 bytes.
/// </returns>
/// <exception cref="PlatformNotSupportedException">The process is not running on Windows.</exception>
internal static string? ProtectSettingSecret(string? value)
{
    if (string.IsNullOrWhiteSpace(value))
    {
        return null;
    }

    if (!OperatingSystem.IsWindows())
    {
        throw new PlatformNotSupportedException("Windows Data Protection API is required for protected settings secrets.");
    }

    byte[] cipher = ProtectedData.Protect(
        Encoding.UTF8.GetBytes(value),
        ProtectedSecretEntropy,
        DataProtectionScope.CurrentUser);

    return $"{ProtectedSecretPrefix}{Convert.ToBase64String(cipher)}";
}
|
||||
|
||||
/// <summary>
/// Reverses <c>ProtectSettingSecret</c>: decrypts a stored secret that carries the
/// protected-secret prefix.
/// </summary>
/// <param name="value">Stored value, either protected (prefixed) or not.</param>
/// <returns>
/// <paramref name="value"/> unchanged when it is blank or does not start with the prefix;
/// the decrypted text when decryption succeeds; <c>null</c> (after logging a warning) when the
/// platform is not Windows or the payload cannot be decoded or decrypted.
/// </returns>
internal static string? UnprotectSettingSecret(string? value)
{
    // Blank and un-prefixed values are returned exactly as stored.
    if (string.IsNullOrWhiteSpace(value) ||
        !value.StartsWith(ProtectedSecretPrefix, StringComparison.Ordinal))
    {
        return value;
    }

    if (!OperatingSystem.IsWindows())
    {
        Logger.Warn("Failed to decrypt protected settings secret: Windows Data Protection API is unavailable.");
        return null;
    }

    try
    {
        byte[] cipher = Convert.FromBase64String(value[ProtectedSecretPrefix.Length..]);
        byte[] plain = ProtectedData.Unprotect(cipher, ProtectedSecretEntropy, DataProtectionScope.CurrentUser);
        return Encoding.UTF8.GetString(plain);
    }
    catch (FormatException ex)
    {
        // Invalid Base64 is reported as a decode failure, not a decrypt failure.
        Logger.Warn($"Failed to decode protected settings secret: {ex.Message}");
        return null;
    }
    catch (Exception ex) when (ex is CryptographicException or NotSupportedException or ArgumentException)
    {
        Logger.Warn($"Failed to decrypt protected settings secret: {ex.Message}");
        return null;
    }
}
|
||||
|
||||
public string GetEffectiveGatewayUrl()
|
||||
{
|
||||
if (!UseSshTunnel)
|
||||
|
||||
@ -0,0 +1,116 @@
|
||||
using System;
|
||||
using System.Net;
|
||||
using System.Net.Http;
|
||||
using System.Net.Http.Headers;
|
||||
using System.Text;
|
||||
using System.Text.Json;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using OpenClaw.Shared.Capabilities;
|
||||
|
||||
namespace OpenClawTray.Services;
|
||||
|
||||
/// <summary>Input for one ElevenLabs text-to-speech synthesis call.</summary>
public sealed class ElevenLabsSynthesisRequest
{
    /// <summary>API key sent in the <c>xi-api-key</c> header. Defaults to the empty string.</summary>
    public string ApiKey { get; set; } = string.Empty;

    /// <summary>Voice identifier used in the request path. Defaults to the empty string.</summary>
    public string VoiceId { get; set; } = string.Empty;

    /// <summary>Text to synthesize. Defaults to the empty string.</summary>
    public string Text { get; set; } = string.Empty;

    /// <summary>Optional model identifier; <c>null</c> when no model is specified.</summary>
    public string? ModelId { get; set; }
}
|
||||
|
||||
/// <summary>Audio returned by an ElevenLabs synthesis call.</summary>
public sealed class ElevenLabsSynthesisResult
{
    /// <summary>Raw audio payload. Defaults to an empty array.</summary>
    public byte[] AudioBytes { get; set; } = Array.Empty<byte>();

    /// <summary>Media type of <see cref="AudioBytes"/>. Defaults to <c>audio/mpeg</c>.</summary>
    public string ContentType { get; set; } = "audio/mpeg";
}
|
||||
|
||||
/// <summary>
/// Minimal HTTP client for the ElevenLabs <c>v1/text-to-speech/{voiceId}</c> endpoint.
/// Every constructor creates the underlying <see cref="HttpClient"/>, which is disposed
/// with this instance.
/// </summary>
public sealed class ElevenLabsTextToSpeechClient : IDisposable
{
    private const string DefaultBaseUrl = "https://api.elevenlabs.io";

    /// <summary>Maximum text length accepted by <see cref="SynthesizeAsync"/>.</summary>
    public const int MaxTextLength = TtsCapability.MaxTextLength;

    internal static readonly TimeSpan DefaultTimeout = TimeSpan.FromSeconds(30);

    // Null properties are left out of the request body so an unset model is omitted
    // instead of being sent as an explicit JSON null.
    private static readonly JsonSerializerOptions RequestJsonOptions = new()
    {
        DefaultIgnoreCondition = System.Text.Json.Serialization.JsonIgnoreCondition.WhenWritingNull
    };

    private readonly HttpClient _httpClient;
    private readonly bool _ownsHttpClient;
    private readonly Uri _baseUri;

    internal TimeSpan Timeout => _httpClient.Timeout;

    /// <summary>Creates a client that talks to the default ElevenLabs base URL.</summary>
    public ElevenLabsTextToSpeechClient()
        : this(new HttpClient(), ownsHttpClient: true, baseUrl: DefaultBaseUrl)
    {
    }

    /// <summary>Creates a client that sends requests through <paramref name="handler"/>.</summary>
    /// <param name="handler">Message handler used by the internal <see cref="HttpClient"/>.</param>
    /// <param name="baseUrl">Absolute base URL; a trailing slash is optional.</param>
    public ElevenLabsTextToSpeechClient(HttpMessageHandler handler, string baseUrl = DefaultBaseUrl)
        : this(new HttpClient(handler), ownsHttpClient: true, baseUrl)
    {
    }

    private ElevenLabsTextToSpeechClient(HttpClient httpClient, bool ownsHttpClient, string baseUrl)
    {
        _httpClient = httpClient;
        _httpClient.Timeout = DefaultTimeout;
        _ownsHttpClient = ownsHttpClient;
        // Normalize to exactly one trailing slash so relative paths append to the base
        // instead of replacing its last segment.
        _baseUri = new Uri(baseUrl.TrimEnd('/') + "/", UriKind.Absolute);
    }

    /// <summary>
    /// Sends the text to ElevenLabs and returns the synthesized audio.
    /// </summary>
    /// <param name="request">API key, voice ID, text and optional model.</param>
    /// <param name="cancellationToken">Cancels the request and the download of the response body.</param>
    /// <returns>The audio bytes and the response media type (<c>audio/mpeg</c> when the response names none).</returns>
    /// <exception cref="InvalidOperationException">
    /// A required field is blank, the text exceeds <see cref="MaxTextLength"/>, the server
    /// returned a non-success status, or the response body is empty.
    /// </exception>
    /// <exception cref="TaskCanceledException">
    /// The whole operation (request plus body download) exceeded the client timeout;
    /// the inner exception is a <see cref="TimeoutException"/>.
    /// </exception>
    public async Task<ElevenLabsSynthesisResult> SynthesizeAsync(
        ElevenLabsSynthesisRequest request,
        CancellationToken cancellationToken = default)
    {
        if (string.IsNullOrWhiteSpace(request.ApiKey))
            throw new InvalidOperationException("ElevenLabs API key is required.");
        if (string.IsNullOrWhiteSpace(request.VoiceId))
            throw new InvalidOperationException("ElevenLabs voice ID is required.");
        if (string.IsNullOrWhiteSpace(request.Text))
            throw new InvalidOperationException("Text is required.");
        if (request.Text.Length > MaxTextLength)
            throw new InvalidOperationException($"ElevenLabs TTS text exceeds {MaxTextLength} characters.");

        // The voice ID can come from a request argument, so escape it before placing it in the path.
        var path = $"v1/text-to-speech/{Uri.EscapeDataString(request.VoiceId.Trim())}";
        using var httpRequest = new HttpRequestMessage(HttpMethod.Post, new Uri(_baseUri, path));
        httpRequest.Headers.Add("xi-api-key", request.ApiKey.Trim());
        httpRequest.Headers.Accept.Add(new MediaTypeWithQualityHeaderValue("audio/mpeg"));

        var body = JsonSerializer.Serialize(
            new
            {
                text = request.Text,
                model_id = string.IsNullOrWhiteSpace(request.ModelId) ? null : request.ModelId.Trim()
            },
            RequestJsonOptions);
        httpRequest.Content = new StringContent(body, Encoding.UTF8, "application/json");

        // With ResponseHeadersRead, HttpClient.Timeout stops applying once the headers arrive.
        // A single linked deadline keeps the body download bounded as well.
        using var deadline = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        deadline.CancelAfter(_httpClient.Timeout);

        try
        {
            using var response = await _httpClient.SendAsync(
                httpRequest,
                HttpCompletionOption.ResponseHeadersRead,
                deadline.Token).ConfigureAwait(false);
            var bytes = await response.Content.ReadAsByteArrayAsync(deadline.Token).ConfigureAwait(false);

            if (!response.IsSuccessStatusCode)
                throw new InvalidOperationException(BuildFailureMessage(response.StatusCode, bytes));
            if (bytes.Length == 0)
                throw new InvalidOperationException("ElevenLabs returned an empty audio response.");

            return new ElevenLabsSynthesisResult
            {
                AudioBytes = bytes,
                ContentType = response.Content.Headers.ContentType?.MediaType ?? "audio/mpeg"
            };
        }
        catch (OperationCanceledException ex)
            when (deadline.IsCancellationRequested && !cancellationToken.IsCancellationRequested)
        {
            // The deadline fired, not the caller: report it as a timeout in the same
            // shape HttpClient uses (TaskCanceledException wrapping TimeoutException).
            var message = $"ElevenLabs TTS request timed out after {_httpClient.Timeout.TotalSeconds:0} seconds.";
            throw new TaskCanceledException(message, new TimeoutException(message, ex));
        }
    }

    /// <summary>
    /// Builds the error text for a failed response: the status code plus at most the first
    /// 300 characters of the body decoded as UTF-8 (omitted when blank).
    /// </summary>
    internal static string BuildFailureMessage(HttpStatusCode statusCode, byte[] bodyBytes)
    {
        var body = Encoding.UTF8.GetString(bodyBytes);
        if (body.Length > 300)
            body = body[..300];
        body = body.Trim();

        return string.IsNullOrEmpty(body)
            ? $"ElevenLabs TTS failed with HTTP {(int)statusCode} ({statusCode})."
            : $"ElevenLabs TTS failed with HTTP {(int)statusCode} ({statusCode}): {body}";
    }

    /// <summary>Disposes the underlying <see cref="HttpClient"/> when this instance owns it.</summary>
    public void Dispose()
    {
        if (_ownsHttpClient)
            _httpClient.Dispose();
    }
}
|
||||
@ -0,0 +1,208 @@
|
||||
using System;
|
||||
using System.Diagnostics;
|
||||
using System.Linq;
|
||||
using System.Threading;
|
||||
using System.Threading.Tasks;
|
||||
using OpenClaw.Shared;
|
||||
using OpenClaw.Shared.Capabilities;
|
||||
using Windows.Media.Core;
|
||||
using Windows.Media.Playback;
|
||||
using Windows.Media.SpeechSynthesis;
|
||||
using Windows.Storage.Streams;
|
||||
|
||||
namespace OpenClawTray.Services;
|
||||
|
||||
public sealed class TextToSpeechService : IDisposable
|
||||
{
|
||||
private readonly IOpenClawLogger _logger;
|
||||
private readonly SettingsManager _settings;
|
||||
private readonly ElevenLabsTextToSpeechClient _elevenLabsClient;
|
||||
private readonly SemaphoreSlim _playbackGate = new(1, 1);
|
||||
private readonly object _activeLock = new();
|
||||
private MediaPlayer? _activePlayer;
|
||||
private TaskCompletionSource<bool>? _activeCompletion;
|
||||
|
||||
public TextToSpeechService(IOpenClawLogger logger, SettingsManager settings)
|
||||
: this(logger, settings, new ElevenLabsTextToSpeechClient())
|
||||
{
|
||||
}
|
||||
|
||||
internal TextToSpeechService(
|
||||
IOpenClawLogger logger,
|
||||
SettingsManager settings,
|
||||
ElevenLabsTextToSpeechClient elevenLabsClient)
|
||||
{
|
||||
_logger = logger;
|
||||
_settings = settings;
|
||||
_elevenLabsClient = elevenLabsClient;
|
||||
}
|
||||
|
||||
public async Task<TtsSpeakResult> SpeakAsync(TtsSpeakArgs args, CancellationToken cancellationToken = default)
|
||||
{
|
||||
var provider = TtsCapability.ResolveProvider(args.Provider, _settings.TtsProvider);
|
||||
var stopwatch = Stopwatch.StartNew();
|
||||
|
||||
if (string.Equals(provider, TtsCapability.WindowsProvider, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
await SpeakWithWindowsAsync(args, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
else if (string.Equals(provider, TtsCapability.ElevenLabsProvider, StringComparison.OrdinalIgnoreCase))
|
||||
{
|
||||
await SpeakWithElevenLabsAsync(args, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
else
|
||||
{
|
||||
throw new InvalidOperationException($"Unsupported TTS provider '{provider}'.");
|
||||
}
|
||||
|
||||
stopwatch.Stop();
|
||||
return new TtsSpeakResult
|
||||
{
|
||||
Provider = provider,
|
||||
ContentType = string.Equals(provider, TtsCapability.ElevenLabsProvider, StringComparison.OrdinalIgnoreCase)
|
||||
? "audio/mpeg"
|
||||
: "audio/wav",
|
||||
DurationMs = (int)Math.Min(stopwatch.ElapsedMilliseconds, int.MaxValue)
|
||||
};
|
||||
}
|
||||
|
||||
private async Task SpeakWithWindowsAsync(TtsSpeakArgs args, CancellationToken cancellationToken)
|
||||
{
|
||||
using var synthesizer = new SpeechSynthesizer();
|
||||
if (!string.IsNullOrWhiteSpace(args.VoiceId))
|
||||
{
|
||||
var requestedVoice = args.VoiceId.Trim();
|
||||
var voice = SpeechSynthesizer.AllVoices.FirstOrDefault(v =>
|
||||
string.Equals(v.Id, requestedVoice, StringComparison.OrdinalIgnoreCase) ||
|
||||
string.Equals(v.DisplayName, requestedVoice, StringComparison.OrdinalIgnoreCase));
|
||||
if (voice == null)
|
||||
throw new InvalidOperationException($"Windows TTS voice '{requestedVoice}' was not found.");
|
||||
|
||||
synthesizer.Voice = voice;
|
||||
}
|
||||
|
||||
using var stream = await synthesizer
|
||||
.SynthesizeTextToStreamAsync(args.Text)
|
||||
.AsTask(cancellationToken)
|
||||
.ConfigureAwait(false);
|
||||
await PlayStreamAsync(stream, stream.ContentType, args.Interrupt, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private async Task SpeakWithElevenLabsAsync(TtsSpeakArgs args, CancellationToken cancellationToken)
|
||||
{
|
||||
var apiKey = _settings.TtsElevenLabsApiKey;
|
||||
if (string.IsNullOrWhiteSpace(apiKey))
|
||||
throw new InvalidOperationException("ElevenLabs API key is required in Settings.");
|
||||
|
||||
var voiceId = string.IsNullOrWhiteSpace(args.VoiceId)
|
||||
? _settings.TtsElevenLabsVoiceId
|
||||
: args.VoiceId;
|
||||
if (string.IsNullOrWhiteSpace(voiceId))
|
||||
throw new InvalidOperationException("ElevenLabs voice ID is required in Settings or the tts.speak voiceId argument.");
|
||||
|
||||
var model = string.IsNullOrWhiteSpace(args.Model)
|
||||
? _settings.TtsElevenLabsModel
|
||||
: args.Model;
|
||||
|
||||
var audio = await _elevenLabsClient.SynthesizeAsync(new ElevenLabsSynthesisRequest
|
||||
{
|
||||
ApiKey = apiKey,
|
||||
VoiceId = voiceId,
|
||||
Text = args.Text,
|
||||
ModelId = model
|
||||
}, cancellationToken).ConfigureAwait(false);
|
||||
|
||||
using var stream = await CreateStreamAsync(audio.AudioBytes, cancellationToken).ConfigureAwait(false);
|
||||
await PlayStreamAsync(stream, audio.ContentType, args.Interrupt, cancellationToken).ConfigureAwait(false);
|
||||
}
|
||||
|
||||
private static async Task<InMemoryRandomAccessStream> CreateStreamAsync(byte[] bytes, CancellationToken cancellationToken)
|
||||
{
|
||||
var stream = new InMemoryRandomAccessStream();
|
||||
using var writer = new DataWriter(stream);
|
||||
writer.WriteBytes(bytes);
|
||||
await writer.StoreAsync().AsTask(cancellationToken).ConfigureAwait(false);
|
||||
await writer.FlushAsync().AsTask(cancellationToken).ConfigureAwait(false);
|
||||
writer.DetachStream();
|
||||
stream.Seek(0);
|
||||
return stream;
|
||||
}
|
||||
|
||||
/// <summary>
/// Plays <paramref name="stream"/> through a new <see cref="MediaPlayer"/> and completes
/// when playback ends, fails, is interrupted, or is cancelled. Playbacks are serialized
/// through <c>_playbackGate</c>.
/// </summary>
/// <param name="stream">Audio data handed to <c>MediaSource.CreateFromStream</c>.</param>
/// <param name="contentType">Content type passed along with the stream.</param>
/// <param name="interrupt">
/// When true, the currently active playback (if any) is faulted before waiting for the gate.
/// </param>
/// <param name="cancellationToken">Cancels both the wait for the gate and the playback itself.</param>
/// <exception cref="InvalidOperationException">Playback failed or was interrupted by a later request.</exception>
/// <exception cref="OperationCanceledException"><paramref name="cancellationToken"/> was cancelled.</exception>
private async Task PlayStreamAsync(
    IRandomAccessStream stream,
    string contentType,
    bool interrupt,
    CancellationToken cancellationToken)
{
    if (interrupt)
    {
        InterruptActivePlayback();
    }

    await _playbackGate.WaitAsync(cancellationToken).ConfigureAwait(false);

    MediaPlayer? player = null;
    var completion = new TaskCompletionSource<bool>(TaskCreationOptions.RunContinuationsAsynchronously);
    try
    {
        player = new MediaPlayer();
        player.MediaEnded += (_, _) => completion.TrySetResult(true);
        player.MediaFailed += (_, e) =>
            completion.TrySetException(new InvalidOperationException($"TTS playback failed: {e.ErrorMessage}"));
        player.Source = MediaSource.CreateFromStream(stream, contentType);

        // Publish the player and its completion so InterruptActivePlayback can fault this playback.
        lock (_activeLock)
        {
            _activePlayer = player;
            _activeCompletion = completion;
        }

        player.Play();

        using var cancellationRegistration = cancellationToken.Register(
            static state => ((TaskCompletionSource<bool>)state!).TrySetCanceled(),
            completion);
        await completion.Task.ConfigureAwait(false);
    }
    finally
    {
        try
        {
            lock (_activeLock)
            {
                // Only clear the shared slots if they still refer to this playback.
                if (ReferenceEquals(_activePlayer, player))
                {
                    _activePlayer = null;
                    _activeCompletion = null;
                }
            }

            if (player != null)
            {
                try
                {
                    player.Pause();
                    player.Source = null;
                }
                finally
                {
                    // Dispose even when stopping the player throws.
                    player.Dispose();
                }
            }
        }
        finally
        {
            // Always release the gate; a throwing cleanup must not block every later playback.
            _playbackGate.Release();
        }
    }
}
|
||||
|
||||
/// <summary>
/// Faults the completion source of the currently active playback, if there is one,
/// with an <see cref="InvalidOperationException"/>. Does nothing when no playback is active.
/// </summary>
private void InterruptActivePlayback()
{
    TaskCompletionSource<bool>? pending;

    // Snapshot under the lock; the completion itself is signalled outside it.
    lock (_activeLock)
    {
        pending = _activeCompletion;
    }

    if (pending == null)
    {
        return;
    }

    _logger.Info("Interrupting active TTS playback");
    pending.TrySetException(new InvalidOperationException("TTS playback was interrupted."));
}
|
||||
|
||||
/// <summary>
/// Interrupts any active playback and disposes the ElevenLabs client.
/// </summary>
public void Dispose()
{
    // _playbackGate is not disposed here: an interrupted playback may still
    // release it while the service is shutting down.
    InterruptActivePlayback();

    _elevenLabsClient.Dispose();
}
|
||||
}
|
||||
@ -200,6 +200,32 @@
|
||||
<ToggleSwitch x:Name="NodeCameraToggle" AutomationProperties.AutomationId="NodeCameraToggle" Header="Camera list, snapshot, and clips"/>
|
||||
<ToggleSwitch x:Name="NodeLocationToggle" AutomationProperties.AutomationId="NodeLocationToggle" Header="Location"/>
|
||||
<ToggleSwitch x:Name="NodeBrowserProxyToggle" AutomationProperties.AutomationId="NodeBrowserProxyToggle" Header="Browser proxy bridge" Toggled="OnNodeBrowserProxyToggled"/>
|
||||
<ToggleSwitch x:Name="NodeTtsToggle" AutomationProperties.AutomationId="NodeTtsToggle" Header="Text-to-speech playback"/>
|
||||
<TextBlock Text="Text-to-speech exposes tts.speak. Gateway use still requires tts.speak in gateway.nodes.allowCommands."
|
||||
Style="{StaticResource CaptionTextBlockStyle}"
|
||||
Foreground="{ThemeResource TextFillColorSecondaryBrush}"
|
||||
TextWrapping="Wrap"/>
|
||||
<ComboBox x:Name="TtsProviderComboBox"
|
||||
AutomationProperties.AutomationId="TtsProviderComboBox"
|
||||
Header="TTS provider"
|
||||
SelectionChanged="OnTtsProviderSelectionChanged">
|
||||
<ComboBoxItem Content="Windows built-in speech" Tag="windows"/>
|
||||
<ComboBoxItem Content="ElevenLabs" Tag="elevenlabs"/>
|
||||
</ComboBox>
|
||||
<StackPanel x:Name="TtsElevenLabsSettingsPanel"
|
||||
AutomationProperties.AutomationId="TtsElevenLabsSettingsPanel"
|
||||
Spacing="6">
|
||||
<PasswordBox x:Name="TtsElevenLabsApiKeyPasswordBox"
|
||||
AutomationProperties.AutomationId="TtsElevenLabsApiKeyPasswordBox"
|
||||
Header="ElevenLabs API key"/>
|
||||
<TextBox x:Name="TtsElevenLabsVoiceIdTextBox"
|
||||
AutomationProperties.AutomationId="TtsElevenLabsVoiceIdTextBox"
|
||||
Header="ElevenLabs voice ID"/>
|
||||
<TextBox x:Name="TtsElevenLabsModelTextBox"
|
||||
AutomationProperties.AutomationId="TtsElevenLabsModelTextBox"
|
||||
Header="ElevenLabs model"
|
||||
PlaceholderText="eleven_multilingual_v2"/>
|
||||
</StackPanel>
|
||||
</StackPanel>
|
||||
</Border>
|
||||
|
||||
|
||||
@ -95,6 +95,12 @@ public sealed partial class SettingsWindow : WindowEx
|
||||
NodeCameraToggle.IsOn = _settings.NodeCameraEnabled;
|
||||
NodeLocationToggle.IsOn = _settings.NodeLocationEnabled;
|
||||
NodeBrowserProxyToggle.IsOn = _settings.NodeBrowserProxyEnabled;
|
||||
NodeTtsToggle.IsOn = _settings.NodeTtsEnabled;
|
||||
SelectTtsProvider(_settings.TtsProvider);
|
||||
TtsElevenLabsApiKeyPasswordBox.Password = _settings.TtsElevenLabsApiKey;
|
||||
TtsElevenLabsVoiceIdTextBox.Text = _settings.TtsElevenLabsVoiceId;
|
||||
TtsElevenLabsModelTextBox.Text = _settings.TtsElevenLabsModel;
|
||||
UpdateTtsProviderUiState();
|
||||
UpdateSshTunnelPreviewText();
|
||||
McpServerToggle.IsOn = _settings.EnableMcpServer;
|
||||
McpUrlTextBox.Text = NodeService.McpServerUrl;
|
||||
@ -387,6 +393,11 @@ public sealed partial class SettingsWindow : WindowEx
|
||||
_settings.NodeCameraEnabled = NodeCameraToggle.IsOn;
|
||||
_settings.NodeLocationEnabled = NodeLocationToggle.IsOn;
|
||||
_settings.NodeBrowserProxyEnabled = NodeBrowserProxyToggle.IsOn;
|
||||
_settings.NodeTtsEnabled = NodeTtsToggle.IsOn;
|
||||
_settings.TtsProvider = GetSelectedTtsProvider();
|
||||
_settings.TtsElevenLabsApiKey = TtsElevenLabsApiKeyPasswordBox.Password.Trim();
|
||||
_settings.TtsElevenLabsVoiceId = TtsElevenLabsVoiceIdTextBox.Text.Trim();
|
||||
_settings.TtsElevenLabsModel = TtsElevenLabsModelTextBox.Text.Trim();
|
||||
_settings.EnableMcpServer = McpServerToggle.IsOn;
|
||||
|
||||
_settings.Save();
|
||||
@ -631,6 +642,48 @@ public sealed partial class SettingsWindow : WindowEx
|
||||
UpdateSshTunnelPreviewText();
|
||||
}
|
||||
|
||||
/// <summary>
/// Selection-changed handler for the TTS provider combo box; refreshes the
/// provider-specific settings UI.
/// </summary>
private void OnTtsProviderSelectionChanged(object sender, Microsoft.UI.Xaml.Controls.SelectionChangedEventArgs e)
    => UpdateTtsProviderUiState();
|
||||
|
||||
/// <summary>
/// Selects the combo box entry whose <c>Tag</c> matches <paramref name="provider"/>
/// (case-insensitive). Falls back to the first entry when nothing matches.
/// </summary>
/// <param name="provider">Provider identifier to select, e.g. "windows" or "elevenlabs".</param>
private void SelectTtsProvider(string provider)
{
    var entries = TtsProviderComboBox.Items;
    var targetIndex = 0; // default: first entry when no tag matches

    for (var index = 0; index < entries.Count; index++)
    {
        if (entries[index] is not Microsoft.UI.Xaml.Controls.ComboBoxItem candidate)
        {
            continue;
        }

        if (string.Equals(candidate.Tag?.ToString(), provider, StringComparison.OrdinalIgnoreCase))
        {
            targetIndex = index;
            break;
        }
    }

    TtsProviderComboBox.SelectedIndex = targetIndex;
}
|
||||
|
||||
/// <summary>
/// Returns the <c>Tag</c> of the selected provider entry as a string.
/// </summary>
/// <returns>
/// The selected entry's tag text, or "windows" when nothing is selected, the selection
/// is not a combo box item, or the tag is missing.
/// </returns>
private string GetSelectedTtsProvider()
{
    var selected = TtsProviderComboBox.SelectedItem as Microsoft.UI.Xaml.Controls.ComboBoxItem;
    return selected?.Tag?.ToString() ?? "windows";
}
|
||||
|
||||
/// <summary>
/// Shows the ElevenLabs settings panel only while the "elevenlabs" provider is selected.
/// </summary>
private void UpdateTtsProviderUiState()
{
    // NOTE(review): presumably the panel can still be null when SelectionChanged
    // fires during XAML load — confirm.
    if (TtsElevenLabsSettingsPanel != null)
    {
        var usesElevenLabs = string.Equals(
            GetSelectedTtsProvider(),
            "elevenlabs",
            StringComparison.OrdinalIgnoreCase);

        if (usesElevenLabs)
        {
            TtsElevenLabsSettingsPanel.Visibility = Visibility.Visible;
        }
        else
        {
            TtsElevenLabsSettingsPanel.Visibility = Visibility.Collapsed;
        }
    }
}
|
||||
|
||||
private void OnUseLocalGateway(object sender, RoutedEventArgs e)
|
||||
{
|
||||
UseSshTunnelToggle.IsOn = false;
|
||||
|
||||
@ -2093,6 +2093,178 @@ public class CameraCapabilityTests
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Tests for <see cref="TtsCapability"/>: command routing, provider resolution,
/// argument validation, event dispatch, and error reporting.
/// </summary>
public class TtsCapabilityTests
{
    private const string SpeakCommand = "tts.speak";
    private const string StopCommand = "tts.stop";

    /// <summary>Parses JSON text into a <see cref="JsonElement"/> that outlives its document.</summary>
    private static JsonElement Parse(string json)
    {
        using var document = JsonDocument.Parse(json);
        return document.RootElement.Clone();
    }

    /// <summary>Builds an invoke request with the given id, command, and JSON arguments.</summary>
    private static NodeInvokeRequest BuildRequest(string id, string command, string argsJson) => new()
    {
        Id = id,
        Command = command,
        Args = Parse(argsJson)
    };

    [Fact]
    public void CanHandle_TtsSpeak()
    {
        var capability = new TtsCapability(NullLogger.Instance);

        Assert.True(capability.CanHandle(SpeakCommand));
        Assert.False(capability.CanHandle(StopCommand));
        Assert.Equal("tts", capability.Category);
    }

    [Theory]
    [InlineData("elevenlabs", "windows", "elevenlabs")]
    [InlineData(" ELEVENLABS ", "windows", "elevenlabs")]
    [InlineData(null, "elevenlabs", "elevenlabs")]
    [InlineData(" ", "elevenlabs", "elevenlabs")]
    [InlineData(null, "", "windows")]
    [InlineData(null, " ", "windows")]
    public void ResolveProvider_NormalizesRequestedAndConfiguredValues(
        string? requestedProvider,
        string? configuredProvider,
        string expected)
    {
        var resolved = TtsCapability.ResolveProvider(requestedProvider, configuredProvider);

        Assert.Equal(expected, resolved);
    }

    [Fact]
    public async Task Speak_ReturnsError_WhenTextMissing()
    {
        var capability = new TtsCapability(NullLogger.Instance);
        var invoked = false;
        capability.SpeakRequested += (_, _) =>
        {
            invoked = true;
            return Task.FromResult(new TtsSpeakResult());
        };

        // Whitespace-only text counts as missing.
        var response = await capability.ExecuteAsync(
            BuildRequest("tts-missing", SpeakCommand, """{"text":" "}"""));

        Assert.False(response.Ok);
        Assert.False(invoked);
        Assert.Contains("text", response.Error, StringComparison.OrdinalIgnoreCase);
    }

    [Fact]
    public async Task Speak_ReturnsError_WhenNoHandler()
    {
        var capability = new TtsCapability(NullLogger.Instance);

        var response = await capability.ExecuteAsync(
            BuildRequest("tts-unavailable", SpeakCommand, """{"text":"hello"}"""));

        Assert.False(response.Ok);
        Assert.Contains("not available", response.Error, StringComparison.OrdinalIgnoreCase);
    }

    [Fact]
    public async Task Speak_ReturnsError_WhenTextTooLong()
    {
        var capability = new TtsCapability(NullLogger.Instance);
        var invoked = false;
        capability.SpeakRequested += (_, _) =>
        {
            invoked = true;
            return Task.FromResult(new TtsSpeakResult());
        };
        var oversizedArgs = JsonSerializer.Serialize(new
        {
            text = new string('x', TtsCapability.MaxTextLength + 1)
        });

        var response = await capability.ExecuteAsync(
            BuildRequest("tts-too-long", SpeakCommand, oversizedArgs));

        Assert.False(response.Ok);
        Assert.False(invoked);
        Assert.Contains(TtsCapability.MaxTextLength.ToString(), response.Error);
    }

    [Fact]
    public async Task Speak_RaisesEvent_WithArgs()
    {
        var capability = new TtsCapability(NullLogger.Instance);
        TtsSpeakArgs? captured = null;
        capability.SpeakRequested += (speakArgs, _) =>
        {
            captured = speakArgs;
            var result = new TtsSpeakResult
            {
                Provider = TtsCapability.ElevenLabsProvider,
                ContentType = "audio/mpeg",
                DurationMs = 123
            };
            return Task.FromResult(result);
        };

        var response = await capability.ExecuteAsync(BuildRequest(
            "tts-args",
            SpeakCommand,
            """{"text":" hello world ","provider":"elevenlabs","voiceId":"voice-1","model":"model-1","interrupt":true}"""));

        Assert.True(response.Ok);
        Assert.NotNull(captured);
        Assert.Equal("hello world", captured!.Text);
        Assert.Equal("elevenlabs", captured.Provider);
        Assert.Equal("voice-1", captured.VoiceId);
        Assert.Equal("model-1", captured.Model);
        Assert.True(captured.Interrupt);

        // Round-trip the payload through JSON to check the serialized property names.
        var payload = Parse(JsonSerializer.Serialize(response.Payload));
        Assert.True(payload.GetProperty("spoken").GetBoolean());
        Assert.Equal("elevenlabs", payload.GetProperty("provider").GetString());
        Assert.Equal("audio/mpeg", payload.GetProperty("contentType").GetString());
        Assert.Equal(123, payload.GetProperty("durationMs").GetInt32());
    }

    [Fact]
    public async Task Speak_ReturnsError_WhenHandlerThrows()
    {
        var capability = new TtsCapability(NullLogger.Instance);
        capability.SpeakRequested += (_, _) => throw new InvalidOperationException("Audio device unavailable");

        var response = await capability.ExecuteAsync(
            BuildRequest("tts-fail", SpeakCommand, """{"text":"hello"}"""));

        Assert.False(response.Ok);
        Assert.Contains("Audio device unavailable", response.Error);
    }

    [Fact]
    public async Task UnknownCommand_ReturnsError()
    {
        var capability = new TtsCapability(NullLogger.Instance);

        var response = await capability.ExecuteAsync(
            BuildRequest("tts-unknown", StopCommand, """{}"""));

        Assert.False(response.Ok);
        Assert.Contains("Unknown command", response.Error);
    }
}
|
||||
|
||||
public class LocationCapabilityTests
|
||||
{
|
||||
private static JsonElement Parse(string json)
|
||||
|
||||
@ -78,6 +78,7 @@ public class McpToolBridgeTests
|
||||
new FakeCapability("canvas", "canvas.a2ui.push"),
|
||||
new FakeCapability("screen", "screen.snapshot"),
|
||||
new FakeCapability("camera", "camera.snap"),
|
||||
new FakeCapability("tts", "tts.speak"),
|
||||
new FakeCapability("custom", "custom.unknown"),
|
||||
};
|
||||
var bridge = CreateBridge(caps);
|
||||
@ -95,6 +96,7 @@ public class McpToolBridgeTests
|
||||
Assert.Contains("A2UI v0.8", byName["canvas.a2ui.push"]);
|
||||
Assert.Contains("screenshot", byName["screen.snapshot"]);
|
||||
Assert.Contains("camera", byName["camera.snap"], System.StringComparison.OrdinalIgnoreCase);
|
||||
Assert.Contains("Speak text", byName["tts.speak"]);
|
||||
|
||||
// Unknown commands keep the generic fallback so newly-added capabilities still render.
|
||||
Assert.Equal("custom capability: custom.unknown", byName["custom.unknown"]);
|
||||
|
||||
@ -937,6 +937,8 @@ public class CommandCenterModelTests
|
||||
Assert.Contains("device.info", CommandCenterCommandGroups.SafeCompanionCommands);
|
||||
Assert.Contains("device.status", CommandCenterCommandGroups.SafeCompanionCommands);
|
||||
Assert.Contains("screen.record", CommandCenterCommandGroups.DangerousCommands);
|
||||
Assert.Contains("tts.speak", CommandCenterCommandGroups.DangerousCommands);
|
||||
Assert.DoesNotContain("tts.speak", CommandCenterCommandGroups.MacNodeParityCommands);
|
||||
Assert.Contains("browser.proxy", CommandCenterCommandGroups.BrowserCommands);
|
||||
Assert.Contains("browser.proxy", CommandCenterCommandGroups.MacNodeParityCommands);
|
||||
}
|
||||
|
||||
137
tests/OpenClaw.Tray.Tests/ElevenLabsTextToSpeechClientTests.cs
Normal file
137
tests/OpenClaw.Tray.Tests/ElevenLabsTextToSpeechClientTests.cs
Normal file
@ -0,0 +1,137 @@
|
||||
using System.Net;
|
||||
using System.Net.Http;
|
||||
using System.Text.Json;
|
||||
using OpenClawTray.Services;
|
||||
|
||||
namespace OpenClaw.Tray.Tests;
|
||||
|
||||
/// <summary>
/// Tests for <see cref="ElevenLabsTextToSpeechClient"/> using a stub
/// <see cref="HttpMessageHandler"/>, so no real network traffic is sent.
/// </summary>
public class ElevenLabsTextToSpeechClientTests
{
    private const string BaseUrl = "https://example.test";

    /// <summary>Creates a 200 OK response with a single-byte body.</summary>
    private static HttpResponseMessage SingleByteOkResponse() =>
        new(HttpStatusCode.OK)
        {
            Content = new ByteArrayContent([1])
        };

    [Fact]
    public async Task SynthesizeAsync_PostsExpectedRequest()
    {
        var handler = new CapturingHandler(new HttpResponseMessage(HttpStatusCode.OK)
        {
            Content = new ByteArrayContent([1, 2, 3])
            {
                Headers = { ContentType = new("audio/mpeg") }
            }
        });
        // Dispose the client like Constructor_SetsRequestTimeout does.
        using var client = new ElevenLabsTextToSpeechClient(handler, BaseUrl);

        var result = await client.SynthesizeAsync(new ElevenLabsSynthesisRequest
        {
            ApiKey = "key-123",
            VoiceId = "voice/with slash",
            Text = "Hello",
            ModelId = "model-1"
        });

        Assert.Equal([1, 2, 3], result.AudioBytes);
        Assert.Equal("audio/mpeg", result.ContentType);
        Assert.NotNull(handler.LastRequest);
        Assert.Equal(HttpMethod.Post, handler.LastRequest!.Method);
        // The voice id must be escaped so it stays a single path segment.
        Assert.Equal("https://example.test/v1/text-to-speech/voice%2Fwith%20slash", handler.LastRequest.RequestUri!.AbsoluteUri);
        Assert.True(handler.LastRequest.Headers.TryGetValues("xi-api-key", out var keyValues));
        Assert.Contains("key-123", keyValues);

        using var doc = JsonDocument.Parse(handler.LastBody!);
        Assert.Equal("Hello", doc.RootElement.GetProperty("text").GetString());
        Assert.Equal("model-1", doc.RootElement.GetProperty("model_id").GetString());
    }

    [Fact]
    public async Task SynthesizeAsync_ReturnsErrorMessageForProviderFailure()
    {
        var handler = new CapturingHandler(new HttpResponseMessage(HttpStatusCode.Unauthorized)
        {
            Content = new StringContent("""{"detail":"bad key"}""")
        });
        using var client = new ElevenLabsTextToSpeechClient(handler, BaseUrl);

        var ex = await Assert.ThrowsAsync<InvalidOperationException>(() => client.SynthesizeAsync(new ElevenLabsSynthesisRequest
        {
            ApiKey = "bad",
            VoiceId = "voice-1",
            Text = "Hello"
        }));

        Assert.Contains("401", ex.Message);
        Assert.Contains("bad key", ex.Message);
    }

    [Fact]
    public async Task SynthesizeAsync_ValidatesRequiredFieldsBeforeNetwork()
    {
        var handler = new CapturingHandler(SingleByteOkResponse());
        using var client = new ElevenLabsTextToSpeechClient(handler, BaseUrl);

        await Assert.ThrowsAsync<InvalidOperationException>(() => client.SynthesizeAsync(new ElevenLabsSynthesisRequest
        {
            ApiKey = "",
            VoiceId = "voice-1",
            Text = "Hello"
        }));

        // No request may reach the handler when validation fails.
        Assert.Null(handler.LastRequest);
    }

    [Fact]
    public async Task SynthesizeAsync_RejectsOversizedTextBeforeNetwork()
    {
        var handler = new CapturingHandler(SingleByteOkResponse());
        using var client = new ElevenLabsTextToSpeechClient(handler, BaseUrl);

        var ex = await Assert.ThrowsAsync<InvalidOperationException>(() => client.SynthesizeAsync(new ElevenLabsSynthesisRequest
        {
            ApiKey = "key-123",
            VoiceId = "voice-1",
            Text = new string('x', ElevenLabsTextToSpeechClient.MaxTextLength + 1)
        }));

        Assert.Contains(ElevenLabsTextToSpeechClient.MaxTextLength.ToString(), ex.Message);
        Assert.Null(handler.LastRequest);
    }

    [Fact]
    public void Constructor_SetsRequestTimeout()
    {
        var handler = new CapturingHandler(SingleByteOkResponse());

        using var client = new ElevenLabsTextToSpeechClient(handler, BaseUrl);

        Assert.Equal(ElevenLabsTextToSpeechClient.DefaultTimeout, client.Timeout);
    }

    /// <summary>
    /// Stub handler that records the last request and its body text, then returns
    /// the response supplied at construction.
    /// </summary>
    private sealed class CapturingHandler : HttpMessageHandler
    {
        private readonly HttpResponseMessage _response;

        public HttpRequestMessage? LastRequest { get; private set; }
        public string? LastBody { get; private set; }

        public CapturingHandler(HttpResponseMessage response)
        {
            _response = response;
        }

        protected override async Task<HttpResponseMessage> SendAsync(
            HttpRequestMessage request,
            CancellationToken cancellationToken)
        {
            LastRequest = request;
            // Read the body here, while the request content is still available.
            LastBody = request.Content is null
                ? null
                : await request.Content.ReadAsStringAsync(cancellationToken);
            return _response;
        }
    }
}
|
||||
@ -16,12 +16,15 @@
|
||||
|
||||
<ItemGroup>
|
||||
<ProjectReference Include="..\..\src\OpenClaw.Shared\OpenClaw.Shared.csproj" />
|
||||
<PackageReference Include="System.Security.Cryptography.ProtectedData" Version="10.0.0" />
|
||||
</ItemGroup>
|
||||
|
||||
<ItemGroup>
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Services\DeepLinkHandler.cs" Link="Services\DeepLinkHandler.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Services\Logger.cs" Link="Services\Logger.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Services\SettingsManager.cs" Link="Services\SettingsManager.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Services\ActivityStreamService.cs" Link="Services\ActivityStreamService.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Services\TextToSpeech\ElevenLabsTextToSpeechClient.cs" Link="Services\TextToSpeech\ElevenLabsTextToSpeechClient.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\A2UI\Actions\AgentMessageFormatter.cs" Link="A2UI\Actions\AgentMessageFormatter.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\A2UI\Actions\GatewayActionTransport.cs" Link="A2UI\Actions\GatewayActionTransport.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\A2UI\Actions\IActionSink.cs" Link="A2UI\Actions\IActionSink.cs" />
|
||||
@ -29,7 +32,6 @@
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\A2UI\Rendering\SecretRedactor.cs" Link="A2UI\Rendering\SecretRedactor.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Onboarding\Services\OnboardingState.cs" Link="Imported\OnboardingState.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Onboarding\Widgets\WizardStepModels.cs" Link="Imported\WizardStepModels.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Services\SettingsManager.cs" Link="Imported\SettingsManager.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Helpers\GatewayChatUrlBuilder.cs" Link="Imported\GatewayChatUrlBuilder.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Onboarding\Services\LocalGatewayApprover.cs" Link="Imported\LocalGatewayApprover.cs" />
|
||||
<Compile Include="..\..\src\OpenClaw.Tray.WinUI\Onboarding\Services\SetupCodeDecoder.cs" Link="Imported\SetupCodeDecoder.cs" />
|
||||
|
||||
@ -1,5 +1,6 @@
|
||||
using System.Text.Json;
|
||||
using OpenClaw.Shared;
|
||||
using OpenClawTray.Services;
|
||||
|
||||
namespace OpenClaw.Tray.Tests;
|
||||
|
||||
@ -36,6 +37,11 @@ public class SettingsRoundTripTests
|
||||
NodeCameraEnabled = false,
|
||||
NodeLocationEnabled = true,
|
||||
NodeBrowserProxyEnabled = false,
|
||||
NodeTtsEnabled = true,
|
||||
TtsProvider = "elevenlabs",
|
||||
TtsElevenLabsApiKey = "elevenlabs-key",
|
||||
TtsElevenLabsModel = "eleven_multilingual_v2",
|
||||
TtsElevenLabsVoiceId = "voice-123",
|
||||
HasSeenActivityStreamTip = true,
|
||||
SkippedUpdateTag = "v1.2.3",
|
||||
NotifyChatResponses = false,
|
||||
@ -76,6 +82,11 @@ public class SettingsRoundTripTests
|
||||
Assert.Equal(original.NodeCameraEnabled, restored.NodeCameraEnabled);
|
||||
Assert.Equal(original.NodeLocationEnabled, restored.NodeLocationEnabled);
|
||||
Assert.Equal(original.NodeBrowserProxyEnabled, restored.NodeBrowserProxyEnabled);
|
||||
Assert.Equal(original.NodeTtsEnabled, restored.NodeTtsEnabled);
|
||||
Assert.Equal(original.TtsProvider, restored.TtsProvider);
|
||||
Assert.Equal(original.TtsElevenLabsApiKey, restored.TtsElevenLabsApiKey);
|
||||
Assert.Equal(original.TtsElevenLabsModel, restored.TtsElevenLabsModel);
|
||||
Assert.Equal(original.TtsElevenLabsVoiceId, restored.TtsElevenLabsVoiceId);
|
||||
Assert.Equal(original.HasSeenActivityStreamTip, restored.HasSeenActivityStreamTip);
|
||||
Assert.Equal(original.SkippedUpdateTag, restored.SkippedUpdateTag);
|
||||
Assert.Equal(original.NotifyChatResponses, restored.NotifyChatResponses);
|
||||
@ -133,6 +144,11 @@ public class SettingsRoundTripTests
|
||||
Assert.True(settings.NodeCameraEnabled);
|
||||
Assert.True(settings.NodeLocationEnabled);
|
||||
Assert.True(settings.NodeBrowserProxyEnabled);
|
||||
Assert.False(settings.NodeTtsEnabled);
|
||||
Assert.Equal("windows", settings.TtsProvider);
|
||||
Assert.Null(settings.TtsElevenLabsApiKey);
|
||||
Assert.Null(settings.TtsElevenLabsModel);
|
||||
Assert.Null(settings.TtsElevenLabsVoiceId);
|
||||
Assert.False(settings.HasSeenActivityStreamTip);
|
||||
Assert.Null(settings.SkippedUpdateTag);
|
||||
Assert.True(settings.NotifyChatResponses);
|
||||
@ -182,6 +198,11 @@ public class SettingsRoundTripTests
|
||||
Assert.True(settings.NodeCameraEnabled);
|
||||
Assert.True(settings.NodeLocationEnabled);
|
||||
Assert.True(settings.NodeBrowserProxyEnabled);
|
||||
Assert.False(settings.NodeTtsEnabled);
|
||||
Assert.Equal("windows", settings.TtsProvider);
|
||||
Assert.Null(settings.TtsElevenLabsApiKey);
|
||||
Assert.Null(settings.TtsElevenLabsModel);
|
||||
Assert.Null(settings.TtsElevenLabsVoiceId);
|
||||
Assert.False(settings.HasSeenActivityStreamTip);
|
||||
Assert.Null(settings.SkippedUpdateTag);
|
||||
Assert.True(settings.GlobalHotkeyEnabled);
|
||||
@ -194,6 +215,23 @@ public class SettingsRoundTripTests
|
||||
Assert.Null(SettingsData.FromJson("not json at all"));
|
||||
}
|
||||
|
||||
/// <summary>
/// A protected secret carries the "dpapi:" prefix, does not contain the plaintext,
/// and round-trips back to the original value.
/// </summary>
[Fact]
public void SettingsManager_ProtectsElevenLabsApiKeyForStorage()
{
    const string plaintext = "elevenlabs-key";

    var stored = SettingsManager.ProtectSettingSecret(plaintext);

    Assert.NotNull(stored);
    Assert.StartsWith("dpapi:", stored);
    Assert.DoesNotContain(plaintext, stored);

    var roundTripped = SettingsManager.UnprotectSettingSecret(stored);
    Assert.Equal(plaintext, roundTripped);
}
|
||||
|
||||
/// <summary>
/// A "dpapi:"-prefixed value with an invalid payload unprotects to null.
/// </summary>
[Fact]
public void SettingsManager_ReturnsNullForCorruptedProtectedSecret()
{
    var recovered = SettingsManager.UnprotectSettingSecret("dpapi:not-base64");

    Assert.Null(recovered);
}
|
||||
|
||||
[Theory]
|
||||
[InlineData(null)]
|
||||
[InlineData("")]
|
||||
|
||||
@ -346,6 +346,10 @@ public class TrayMenuWindowMarkupTests
|
||||
Assert.Contains(@"AutomationProperties.AutomationId=""NodeCameraToggle""", xaml);
|
||||
Assert.Contains(@"AutomationProperties.AutomationId=""NodeLocationToggle""", xaml);
|
||||
Assert.Contains(@"AutomationProperties.AutomationId=""NodeBrowserProxyToggle""", xaml);
|
||||
Assert.Contains(@"AutomationProperties.AutomationId=""NodeTtsToggle""", xaml);
|
||||
Assert.Contains(@"AutomationProperties.AutomationId=""TtsProviderComboBox""", xaml);
|
||||
Assert.Contains(@"AutomationProperties.AutomationId=""TtsElevenLabsSettingsPanel""", xaml);
|
||||
Assert.Contains(@"AutomationProperties.AutomationId=""TtsElevenLabsApiKeyPasswordBox""", xaml);
|
||||
}
|
||||
|
||||
[Fact]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user