// Tachikoma/Examples/RealtimeVoiceAssistant.swift
// 2026-05-10 09:43:33 +01:00
//
// 553 lines
// 20 KiB
// Swift

import Foundation
import Tachikoma
import TachikomaAudio
// MARK: - Complete Realtime API Example
/// Runnable walkthrough of the Realtime API features exposed by Tachikoma.
///
/// Every public method is an independent scenario: it creates its own
/// `RealtimeConversation`, drives it for a fixed amount of time, and ends the
/// session before returning. Once a session has started, it is ended even when
/// a later step throws, and any background monitor a scenario starts is
/// cancelled when the scenario returns, so scenarios can be run back to back.
@available(macOS 14.0, iOS 17.0, *)
@MainActor
class RealtimeVoiceAssistant {
    /// API key used to authenticate every conversation created by the examples.
    private let apiKey: String

    /// - Parameter apiKey: Key passed to each `RealtimeConversation` the examples create.
    init(apiKey: String) {
        self.apiKey = apiKey
    }

    // MARK: - Basic Voice Conversation

    /// Runs a manually controlled voice turn: listens for 3 seconds, stops, then
    /// lets the conversation run for another 10 seconds while printing
    /// transcripts and loud audio levels.
    ///
    /// - Throws: Any error raised while creating, starting, or driving the conversation.
    func basicVoiceConversation() async throws {
        print("🎙️ Starting Basic Voice Conversation...")

        // Simple configuration for voice conversation
        let config = TachikomaConfiguration()
        config.setAPIKey(self.apiKey, for: .openai)

        let session = try RealtimeConversation(configuration: config)
        try await session.start(
            model: .custom("gpt-realtime"),
            voice: .nova,
            instructions: "You are a helpful, witty, and friendly AI assistant. Keep responses concise.",
        )
        print("✅ Connected to Realtime API")

        // Subscribe before capturing audio so updates emitted while the user is
        // speaking are printed as they happen rather than after the turn ended.
        let transcriptObserver = Task {
            for await transcript in session.transcriptUpdates {
                print("📝 Transcript: \(transcript)")
            }
        }
        let levelObserver = Task {
            for await level in session.audioLevelUpdates where level > 0.5 {
                print("🔊 Audio Level: \(String(format: "%.2f", level))")
            }
        }
        defer {
            transcriptObserver.cancel()
            levelObserver.cancel()
        }

        do {
            // Manual turn control
            print("🎤 Starting to listen...")
            try await session.startListening()
            // Simulate user speaking for 3 seconds
            try await Task.sleep(for: .seconds(3))
            try await session.stopListening()
            print("🛑 Stopped listening, processing response...")
            // Let conversation run for 10 seconds
            try await Task.sleep(for: .seconds(10))
        } catch {
            // Do not leave the session open when a step fails.
            await session.end()
            throw error
        }

        await session.end()
        print("👋 Conversation ended")
    }

    // MARK: - Advanced Configuration with VAD

    /// Runs a 30-second conversation configured with server-side voice activity
    /// detection and reconnect/buffering settings, printing a status report
    /// every 3 seconds.
    ///
    /// - Throws: Any error raised while creating, starting, or driving the conversation.
    func advancedVoiceWithVAD() async throws {
        print("\n🎯 Starting Advanced Voice Conversation with Server VAD...")

        // Advanced configuration with all features
        let config = SessionConfiguration(
            model: "gpt-realtime",
            voice: .nova,
            instructions: """
            You are an expert AI assistant with deep knowledge across many domains.
            Provide helpful, accurate, and engaging responses.
            Use a conversational tone while maintaining professionalism.
            """,
            inputAudioFormat: .pcm16,
            outputAudioFormat: .pcm16,
            inputAudioTranscription: .whisper, // Enable transcription
            turnDetection: RealtimeTurnDetection(
                type: .serverVad,
                threshold: 0.5,
                silenceDurationMs: 200, // 200ms silence to end turn
                prefixPaddingMs: 300, // Include 300ms before speech
                createResponse: true, // Auto-respond after turn
            ),
            tools: nil,
            toolChoice: nil,
            temperature: 0.8,
            maxResponseOutputTokens: 4096,
            modalities: .all, // Both text and audio
        )

        // Production settings with auto-reconnect
        let settings = ConversationSettings(
            autoReconnect: true,
            maxReconnectAttempts: 3,
            reconnectDelay: 2.0,
            bufferWhileDisconnected: true,
            maxAudioBufferSize: 1024 * 1024, // 1MB buffer
            enableEchoCancellation: true,
            enableNoiseSuppression: true,
            localVADThreshold: 0.3,
            showAudioLevels: true,
            persistConversation: false,
        )

        let session = try RealtimeConversation(
            apiKey: self.apiKey,
            configuration: config,
            settings: settings,
        )
        try await session.start()
        print("✅ Connected with Server VAD enabled")

        let statusMonitor = self.observeConversationState(of: session)
        defer { statusMonitor.cancel() }

        do {
            // Server VAD will automatically detect speech start/stop
            print("🎤 Server VAD is listening for speech...")
            print("💡 Speak naturally - the server will detect when you start and stop talking")
            // Run for 30 seconds
            try await Task.sleep(for: .seconds(30))
        } catch {
            await session.end()
            throw error
        }

        await session.end()
        print("👋 Advanced conversation ended")
    }

    // MARK: - Function Calling Example

    /// Runs a 30-second conversation with three tools (`get_weather`,
    /// `calculate`, `get_time`) registered, printing each function call and its
    /// result as they appear in the conversation items.
    ///
    /// - Throws: Any error raised while creating, starting, or driving the conversation.
    func voiceWithFunctionCalling() async throws {
        print("\n🛠️ Starting Voice Conversation with Function Calling...")

        // Tool schemas advertised to the model
        let config = SessionConfiguration.withTools(
            model: "gpt-realtime",
            voice: .nova,
            tools: [
                // Weather tool
                RealtimeTool(
                    name: "get_weather",
                    description: "Get current weather for any location",
                    parameters: AgentToolParameters(
                        properties: [
                            "location": AgentToolParameterProperty(
                                name: "location",
                                type: .string,
                                description: "City and state/country, e.g., 'Tokyo, Japan'",
                            ),
                            "units": AgentToolParameterProperty(
                                name: "units",
                                type: .string,
                                description: "Temperature units: 'celsius' or 'fahrenheit'",
                                enumValues: ["celsius", "fahrenheit"],
                            ),
                        ],
                        required: ["location"],
                    ),
                ),
                // Calculator tool
                RealtimeTool(
                    name: "calculate",
                    description: "Perform mathematical calculations",
                    parameters: AgentToolParameters(
                        properties: [
                            "expression": AgentToolParameterProperty(
                                name: "expression",
                                type: .string,
                                // Tells the model which syntax the local evaluator accepts.
                                description: "Arithmetic expression using +, -, *, / and parentheses, e.g., '25 * 4 + 10'",
                            ),
                        ],
                        required: ["expression"],
                    ),
                ),
                // Time tool
                RealtimeTool(
                    name: "get_time",
                    description: "Get current time in any timezone",
                    parameters: AgentToolParameters(
                        properties: [
                            "timezone": AgentToolParameterProperty(
                                name: "timezone",
                                type: .string,
                                description: "Timezone name, e.g., 'America/New_York', 'Asia/Tokyo'",
                            ),
                        ],
                        required: ["timezone"],
                    ),
                ),
            ],
        )

        let session = try RealtimeConversation(
            apiKey: self.apiKey,
            configuration: config,
            settings: .production,
        )

        // Register tool executors
        await session.registerTools([
            createTool(
                name: "get_weather",
                parameters: [
                    AgentToolParameterProperty(name: "location", type: .string, description: "Location"),
                    AgentToolParameterProperty(name: "units", type: .string, description: "Units"),
                ],
            ) { args in
                let location = try args.stringValue("location")
                let units = args.optionalStringValue("units") ?? "celsius"
                let usesFahrenheit = units == "fahrenheit"

                // Simulated reading: generated in Celsius and converted on request so
                // the number always agrees with the unit suffix.
                let celsius = Int.random(in: 15...30)
                let temperature = usesFahrenheit ? celsius * 9 / 5 + 32 : celsius
                let conditions = ["sunny", "cloudy", "partly cloudy", "rainy"].randomElement() ?? "sunny"

                return .string("""
                Weather in \(location):
                Temperature: \(temperature)°\(usesFahrenheit ? "F" : "C")
                Conditions: \(conditions)
                Humidity: \(Int.random(in: 40...80))%
                Wind: \(Int.random(in: 5...20)) km/h
                """)
            },
            createTool(
                name: "calculate",
                parameters: [
                    AgentToolParameterProperty(name: "expression", type: .string, description: "Math expression"),
                ],
            ) { args in
                let expression = try args.stringValue("expression")
                // The expression is produced by the model, i.e. untrusted input. It is
                // evaluated by a small local parser instead of `NSExpression(format:)`,
                // which raises an uncatchable Objective-C exception on malformed input
                // and can invoke arbitrary functions.
                var parser = ArithmeticParser(expression)
                guard let value = parser.evaluate() else {
                    return .string("Error: Invalid expression")
                }
                return .string("Result: \(value)")
            },
            createTool(
                name: "get_time",
                parameters: [
                    AgentToolParameterProperty(name: "timezone", type: .string, description: "Timezone"),
                ],
            ) { args in
                let timezone = try args.stringValue("timezone")
                // Report unknown identifiers instead of silently answering with the
                // local time labelled as the requested zone.
                guard let zone = TimeZone(identifier: timezone) else {
                    return .string("Error: Unknown timezone '\(timezone)'")
                }
                let formatter = DateFormatter()
                // Fixed-format output must not vary with the user's locale settings.
                formatter.locale = Locale(identifier: "en_US_POSIX")
                formatter.timeZone = zone
                formatter.dateFormat = "yyyy-MM-dd HH:mm:ss z"
                return .string("Current time in \(timezone): \(formatter.string(from: Date()))")
            },
        ])

        try await session.start()
        print("✅ Connected with function calling enabled")
        print("\n📢 Try these voice commands:")
        print(" - 'What's the weather in Tokyo?'")
        print(" - 'Calculate 25 times 4 plus 10'")
        print(" - 'What time is it in New York?'")
        print(" - 'What's the weather in Paris in fahrenheit?'")

        // Poll the conversation items once per second and report function calls.
        let callMonitor = Task {
            // Keys of reports already printed, so a call is not re-announced on
            // every poll. A call is reported again once its output becomes available.
            var reported = Set<String>()
            while !Task.isCancelled {
                for (index, item) in session.items.enumerated() where item.type == "function_call" {
                    let key = "\(index):\(item.output != nil)"
                    guard reported.insert(key).inserted else { continue }
                    print("🔧 Function called: \(item.name ?? "unknown")")
                    if let output = item.output {
                        print(" Result: \(output)")
                    }
                }
                try await Task.sleep(for: .seconds(1))
            }
        }
        defer { callMonitor.cancel() }

        do {
            // Run for 30 seconds
            try await Task.sleep(for: .seconds(30))
        } catch {
            await session.end()
            throw error
        }

        await session.end()
        print("👋 Function calling conversation ended")
    }

    // MARK: - Dynamic Modality Switching

    /// Cycles a live conversation through text+audio, text-only, audio-only and
    /// back to text+audio, pausing 5 seconds in each mode.
    ///
    /// - Throws: Any error raised while creating, starting, or driving the conversation.
    func dynamicModalitySwitching() async throws {
        print("\n🔄 Starting Dynamic Modality Switching Example...")

        let session = try RealtimeConversation(
            apiKey: self.apiKey,
            configuration: SessionConfiguration.voiceConversation(),
            settings: .production,
        )
        try await session.start()
        print("✅ Connected with all modalities")

        do {
            // Start with both text and audio
            print("🎙️ Mode: Text + Audio")
            try await Task.sleep(for: .seconds(5))

            // Switch to text-only
            print("📝 Switching to text-only mode...")
            try await session.updateModalities(.text)
            try await session.sendText("Hello! Can you explain what modalities are?")
            try await Task.sleep(for: .seconds(5))

            // Switch to audio-only
            print("🎤 Switching to audio-only mode...")
            try await session.updateModalities(.audio)
            try await Task.sleep(for: .seconds(5))

            // Switch back to both
            print("🎙️📝 Switching back to text + audio mode...")
            try await session.updateModalities(.all)
            try await Task.sleep(for: .seconds(5))
        } catch {
            await session.end()
            throw error
        }

        await session.end()
        print("👋 Modality switching example ended")
    }

    // MARK: - Conversation Management

    /// Sends a few text messages, prints the text items held by the conversation,
    /// then clears the history and asks the same recall question again.
    ///
    /// - Throws: Any error raised while creating, starting, or driving the conversation.
    func conversationManagement() async throws {
        print("\n📚 Starting Conversation Management Example...")

        let session = try RealtimeConversation(
            apiKey: self.apiKey,
            configuration: SessionConfiguration.voiceConversation(),
            settings: .production,
        )
        try await session.start()

        do {
            // Send initial messages
            try await session.sendText("Remember this number: 42")
            try await Task.sleep(for: .seconds(2))
            try await session.sendText("Also remember this word: Tachikoma")
            try await Task.sleep(for: .seconds(2))

            // Check conversation history; only items whose first content part is text are printed.
            let history = session.items
            print("📜 Conversation items: \(history.count)")
            for item in history {
                guard let content = item.content?.first, case "text" = content.type else { continue }
                print(" [\(item.role ?? "unknown")]: \(content.text ?? "")")
            }

            // Test memory
            try await session.sendText("What number and word did I ask you to remember?")
            try await Task.sleep(for: .seconds(5))

            // Clear conversation
            print("🗑️ Clearing conversation history...")
            try await session.clearConversation()

            // Test memory after clear
            try await session.sendText("What number and word did I mention earlier?")
            try await Task.sleep(for: .seconds(5))
        } catch {
            await session.end()
            throw error
        }

        await session.end()
        print("👋 Conversation management example ended")
    }

    // MARK: - Error Handling and Reconnection

    /// Runs a 10-second conversation with auto-reconnect settings while printing
    /// the connection state every 2 seconds, starting before the connection is
    /// opened.
    ///
    /// - Throws: Any error raised while creating, starting, or driving the conversation.
    func errorHandlingExample() async throws {
        print("\n⚠️ Starting Error Handling and Reconnection Example...")

        // Settings with aggressive reconnection
        let settings = ConversationSettings(
            autoReconnect: true,
            maxReconnectAttempts: 5,
            reconnectDelay: 1.0,
            bufferWhileDisconnected: true,
            maxAudioBufferSize: 2 * 1024 * 1024, // 2MB buffer
        )
        let session = try RealtimeConversation(
            apiKey: self.apiKey,
            configuration: SessionConfiguration.voiceConversation(),
            settings: settings,
        )

        // Started before `start()` so the state is also reported while connecting.
        let connectionMonitor = Task {
            while !Task.isCancelled {
                let state = session.state
                print("📡 State: \(state.rawValue), Connected: \(session.isConnected)")
                if state == .reconnecting {
                    print("🔄 Attempting to reconnect...")
                } else if state == .error {
                    print("❌ Error state detected")
                }
                try await Task.sleep(for: .seconds(2))
            }
        }
        defer { connectionMonitor.cancel() }

        try await session.start()
        print("✅ Connected with auto-reconnect enabled")

        do {
            // Simulate conversation
            try await session.sendText("Testing connection stability")
            // Note: In a real scenario, you could test disconnection by:
            // - Disabling network
            // - Killing the connection
            // - Server-side timeout
            print("💡 Auto-reconnect will handle network interruptions")
            print("💾 Audio is buffered during disconnection")
            // Run for 10 seconds
            try await Task.sleep(for: .seconds(10))
        } catch {
            await session.end()
            throw error
        }

        await session.end()
        print("👋 Error handling example ended")
    }

    // MARK: - Helper Methods

    /// Starts a background task that prints a status report for `conversation`
    /// every 3 seconds until the returned task is cancelled.
    ///
    /// - Parameter conversation: The conversation whose state is reported.
    /// - Returns: The monitoring task; the caller is responsible for cancelling it.
    private func observeConversationState(of conversation: RealtimeConversation) -> Task<Void, Error> {
        Task {
            while !Task.isCancelled {
                print("""
                📊 Status:
                State: \(conversation.state.rawValue)
                Connected: \(conversation.isConnected)
                Listening: \(conversation.isListening)
                Speaking: \(conversation.isSpeaking)
                Turn Active: \(conversation.turnActive)
                Audio Level: \(String(format: "%.2f", conversation.audioLevel))
                Items: \(conversation.items.count)
                """)
                // Throws on cancellation, which ends the task.
                try await Task.sleep(for: .seconds(3))
            }
        }
    }

    /// Minimal recursive-descent evaluator for plain arithmetic.
    ///
    /// Supports decimal numbers, `+`, `-`, `*`, `/`, unary signs, parentheses and
    /// whitespace. Anything else — including trailing garbage, division by zero
    /// and non-finite results — makes `evaluate()` return `nil`.
    private struct ArithmeticParser {
        private let characters: [Character]
        private var position = 0

        init(_ text: String) {
            self.characters = Array(text)
        }

        /// Evaluates the whole input; returns `nil` unless all of it is a valid expression.
        mutating func evaluate() -> Double? {
            guard let value = self.parseSum() else { return nil }
            self.skipWhitespace()
            guard self.position == self.characters.count, value.isFinite else { return nil }
            return value
        }

        /// sum := product (('+' | '-') product)*
        private mutating func parseSum() -> Double? {
            guard var total = self.parseProduct() else { return nil }
            while true {
                if self.consume("+") {
                    guard let rhs = self.parseProduct() else { return nil }
                    total += rhs
                } else if self.consume("-") {
                    guard let rhs = self.parseProduct() else { return nil }
                    total -= rhs
                } else {
                    return total
                }
            }
        }

        /// product := unary (('*' | '/') unary)*
        private mutating func parseProduct() -> Double? {
            guard var total = self.parseUnary() else { return nil }
            while true {
                if self.consume("*") {
                    guard let rhs = self.parseUnary() else { return nil }
                    total *= rhs
                } else if self.consume("/") {
                    guard let rhs = self.parseUnary(), rhs != 0 else { return nil }
                    total /= rhs
                } else {
                    return total
                }
            }
        }

        /// unary := ('+' | '-') unary | primary
        private mutating func parseUnary() -> Double? {
            if self.consume("-") {
                guard let operand = self.parseUnary() else { return nil }
                return -operand
            }
            if self.consume("+") {
                return self.parseUnary()
            }
            return self.parsePrimary()
        }

        /// primary := number | '(' sum ')'
        private mutating func parsePrimary() -> Double? {
            if self.consume("(") {
                guard let inner = self.parseSum(), self.consume(")") else { return nil }
                return inner
            }
            return self.parseNumber()
        }

        /// Reads a run of ASCII digits and dots and converts it with `Double.init`.
        private mutating func parseNumber() -> Double? {
            self.skipWhitespace()
            let start = self.position
            while self.position < self.characters.count {
                let character = self.characters[self.position]
                guard (character.isASCII && character.isNumber) || character == "." else { break }
                self.position += 1
            }
            guard self.position > start else { return nil }
            return Double(String(self.characters[start..<self.position]))
        }

        /// Advances past `symbol` (after optional whitespace) and reports whether it was present.
        private mutating func consume(_ symbol: Character) -> Bool {
            self.skipWhitespace()
            guard self.position < self.characters.count, self.characters[self.position] == symbol else {
                return false
            }
            self.position += 1
            return true
        }

        private mutating func skipWhitespace() {
            while self.position < self.characters.count, self.characters[self.position].isWhitespace {
                self.position += 1
            }
        }
    }
}
// MARK: - Main Example Runner
/// Runs the example selected by a command-line flag.
///
/// Reads the API key from the `OPENAI_API_KEY` environment variable; when it is
/// missing or blank an error is printed and nothing runs. Flags are checked in
/// the order `--basic`, `--vad`, `--tools`, `--modality`, `--conversation`,
/// `--error`, `--all`; the first one present in the arguments wins. With no
/// recognised flag the usage text is printed.
///
/// - Throws: Any error thrown by the selected example. With `--all`, the first
///   failing example aborts the remaining ones.
@available(macOS 14.0, iOS 17.0, *)
@MainActor
func runRealtimeExamples() async throws {
    // A blank key can never authenticate, so treat it like a missing one
    // instead of failing later inside an example.
    guard
        let apiKey = ProcessInfo.processInfo.environment["OPENAI_API_KEY"],
        !apiKey.trimmingCharacters(in: .whitespacesAndNewlines).isEmpty
    else {
        print("❌ Error: OPENAI_API_KEY environment variable not set")
        return
    }

    let assistant = RealtimeVoiceAssistant(apiKey: apiKey)
    print("""
    ╔════════════════════════════════════════════╗
    ║ OpenAI Realtime API Examples ║
    ║ Tachikoma Swift SDK ║
    ╚════════════════════════════════════════════╝
    """)

    // Listed in precedence order: the first flag found in the arguments is the one that runs.
    let knownFlags = ["--basic", "--vad", "--tools", "--modality", "--conversation", "--error", "--all"]
    let arguments = CommandLine.arguments
    let selectedFlag = knownFlags.first { arguments.contains($0) }

    switch selectedFlag {
    case "--basic"?:
        try await assistant.basicVoiceConversation()
    case "--vad"?:
        try await assistant.advancedVoiceWithVAD()
    case "--tools"?:
        try await assistant.voiceWithFunctionCalling()
    case "--modality"?:
        try await assistant.dynamicModalitySwitching()
    case "--conversation"?:
        try await assistant.conversationManagement()
    case "--error"?:
        try await assistant.errorHandlingExample()
    case "--all"?:
        // Run all examples
        try await assistant.basicVoiceConversation()
        try await assistant.advancedVoiceWithVAD()
        try await assistant.voiceWithFunctionCalling()
        try await assistant.dynamicModalitySwitching()
        try await assistant.conversationManagement()
        try await assistant.errorHandlingExample()
    default:
        print("""
        Usage: swift run RealtimeVoiceAssistant [option]
        Options:
        --basic Basic voice conversation
        --vad Advanced with Server VAD
        --tools Function calling example
        --modality Dynamic modality switching
        --conversation Conversation management
        --error Error handling and reconnection
        --all Run all examples
        Make sure OPENAI_API_KEY is set in your environment.
        """)
    }
}
// Entry point for standalone execution.
//
// The examples are awaited directly at top level. Launching them in an
// unawaited `Task` lets the top-level code reach its end first, at which point
// the process can exit before any example has run.
#if os(macOS) || os(iOS)
if #available(macOS 14.0, iOS 17.0, *) {
    do {
        try await runRealtimeExamples()
    } catch {
        print("❌ Error: \(error)")
        // Signal the failure to the calling shell.
        exit(1)
    }
}
#endif