From c4f4dc556ad36c8537f250ef7a105f048d3e8980 Mon Sep 17 00:00:00 2001 From: Haiping Chen Date: Tue, 8 Apr 2025 17:00:35 -0500 Subject: [PATCH] InputAudioTranscribe --- .../MLTasks/IRealTimeCompletion.cs | 2 +- .../Realtime/Models/RealtimeHubConnection.cs | 2 -- .../Realtime/Models/RealtimeModelSettings.cs | 2 ++ .../Services/RealtimeHub.cs | 2 +- .../Realtime/RealTimeCompletionProvider.cs | 33 +++++++++++-------- .../Controllers/TwilioVoiceController.cs | 8 +++++ .../Interfaces/ITwilioCallStatusHook.cs | 4 +++ 7 files changed, 35 insertions(+), 18 deletions(-) diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs index fbbcc61f4..396ccf02b 100644 --- a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs +++ b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IRealTimeCompletion.cs @@ -16,7 +16,7 @@ Task Connect(RealtimeHubConnection conn, Action> onModelResponseDone, Action onConversationItemCreated, Action onInputAudioTranscriptionCompleted, - Action onUserInterrupted); + Action onInterruptionDetected); Task AppenAudioBuffer(string message); Task AppenAudioBuffer(ArraySegment data, int length); diff --git a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs index 7f234cb3d..6201967a9 100644 --- a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs +++ b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeHubConnection.cs @@ -1,5 +1,3 @@ -using System.Collections.Concurrent; - namespace BotSharp.Abstraction.Realtime.Models; public class RealtimeHubConnection diff --git a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeModelSettings.cs b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeModelSettings.cs index 424e5efea..7ebe2c426 100644 --- a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeModelSettings.cs +++ b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/RealtimeModelSettings.cs @@ -7,9 +7,11 @@ public class RealtimeModelSettings public bool InterruptResponse { get; set; } = true; public string InputAudioFormat { get; set; } = "g711_ulaw"; public string OutputAudioFormat { get; set; } = "g711_ulaw"; + public bool InputAudioTranscribe { get; set; } = false; public string Voice { get; set; } = "alloy"; public float Temperature { get; set; } = 0.8f; public int MaxResponseOutputTokens { get; set; } = 512; + public int ModelResponseTimeout { get; set; } = 30; public AudioTranscription InputAudioTranscription { get; set; } = new(); public ModelTurnDetection TurnDetection { get; set; } = new(); } diff --git a/src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs b/src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs index 7e4ad7fe3..25bdbf2e0 100644 --- a/src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs +++ b/src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs @@ -139,7 +139,7 @@ await _completer.Connect(_conn, await hook.OnMessageReceived(message); } }, - onUserInterrupted: async () => + onInterruptionDetected: async () => { if (settings.InterruptResponse) { diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs index 647efd9bf..1f77a89f3 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs @@ -37,7 +37,7 @@ public async Task Connect(RealtimeHubConnection conn, Action> onModelResponseDone, Action onConversationItemCreated, Action onInputAudioTranscriptionCompleted, - Action onUserInterrupted) + Action onInterruptionDetected) { var realtimeModelSettings = _services.GetRequiredService(); _model = realtimeModelSettings.Model; @@ -62,7 +62,7 @@ public async Task Connect(RealtimeHubConnection conn, onModelResponseDone, onConversationItemCreated, onInputAudioTranscriptionCompleted, - onUserInterrupted); + onInterruptionDetected); } } @@ -139,11 +139,12 @@ private async Task ReceiveMessage(RealtimeHubConnection conn, Action> onModelResponseDone, Action onConversationItemCreated, Action onUserAudioTranscriptionCompleted, - Action onUserInterrupted) + Action onInterruptionDetected) { var buffer = new byte[1024 * 32]; // Model response timeout - var timeout = 30; + var settings = _services.GetRequiredService(); + var timeout = settings.ModelResponseTimeout; WebSocketReceiveResult? result = default; do @@ -241,7 +242,7 @@ private async Task ReceiveMessage(RealtimeHubConnection conn, else if (response.Type == "input_audio_buffer.speech_started") { // Handle user interuption - onUserInterrupted(); + onInterruptionDetected(); } } while (!result.CloseStatus.HasValue); @@ -290,9 +291,6 @@ public async Task UpdateSession(RealtimeHubConnection conn) return fn; }).ToArray(); - var words = new List(); - HookEmitter.Emit(_services, hook => words.AddRange(hook.OnModelTranscriptPrompt(agent))); - var realtimeModelSettings = _services.GetRequiredService(); var sessionUpdate = new @@ -302,12 +300,6 @@ public async Task UpdateSession(RealtimeHubConnection conn) { InputAudioFormat = realtimeModelSettings.InputAudioFormat, OutputAudioFormat = realtimeModelSettings.OutputAudioFormat, - /*InputAudioTranscription = new InputAudioTranscription - { - Model = realtimeModelSettings.InputAudioTranscription.Model, - Language = realtimeModelSettings.InputAudioTranscription.Language, - Prompt = string.Join(", ", words.Select(x => x.ToLower().Trim()).Distinct()).SubstringMax(1024) - },*/ Voice = realtimeModelSettings.Voice, Instructions = instruction, ToolChoice = "auto", @@ -329,6 +321,19 @@ public async Task UpdateSession(RealtimeHubConnection conn) } }; + if (realtimeModelSettings.InputAudioTranscribe) + { + var words = new List(); + HookEmitter.Emit(_services, hook => words.AddRange(hook.OnModelTranscriptPrompt(agent))); + + sessionUpdate.session.InputAudioTranscription = new InputAudioTranscription + { + Model = realtimeModelSettings.InputAudioTranscription.Model, + Language = realtimeModelSettings.InputAudioTranscription.Language, + Prompt = string.Join(", ", words.Select(x => x.ToLower().Trim()).Distinct()).SubstringMax(1024) + }; + } + await HookEmitter.Emit(_services, async hook => { await hook.OnSessionUpdated(agent, instruction, functions); diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs index 8f042ca44..e6cd56123 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs @@ -357,6 +357,14 @@ await HookEmitter.Emit(_services, async hook => { await HookEmitter.Emit(_services, x => x.OnCallNoAnswerStatus(request)); } + else if (request.CallStatus == "canceled") + { + await HookEmitter.Emit(_services, x => x.OnCallCanceledStatus(request)); + } + else if (request.CallStatus == "failed") + { + await HookEmitter.Emit(_services, x => x.OnCallFailedStatus(request)); + } return Ok(); } diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Interfaces/ITwilioCallStatusHook.cs b/src/Plugins/BotSharp.Plugin.Twilio/Interfaces/ITwilioCallStatusHook.cs index 747679cc2..e04d3ae7d 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Interfaces/ITwilioCallStatusHook.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Interfaces/ITwilioCallStatusHook.cs @@ -20,4 +20,8 @@ public interface ITwilioCallStatusHook Task OnCallBusyStatus(ConversationalVoiceRequest request); Task OnCallNoAnswerStatus(ConversationalVoiceRequest request); + + Task OnCallCanceledStatus(ConversationalVoiceRequest request); + + Task OnCallFailedStatus(ConversationalVoiceRequest request); }