From 21aab6682628f740f4e557d3615a87c39a54eebb Mon Sep 17 00:00:00 2001 From: Haiping Chen Date: Fri, 21 Mar 2025 19:28:22 -0500 Subject: [PATCH] IAudioSynthesis --- .../MLTasks/IAudioCompletion.cs | 15 -- .../MLTasks/IAudioSynthesis.cs | 15 ++ .../MLTasks/IAudioTranscription.cs | 17 ++ .../MLTasks/Settings/LlmModelSetting.cs | 10 +- .../Realtime/Models/ModelTurnDetection.cs | 4 +- .../Services/RealtimeHub.cs | 2 +- .../Instruct/FileInstructService.Audio.cs | 4 +- .../Instruct/FileInstructService.Pdf.cs | 2 +- .../FileInstructService.SelectFile.cs | 2 +- .../Infrastructures/CompletionProvider.cs | 35 +++- .../Controllers/InstructModeController.cs | 8 +- .../Controllers/RealtimeController.cs | 2 +- .../AudioHandlerPlugin.cs | 2 +- .../Functions/HandleAudioRequestFn.cs | 6 +- .../Provider/NativeWhisperProvider.cs | 4 +- .../AzureOpenAiPlugin.cs | 2 +- .../AudioCompletionProvider.SpeechToText.cs | 2 +- .../Audio/AudioCompletionProvider.cs | 2 +- .../Functions/HandleEmailReaderFn.cs | 2 +- .../Functions/ReadImageFn.cs | 2 +- .../Functions/ReadPdfFn.cs | 2 +- .../Models/Realtime/RealtimeSessionBody.cs | 14 +- .../BotSharp.Plugin.OpenAI/OpenAiPlugin.cs | 3 +- .../Audio/AudioCompletionProvider.cs | 21 --- ...tToSpeech.cs => AudioSynthesisProvider.cs} | 20 ++- ...oText.cs => AudioTranscriptionProvider.cs} | 21 ++- .../Realtime/RealTimeCompletionProvider.cs | 4 +- .../Controllers/TwilioOutboundController.cs | 65 ++++++++ .../Controllers/TwilioVoiceController.cs | 157 +----------------- .../Functions/HangupPhoneCallFn.cs | 4 +- .../Functions/LeaveVoicemailFn.cs | 4 +- .../Functions/OutboundPhoneCallFn.cs | 4 +- .../Functions/TransferPhoneCallFn.cs | 4 +- .../Services/TwilioMessageQueueService.cs | 4 +- .../Services/TwilioService.cs | 86 +++++++++- 35 files changed, 307 insertions(+), 244 deletions(-) delete mode 100644 src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioCompletion.cs create mode 100644 src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioSynthesis.cs create mode 100644 src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioTranscription.cs delete mode 100644 src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.cs rename src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/{AudioCompletionProvider.TextToSpeech.cs => AudioSynthesisProvider.cs} (84%) rename src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/{AudioCompletionProvider.SpeechToText.cs => AudioTranscriptionProvider.cs} (83%) create mode 100644 src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioOutboundController.cs diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioCompletion.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioCompletion.cs deleted file mode 100644 index 175a79a7d..000000000 --- a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioCompletion.cs +++ /dev/null @@ -1,15 +0,0 @@ -using System.IO; - -namespace BotSharp.Abstraction.MLTasks; - -public interface IAudioCompletion -{ - string Provider { get; } - - string Model { get; } - - Task GenerateTextFromAudioAsync(Stream audio, string audioFileName, string? text = null); - Task GenerateAudioFromTextAsync(string text, string? voice = "alloy", string? format = "mp3"); - - void SetModelName(string model); -} diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioSynthesis.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioSynthesis.cs new file mode 100644 index 000000000..049b6ddf7 --- /dev/null +++ b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioSynthesis.cs @@ -0,0 +1,15 @@ +namespace BotSharp.Abstraction.MLTasks; + +/// +/// Text to speech synthesis +/// +public interface IAudioSynthesis +{ + string Provider { get; } + + string Model { get; } + + void SetModelName(string model); + + Task GenerateAudioAsync(string text, string? voice = "alloy", string? format = "mp3", string? instructions = null); +} diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioTranscription.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioTranscription.cs new file mode 100644 index 000000000..f56119e2a --- /dev/null +++ b/src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioTranscription.cs @@ -0,0 +1,17 @@ +using System.IO; + +namespace BotSharp.Abstraction.MLTasks; + +/// +/// Audio transcription service +/// +public interface IAudioTranscription +{ + string Provider { get; } + + string Model { get; } + + Task TranscriptTextAsync(Stream audio, string audioFileName, string? text = null); + + void SetModelName(string model); +} diff --git a/src/Infrastructure/BotSharp.Abstraction/MLTasks/Settings/LlmModelSetting.cs b/src/Infrastructure/BotSharp.Abstraction/MLTasks/Settings/LlmModelSetting.cs index a40415c70..39a4d2219 100644 --- a/src/Infrastructure/BotSharp.Abstraction/MLTasks/Settings/LlmModelSetting.cs +++ b/src/Infrastructure/BotSharp.Abstraction/MLTasks/Settings/LlmModelSetting.cs @@ -3,14 +3,14 @@ namespace BotSharp.Abstraction.MLTasks.Settings; public class LlmModelSetting { /// - /// Model Id, like "gpt-3.5" and "gpt-4". + /// Model Id, like "gpt-4", "gpt-4o", "o1". /// - public string? Id { get; set; } + public string Id { get; set; } = null!; /// /// Deployment model name /// - public string Name { get; set; } + public string Name { get; set; } = null!; /// /// Model version @@ -28,8 +28,8 @@ public class LlmModelSetting /// public string? Group { get; set; } - public string ApiKey { get; set; } - public string Endpoint { get; set; } + public string ApiKey { get; set; } = null!; + public string? Endpoint { get; set; } public LlmModelType Type { get; set; } = LlmModelType.Chat; /// diff --git a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/ModelTurnDetection.cs b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/ModelTurnDetection.cs index 38528688a..3a57791e6 100644 --- a/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/ModelTurnDetection.cs +++ b/src/Infrastructure/BotSharp.Abstraction/Realtime/Models/ModelTurnDetection.cs @@ -11,6 +11,6 @@ public class ModelTurnDetection public class AudioTranscription { - public string Model { get; set; } = "whisper-1"; - public string Language { get; set; } = "en"; + public string Model { get; set; } = "gpt-4o-mini-transcribe"; + public string? Language { get; set; } } diff --git a/src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs b/src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs index 4b20248b3..3ad194ee8 100644 --- a/src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs +++ b/src/Infrastructure/BotSharp.Core.Realtime/Services/RealtimeHub.cs @@ -74,7 +74,7 @@ private async Task ConnectToModel(WebSocket userWebSocket) if (!model.Contains("-realtime-")) { var llmProviderService = _services.GetRequiredService(); - model = llmProviderService.GetProviderModel("openai", "gpt-4", realTime: true).Name; + model = llmProviderService.GetProviderModel("openai", "gpt-4o", realTime: true).Name; } _completer.SetModelName(model); diff --git a/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Audio.cs b/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Audio.cs index e2aa844cd..0b8f25cd0 100644 --- a/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Audio.cs +++ b/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Audio.cs @@ -6,14 +6,14 @@ public partial class FileInstructService { public async Task SpeechToText(string? provider, string? model, InstructFileModel audio, string? text = null) { - var completion = CompletionProvider.GetAudioCompletion(_services, provider: provider ?? "openai", model: model ?? "whisper-1"); + var completion = CompletionProvider.GetAudioTranscriber(_services, provider: provider, model: model); var audioBytes = await DownloadFile(audio); using var stream = new MemoryStream(); stream.Write(audioBytes, 0, audioBytes.Length); stream.Position = 0; var fileName = $"{audio.FileName ?? "audio"}.{audio.FileExtension ?? "wav"}"; - var content = await completion.GenerateTextFromAudioAsync(stream, fileName, text); + var content = await completion.TranscriptTextAsync(stream, fileName, text); stream.Close(); return content; } diff --git a/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Pdf.cs b/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Pdf.cs index 9f2132a1e..af62e4792 100644 --- a/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Pdf.cs +++ b/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.Pdf.cs @@ -27,7 +27,7 @@ public async Task ReadPdf(string? provider, string? model, string? model var innerAgentId = agentId ?? Guid.Empty.ToString(); var completion = CompletionProvider.GetChatCompletion(_services, provider: provider ?? "openai", - model: model, modelId: modelId ?? "gpt-4", multiModal: true); + model: model, modelId: modelId ?? "gpt-4o", multiModal: true); var message = await completion.GetChatCompletions(new Agent() { Id = innerAgentId, diff --git a/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.SelectFile.cs b/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.SelectFile.cs index 41e0becc1..0e049b206 100644 --- a/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.SelectFile.cs +++ b/src/Infrastructure/BotSharp.Core/Files/Services/Instruct/FileInstructService.SelectFile.cs @@ -93,7 +93,7 @@ private async Task> SelectFiles(IEnumerable x == providerName); var model = llmProviderService.GetProviderModel(provider: provider, id: modelId); var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name); diff --git a/src/Infrastructure/BotSharp.Core/Infrastructures/CompletionProvider.cs b/src/Infrastructure/BotSharp.Core/Infrastructures/CompletionProvider.cs index 60677742c..386cce9e0 100644 --- a/src/Infrastructure/BotSharp.Core/Infrastructures/CompletionProvider.cs +++ b/src/Infrastructure/BotSharp.Core/Infrastructures/CompletionProvider.cs @@ -30,7 +30,7 @@ public static object GetCompletion(IServiceProvider services, } else if (settings.Type == LlmModelType.Audio) { - return GetAudioCompletion(services, provider: provider, model: model); + return GetAudioTranscriber(services, provider: provider, model: model); } else { @@ -126,20 +126,39 @@ public static ITextEmbedding GetTextEmbedding(IServiceProvider services, return completer; } - public static IAudioCompletion GetAudioCompletion( + public static IAudioTranscription GetAudioTranscriber( IServiceProvider services, - string provider, - string model) + string? provider = null, + string? model = null) { - var completions = services.GetServices(); - var completer = completions.FirstOrDefault(x => x.Provider == provider); + var completions = services.GetServices(); + var completer = completions.FirstOrDefault(x => x.Provider == (provider ?? "openai")); if (completer == null) { var logger = services.GetRequiredService>(); - logger.LogError($"Can't resolve audio-completion provider by {provider}"); + logger.LogError($"Can't resolve audio-transcriber provider by {provider}"); + return default!; } - completer.SetModelName(model); + completer.SetModelName(model ?? "gpt-4o-mini-transcribe"); + return completer; + } + + public static IAudioSynthesis GetAudioSynthesizer( + IServiceProvider services, + string? provider = null, + string? model = null) + { + var completions = services.GetServices(); + var completer = completions.FirstOrDefault(x => x.Provider == (provider ?? "openai")); + if (completer == null) + { + var logger = services.GetRequiredService>(); + logger.LogError($"Can't resolve audio-synthesizer provider by {provider}"); + return default!; + } + + completer.SetModelName(model ?? "gpt-4o-mini-tts"); return completer; } diff --git a/src/Infrastructure/BotSharp.OpenAPI/Controllers/InstructModeController.cs b/src/Infrastructure/BotSharp.OpenAPI/Controllers/InstructModeController.cs index 6a1c37055..3db477032 100644 --- a/src/Infrastructure/BotSharp.OpenAPI/Controllers/InstructModeController.cs +++ b/src/Infrastructure/BotSharp.OpenAPI/Controllers/InstructModeController.cs @@ -499,8 +499,8 @@ public async Task SpeechToText(IFormFile file, [FromForm] file.CopyTo(stream); stream.Position = 0; - var completion = CompletionProvider.GetAudioCompletion(_services, provider: provider ?? "openai", model: model ?? "whisper-1"); - var content = await completion.GenerateTextFromAudioAsync(stream, file.FileName, text); + var completion = CompletionProvider.GetAudioTranscriber(_services, provider: provider, model: model); + var content = await completion.TranscriptTextAsync(stream, file.FileName, text); viewModel.Content = content; stream.Close(); return viewModel; @@ -520,8 +520,8 @@ public async Task TextToSpeech([FromBody] TextToSpeechRequest inp var state = _services.GetRequiredService(); input.States.ForEach(x => state.SetState(x.Key, x.Value, activeRounds: x.ActiveRounds, source: StateSource.External)); - var completion = CompletionProvider.GetAudioCompletion(_services, provider: input.Provider ?? "openai", model: input.Model ?? "tts-1"); - var binaryData = await completion.GenerateAudioFromTextAsync(input.Text); + var completion = CompletionProvider.GetAudioSynthesizer(_services, provider: input.Provider, model: input.Model); + var binaryData = await completion.GenerateAudioAsync(input.Text); var stream = binaryData.ToStream(); stream.Position = 0; diff --git a/src/Infrastructure/BotSharp.OpenAPI/Controllers/RealtimeController.cs b/src/Infrastructure/BotSharp.OpenAPI/Controllers/RealtimeController.cs index 7d6fc0569..8773e35b9 100644 --- a/src/Infrastructure/BotSharp.OpenAPI/Controllers/RealtimeController.cs +++ b/src/Infrastructure/BotSharp.OpenAPI/Controllers/RealtimeController.cs @@ -22,7 +22,7 @@ public RealtimeController(IServiceProvider services) [HttpGet("/agent/{agentId}/realtime/session")] public async Task CreateSession(string agentId) { - var completion = CompletionProvider.GetRealTimeCompletion(_services, provider: "openai", modelId: "gpt-4"); + var completion = CompletionProvider.GetRealTimeCompletion(_services, provider: "openai", modelId: "gpt-4o"); var agentService = _services.GetRequiredService(); var agent = await agentService.LoadAgent(agentId); diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs index d79ca6c2c..1bc9c331f 100644 --- a/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs +++ b/src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs @@ -16,7 +16,7 @@ public void RegisterDI(IServiceCollection services, IConfiguration config) return settingService.Bind("AudioHandler"); }); - services.AddScoped(); + services.AddScoped(); services.AddScoped(); } } diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/HandleAudioRequestFn.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/HandleAudioRequestFn.cs index 9c0ab4f48..f20a675a8 100644 --- a/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/HandleAudioRequestFn.cs +++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Functions/HandleAudioRequestFn.cs @@ -91,7 +91,7 @@ private async Task GetResponeFromDialogs(List dialogs) using var stream = new MemoryStream(bytes); stream.Position = 0; - var result = await audioCompletion.GenerateTextFromAudioAsync(stream, fileName); + var result = await audioCompletion.TranscriptTextAsync(stream, fileName); transcripts.Add(result); stream.Close(); } @@ -104,9 +104,9 @@ private async Task GetResponeFromDialogs(List dialogs) return string.Join("\r\n\r\n", transcripts); } - private IAudioCompletion PrepareModel() + private IAudioTranscription PrepareModel() { - return CompletionProvider.GetAudioCompletion(_serviceProvider, provider: "openai", model: "whisper-1"); + return CompletionProvider.GetAudioTranscriber(_serviceProvider); } private bool ParseAudioFileType(string fileName) diff --git a/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs b/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs index b745e2c16..aa35d17ba 100644 --- a/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs +++ b/src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs @@ -6,7 +6,7 @@ namespace BotSharp.Plugin.AudioHandler.Provider; /// /// Native Whisper provider for speech to text conversion /// -public class NativeWhisperProvider : IAudioCompletion +public class NativeWhisperProvider : IAudioTranscription { private static WhisperProcessor _whisperProcessor; @@ -29,7 +29,7 @@ public NativeWhisperProvider( _logger = logger; } - public async Task GenerateTextFromAudioAsync(Stream audio, string audioFileName, string? text = null) + public async Task TranscriptTextAsync(Stream audio, string audioFileName, string? text = null) { var textResult = new List(); diff --git a/src/Plugins/BotSharp.Plugin.AzureOpenAI/AzureOpenAiPlugin.cs b/src/Plugins/BotSharp.Plugin.AzureOpenAI/AzureOpenAiPlugin.cs index eba22bfa4..8a2c1c53a 100644 --- a/src/Plugins/BotSharp.Plugin.AzureOpenAI/AzureOpenAiPlugin.cs +++ b/src/Plugins/BotSharp.Plugin.AzureOpenAI/AzureOpenAiPlugin.cs @@ -31,6 +31,6 @@ public void RegisterDI(IServiceCollection services, IConfiguration config) services.AddScoped(); services.AddScoped(); services.AddScoped(); - services.AddScoped(); + services.AddScoped(); } } \ No newline at end of file diff --git a/src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.SpeechToText.cs b/src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.SpeechToText.cs index 36f0c3ee7..082daacf1 100644 --- a/src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.SpeechToText.cs +++ b/src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.SpeechToText.cs @@ -4,7 +4,7 @@ namespace BotSharp.Plugin.AzureOpenAI.Providers.Audio; public partial class AudioCompletionProvider { - public async Task GenerateTextFromAudioAsync(Stream audio, string audioFileName, string? text = null) + public async Task TranscriptTextAsync(Stream audio, string audioFileName, string? text = null) { var audioClient = ProviderHelper.GetClient(Provider, _model, _services) .GetAudioClient(_model); diff --git a/src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.cs b/src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.cs index 2948703e7..595e3f9a3 100644 --- a/src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.cs +++ b/src/Plugins/BotSharp.Plugin.AzureOpenAI/Providers/Audio/AudioCompletionProvider.cs @@ -1,6 +1,6 @@ namespace BotSharp.Plugin.AzureOpenAI.Providers.Audio; -public partial class AudioCompletionProvider : IAudioCompletion +public partial class AudioCompletionProvider : IAudioTranscription { private readonly IServiceProvider _services; diff --git a/src/Plugins/BotSharp.Plugin.EmailHandler/Functions/HandleEmailReaderFn.cs b/src/Plugins/BotSharp.Plugin.EmailHandler/Functions/HandleEmailReaderFn.cs index 7666e5258..fcd2be3a4 100644 --- a/src/Plugins/BotSharp.Plugin.EmailHandler/Functions/HandleEmailReaderFn.cs +++ b/src/Plugins/BotSharp.Plugin.EmailHandler/Functions/HandleEmailReaderFn.cs @@ -67,7 +67,7 @@ public async Task Execute(RoleDialogModel message) var llmProviderService = _services.GetRequiredService(); var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai"); - var model = llmProviderService.GetProviderModel(provider: provider ?? "openai", id: "gpt-4"); + var model = llmProviderService.GetProviderModel(provider: provider ?? "openai", id: "gpt-4o"); var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name); var convService = _services.GetRequiredService(); var conversationId = convService.ConversationId; diff --git a/src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadImageFn.cs b/src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadImageFn.cs index 710c2c800..ff19fbf11 100644 --- a/src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadImageFn.cs +++ b/src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadImageFn.cs @@ -100,7 +100,7 @@ private async Task GetChatCompletion(Agent agent, List { var llmProviderService = _services.GetRequiredService(); var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai"); - var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4", multiModal: true); + var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4o", multiModal: true); var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name); var response = await completion.GetChatCompletions(agent, dialogs); return response.Content; diff --git a/src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadPdfFn.cs b/src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadPdfFn.cs index 2aeb64a12..5a2d2d9bf 100644 --- a/src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadPdfFn.cs +++ b/src/Plugins/BotSharp.Plugin.FileHandler/Functions/ReadPdfFn.cs @@ -78,7 +78,7 @@ private async Task GetChatCompletion(Agent agent, List { var llmProviderService = _services.GetRequiredService(); var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai"); - var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4", multiModal: true); + var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4o", multiModal: true); var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name); var response = await completion.GetChatCompletions(agent, dialogs); return response.Content; diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs index 04fc0df92..cdd9c3089 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Models/Realtime/RealtimeSessionBody.cs @@ -61,26 +61,30 @@ public class RealtimeSessionTurnDetection /// /// Milliseconds /// - [JsonPropertyName("prefix_padding_ms")] + /*[JsonPropertyName("prefix_padding_ms")] public int PrefixPadding { get; set; } = 300; [JsonPropertyName("silence_duration_ms")] public int SilenceDuration { get; set; } = 500; [JsonPropertyName("threshold")] - public float Threshold { get; set; } = 0.5f; + public float Threshold { get; set; } = 0.5f;*/ [JsonPropertyName("type")] - public string Type { get; set; } = "server_vad"; + public string Type { get; set; } = "semantic_vad"; + + [JsonPropertyName("eagerness")] + public string eagerness { get;set; } = "auto"; } public class InputAudioTranscription { [JsonPropertyName("model")] - public string Model { get; set; } = "whisper-1"; + public string Model { get; set; } = "gpt-4o-transcribe"; [JsonPropertyName("language")] - public string Language { get; set; } = "en"; + [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] + public string? Language { get; set; } [JsonPropertyName("prompt")] [JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)] diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/OpenAiPlugin.cs b/src/Plugins/BotSharp.Plugin.OpenAI/OpenAiPlugin.cs index c1adbbe7e..fe3dae99d 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/OpenAiPlugin.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/OpenAiPlugin.cs @@ -33,7 +33,8 @@ public void RegisterDI(IServiceCollection services, IConfiguration config) services.AddScoped(); services.AddScoped(); services.AddScoped(); - services.AddScoped(); + services.AddScoped(); + services.AddScoped(); services.AddScoped(); services.AddRefitClient() diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.cs deleted file mode 100644 index 97a4a9248..000000000 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.cs +++ /dev/null @@ -1,21 +0,0 @@ -namespace BotSharp.Plugin.OpenAI.Providers.Audio; - -public partial class AudioCompletionProvider : IAudioCompletion -{ - private readonly IServiceProvider _services; - - public string Provider => "openai"; - public string Model => _model; - - private string _model; - - public AudioCompletionProvider(IServiceProvider service) - { - _services = service; - } - - public void SetModelName(string model) - { - _model = model; - } -} diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.TextToSpeech.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioSynthesisProvider.cs similarity index 84% rename from src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.TextToSpeech.cs rename to src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioSynthesisProvider.cs index fa5e2ec52..feb846f50 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.TextToSpeech.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioSynthesisProvider.cs @@ -2,9 +2,25 @@ namespace BotSharp.Plugin.OpenAI.Providers.Audio; -public partial class AudioCompletionProvider +public class AudioSynthesisProvider : IAudioSynthesis { - public async Task GenerateAudioFromTextAsync(string text, string? voice = "alloy", string? format = "mp3") + private readonly IServiceProvider _services; + public string Provider => "openai"; + public string Model => _model; + + private string _model; + + public AudioSynthesisProvider(IServiceProvider service) + { + _services = service; + } + + public void SetModelName(string model) + { + _model = model; + } + + public async Task GenerateAudioAsync(string text, string? voice = "alloy", string? format = "mp3", string? instructions = null) { var audioClient = ProviderHelper.GetClient(Provider, _model, _services) .GetAudioClient(_model); diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.SpeechToText.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioTranscriptionProvider.cs similarity index 83% rename from src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.SpeechToText.cs rename to src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioTranscriptionProvider.cs index f6213eb71..079df13f7 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioCompletionProvider.SpeechToText.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/AudioTranscriptionProvider.cs @@ -2,9 +2,26 @@ namespace BotSharp.Plugin.OpenAI.Providers.Audio; -public partial class AudioCompletionProvider +public class AudioTranscriptionProvider : IAudioTranscription { - public async Task GenerateTextFromAudioAsync(Stream audio, string audioFileName, string? text = null) + private readonly IServiceProvider _services; + + public string Provider => "openai"; + public string Model => _model; + + private string _model; + + public AudioTranscriptionProvider(IServiceProvider service) + { + _services = service; + } + + public void SetModelName(string model) + { + _model = model; + } + + public async Task TranscriptTextAsync(Stream audio, string audioFileName, string? text = null) { var audioClient = ProviderHelper.GetClient(Provider, _model, _services) .GetAudioClient(_model); diff --git a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs index 39d744439..fe5b32fea 100644 --- a/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs +++ b/src/Plugins/BotSharp.Plugin.OpenAI/Providers/Realtime/RealTimeCompletionProvider.cs @@ -351,10 +351,10 @@ public async Task UpdateSession(RealtimeHubConnection conn, bool interru MaxResponseOutputTokens = realtimeModelSettings.MaxResponseOutputTokens, TurnDetection = new RealtimeSessionTurnDetection { - InterruptResponse = interruptResponse, + InterruptResponse = interruptResponse/*, Threshold = realtimeModelSettings.TurnDetection.Threshold, PrefixPadding = realtimeModelSettings.TurnDetection.PrefixPadding, - SilenceDuration = realtimeModelSettings.TurnDetection.SilenceDuration + SilenceDuration = realtimeModelSettings.TurnDetection.SilenceDuration*/ }, InputAudioNoiseReduction = new InputAudioNoiseReduction { diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioOutboundController.cs b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioOutboundController.cs new file mode 100644 index 000000000..92d3abe43 --- /dev/null +++ b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioOutboundController.cs @@ -0,0 +1,65 @@ +using BotSharp.Core.Infrastructures; +using BotSharp.Plugin.Twilio.Interfaces; +using BotSharp.Plugin.Twilio.Models; +using BotSharp.Plugin.Twilio.Services; +using Microsoft.AspNetCore.Http; +using Microsoft.AspNetCore.Mvc; + +namespace BotSharp.Plugin.Twilio.Controllers; + +public class TwilioOutboundController : TwilioController +{ + private readonly TwilioSetting _settings; + private readonly IServiceProvider _services; + private readonly IHttpContextAccessor _context; + private readonly ILogger _logger; + + public TwilioOutboundController(TwilioSetting settings, IServiceProvider services, IHttpContextAccessor context, ILogger logger) + { + _settings = settings; + _services = services; + _context = context; + _logger = logger; + } + + [ValidateRequest] + [HttpPost("twilio/voice/init-outbound-call")] + public async Task InitiateOutboundCall(ConversationalVoiceRequest request) + { + var twilio = _services.GetRequiredService(); + + VoiceResponse response = default!; + if (request.AnsweredBy == "machine_start" && + request.Direction == "outbound-api") + { + response = new VoiceResponse(); + + await HookEmitter.Emit(_services, async hook => + { + await hook.OnVoicemailStarting(request); + }); + + var url = twilio.GetSpeechPath(request.ConversationId, "voicemail.mp3"); + response.Play(new Uri(url)); + } + else + { + var instruction = new ConversationalVoiceResponse + { + AgentId = request.AgentId, + ConversationId = request.ConversationId, + ActionOnEmptyResult = true, + CallbackPath = $"twilio/voice/receive/1?agent-id={request.AgentId}&conversation-id={request.ConversationId}", + }; + + if (request.InitAudioFile != null) + { + instruction.SpeechPaths.Add(request.InitAudioFile); + } + + response = twilio.ReturnNoninterruptedInstructions(instruction); + } + + return TwiML(response); + } +} diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs index 81d9c0be2..2a1c7cd02 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Controllers/TwilioVoiceController.cs @@ -1,5 +1,4 @@ using BotSharp.Abstraction.Files; -using BotSharp.Abstraction.Infrastructures; using BotSharp.Core.Infrastructures; using BotSharp.Plugin.Twilio.Interfaces; using BotSharp.Plugin.Twilio.Models; @@ -69,7 +68,7 @@ await HookEmitter.Emit(_services, async hook => var twilio = _services.GetRequiredService(); if (string.IsNullOrWhiteSpace(request.Intent)) { - instruction.CallbackPath = $"twilio/voice/receive/0?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}"; + instruction.CallbackPath = $"twilio/voice/receive/0?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{twilio.GenerateStatesParameter(request.States)}"; response = twilio.ReturnNoninterruptedInstructions(instruction); } else @@ -91,7 +90,7 @@ await HookEmitter.Emit(_services, async hook => response = new VoiceResponse(); // delay 3 seconds to wait for the first message reply and caller is listening dudu sound await Task.Delay(1000 * 3); - response.Redirect(new Uri($"{_settings.CallbackHost}/twilio/voice/reply/{seqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}"), HttpMethod.Post); + response.Redirect(new Uri($"{_settings.CallbackHost}/twilio/voice/reply/{seqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{twilio.GenerateStatesParameter(request.States)}"), HttpMethod.Post); } await HookEmitter.Emit(_services, async hook => @@ -145,14 +144,11 @@ public async Task ReceiveCallerMessage(ConversationalVoiceRequest r await messageQueue.EnqueueAsync(callerMessage); response = new VoiceResponse(); - response.Redirect(new Uri($"{_settings.CallbackHost}/twilio/voice/reply/{request.SeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}&AIResponseWaitTime=0"), HttpMethod.Post); + response.Redirect(new Uri($"{_settings.CallbackHost}/twilio/voice/reply/{request.SeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{twilio.GenerateStatesParameter(request.States)}&AIResponseWaitTime=0"), HttpMethod.Post); await HookEmitter.Emit(_services, async hook => { await hook.OnReceivedUserMessage(request); - }, new HookEmitOption - { - OnlyOnce = true }); } else @@ -163,9 +159,6 @@ await HookEmitter.Emit(_services, async hook => await HookEmitter.Emit(_services, async hook => { await hook.OnAgentHangUp(request); - }, new HookEmitOption - { - OnlyOnce = true }); response = twilio.HangUp(string.Empty); @@ -178,7 +171,7 @@ await HookEmitter.Emit(_services, async hook => AgentId = request.AgentId, ConversationId = request.ConversationId, SpeechPaths = new List(), - CallbackPath = $"twilio/voice/receive/{request.SeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}&attempts={++request.Attempts}", + CallbackPath = $"twilio/voice/receive/{request.SeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{twilio.GenerateStatesParameter(request.States)}&attempts={++request.Attempts}", ActionOnEmptyResult = true }; @@ -190,9 +183,6 @@ await HookEmitter.Emit(_services, async hook => await HookEmitter.Emit(_services, async hook => { await hook.OnWaitingUserResponse(request, instruction); - }, new HookEmitOption - { - OnlyOnce = true }); response = twilio.ReturnInstructions(instruction); @@ -215,9 +205,10 @@ public async Task ReplyCallerMessage(ConversationalVoiceRequest req var sessionManager = _services.GetRequiredService(); var twilio = _services.GetRequiredService(); var fileStorage = _services.GetRequiredService(); - if (request.SpeechResult != null) + var text = (request.SpeechResult + "\r\n" + request.Digits).Trim(); + if (!string.IsNullOrEmpty(text)) { - await sessionManager.StageCallerMessageAsync(request.ConversationId, nextSeqNum, request.SpeechResult); + await sessionManager.StageCallerMessageAsync(request.ConversationId, nextSeqNum, text); } var reply = await sessionManager.GetAssistantReplyAsync(request.ConversationId, request.SeqNum); @@ -230,16 +221,13 @@ await HookEmitter.Emit(_services, async hook => { request.AIResponseErrorMessage = $"AI response timeout: AIResponseWaitTime greater than {request.AIResponseWaitTime}, please check internal error log!"; await hook.OnAgentHangUp(request); - }, new HookEmitOption - { - OnlyOnce = true }); response = twilio.HangUp($"twilio/error.mp3"); } else if (reply == null) { - response = await WaitingForAiResponse(request); + response = await twilio.WaitingForAiResponse(request); } else { @@ -248,9 +236,6 @@ await HookEmitter.Emit(_services, async hook => await HookEmitter.Emit(_services, async hook => { await hook.OnAgentTransferring(request, _settings); - }, new HookEmitOption - { - OnlyOnce = true }); response = twilio.DialCsrAgent($"twilio/voice/speeches/{request.ConversationId}/{reply.SpeechFileName}"); @@ -262,9 +247,6 @@ await HookEmitter.Emit(_services, async hook => await HookEmitter.Emit(_services, async hook => { await hook.OnAgentHangUp(request); - }, new HookEmitOption - { - OnlyOnce = true }); } else @@ -274,7 +256,7 @@ await HookEmitter.Emit(_services, async hook => AgentId = request.AgentId, ConversationId = request.ConversationId, SpeechPaths = [$"twilio/voice/speeches/{request.ConversationId}/{reply.SpeechFileName}"], - CallbackPath = $"twilio/voice/receive/{nextSeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}", + CallbackPath = $"twilio/voice/receive/{nextSeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{twilio.GenerateStatesParameter(request.States)}", ActionOnEmptyResult = true, Hints = reply.Hints }; @@ -291,118 +273,6 @@ await HookEmitter.Emit(_services, async hook => return TwiML(response); } - private async Task WaitingForAiResponse(ConversationalVoiceRequest request) - { - VoiceResponse response; - var sessionManager = _services.GetRequiredService(); - var fileStorage = _services.GetRequiredService(); - var twilio = _services.GetRequiredService(); - - var indication = await sessionManager.GetReplyIndicationAsync(request.ConversationId, request.SeqNum); - if (indication != null) - { - _logger.LogWarning($"Indication ({request.SeqNum}): {indication}"); - var speechPaths = new List(); - foreach (var text in indication.Split('|')) - { - var seg = text.Trim(); - if (seg.StartsWith('#')) - { - speechPaths.Add($"twilio/{seg.Substring(1)}.mp3"); - } - else - { - var hash = Utilities.HashTextMd5(seg); - var fileName = $"indication_{hash}.mp3"; - - var existing = fileStorage.GetSpeechFile(request.ConversationId, fileName); - if (existing == BinaryData.Empty) - { - var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); - var data = await completion.GenerateAudioFromTextAsync(seg); - fileStorage.SaveSpeechFile(request.ConversationId, fileName, data); - } - - speechPaths.Add($"twilio/voice/speeches/{request.ConversationId}/{fileName}"); - } - } - - var instruction = new ConversationalVoiceResponse - { - AgentId = request.AgentId, - ConversationId = request.ConversationId, - SpeechPaths = speechPaths, - CallbackPath = $"twilio/voice/reply/{request.SeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}&AIResponseWaitTime={++request.AIResponseWaitTime}", - ActionOnEmptyResult = true - }; - - response = twilio.ReturnInstructions(instruction); - - await sessionManager.RemoveReplyIndicationAsync(request.ConversationId, request.SeqNum); - } - else - { - var instruction = new ConversationalVoiceResponse - { - AgentId = request.AgentId, - ConversationId = request.ConversationId, - SpeechPaths = [], - CallbackPath = $"twilio/voice/reply/{request.SeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}&AIResponseWaitTime={++request.AIResponseWaitTime}", - ActionOnEmptyResult = true - }; - - await HookEmitter.Emit(_services, async hook => - { - await hook.OnWaitingAgentResponse(request, instruction); - }); - - response = twilio.ReturnInstructions(instruction); - } - - return response; - } - - [ValidateRequest] - [HttpPost("twilio/voice/init-outbound-call")] - public async Task InitiateOutboundCall(ConversationalVoiceRequest request) - { - var twilio = _services.GetRequiredService(); - - VoiceResponse response = default!; - if (request.AnsweredBy == "machine_start" && - request.Direction == "outbound-api") - { - response = new VoiceResponse(); - - await HookEmitter.Emit(_services, async hook => - { - await hook.OnVoicemailStarting(request); - }); - - var url = twilio.GetSpeechPath(request.ConversationId, "voicemail.mp3"); - response.Play(new Uri(url)); - } - else - { - var instruction = new ConversationalVoiceResponse - { - AgentId = request.AgentId, - ConversationId = request.ConversationId, - ActionOnEmptyResult = true, - CallbackPath = $"twilio/voice/receive/1?agent-id={request.AgentId}&conversation-id={request.ConversationId}", - }; - - if (request.InitAudioFile != null) - { - instruction.SpeechPaths.Add(request.InitAudioFile); - } - - response = twilio.ReturnNoninterruptedInstructions(instruction); - } - - return TwiML(response); - } - [ValidateRequest] [HttpGet("twilio/voice/speeches/{conversationId}/{fileName}")] public async Task GetSpeechFile([FromRoute] string conversationId, [FromRoute] string fileName) @@ -499,13 +369,4 @@ private Dictionary ParseStates(List states) } return result; } - - private string GenerateStatesParameter(List states) - { - if (states is null || states.Count == 0) - { - return null; - } - return string.Join("&", states.Select(x => $"states={x}")); - } } diff --git a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HangupPhoneCallFn.cs b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HangupPhoneCallFn.cs index 16350983d..e2b78fd45 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HangupPhoneCallFn.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/HangupPhoneCallFn.cs @@ -48,8 +48,8 @@ public async Task Execute(RoleDialogModel message) string initAudioFile = null; if (!string.IsNullOrEmpty(args.ResponseContent)) { - var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); - var data = await completion.GenerateAudioFromTextAsync(args.ResponseContent); + var completion = CompletionProvider.GetAudioSynthesizer(_services); + var data = await completion.GenerateAudioAsync(args.ResponseContent); initAudioFile = "ending.mp3"; fileStorage.SaveSpeechFile(conversationId, initAudioFile, data); diff --git a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/LeaveVoicemailFn.cs b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/LeaveVoicemailFn.cs index 50b098538..96aa55809 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/LeaveVoicemailFn.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/LeaveVoicemailFn.cs @@ -45,8 +45,8 @@ public async Task Execute(RoleDialogModel message) string initAudioFile = null; if (!string.IsNullOrEmpty(args.VoicemailMessage)) { - var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); - var data = await completion.GenerateAudioFromTextAsync(args.VoicemailMessage); + var completion = CompletionProvider.GetAudioSynthesizer(_services); + var data = await completion.GenerateAudioAsync(args.VoicemailMessage); initAudioFile = "voicemail.mp3"; fileStorage.SaveSpeechFile(conversationId, initAudioFile, data); } diff --git a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/OutboundPhoneCallFn.cs b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/OutboundPhoneCallFn.cs index c43741c37..905a22537 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/OutboundPhoneCallFn.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/OutboundPhoneCallFn.cs @@ -69,8 +69,8 @@ public async Task Execute(RoleDialogModel message) string initAudioFile = null; if (!string.IsNullOrEmpty(args.InitialMessage)) { - var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); - var data = await completion.GenerateAudioFromTextAsync(args.InitialMessage); + var completion = CompletionProvider.GetAudioSynthesizer(_services); + var data = await completion.GenerateAudioAsync(args.InitialMessage); initAudioFile = "intial.mp3"; fileStorage.SaveSpeechFile(newConversationId, initAudioFile, data); diff --git a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/TransferPhoneCallFn.cs b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/TransferPhoneCallFn.cs index 60c2cdcda..a5c644234 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/TransferPhoneCallFn.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/OutboundPhoneCallHandler/Functions/TransferPhoneCallFn.cs @@ -50,8 +50,8 @@ public async Task Execute(RoleDialogModel message) // Generate initial assistant audio if (!string.IsNullOrEmpty(args.TransitionMessage)) { - var completion = CompletionProvider.GetAudioCompletion(_services, "openai", "tts-1"); - var data = await completion.GenerateAudioFromTextAsync(args.TransitionMessage); + var completion = CompletionProvider.GetAudioSynthesizer(_services); + var data = await completion.GenerateAudioAsync(args.TransitionMessage); var initAudioFile = "transfer.mp3"; fileStorage.SaveSpeechFile(conversationId, initAudioFile, data); diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioMessageQueueService.cs b/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioMessageQueueService.cs index 6b6b209ae..35bc3f127 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioMessageQueueService.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioMessageQueueService.cs @@ -137,9 +137,9 @@ private static void InitConversation(CallerMessage message, RoleDialogModel inpu private static async Task GetReplySpeechFileName(string conversationId, AssistantMessage reply, IServiceProvider sp) { - var completion = CompletionProvider.GetAudioCompletion(sp, "openai", "tts-1"); + var completion = CompletionProvider.GetAudioSynthesizer(sp); var fileStorage = sp.GetRequiredService(); - var data = await completion.GenerateAudioFromTextAsync(reply.Content); + var data = await completion.GenerateAudioAsync(reply.Content); var fileName = $"reply_{reply.MessageId}.mp3"; fileStorage.SaveSpeechFile(conversationId, fileName, data); return fileName; diff --git a/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs b/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs index 45fbfcf24..da209f317 100644 --- a/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs +++ b/src/Plugins/BotSharp.Plugin.Twilio/Services/TwilioService.cs @@ -1,4 +1,7 @@ +using BotSharp.Abstraction.Files; using BotSharp.Abstraction.Utilities; +using BotSharp.Core.Infrastructures; +using BotSharp.Plugin.Twilio.Interfaces; using BotSharp.Plugin.Twilio.Models; using Twilio.Jwt.AccessToken; using Token = Twilio.Jwt.AccessToken.Token; @@ -12,11 +15,13 @@ public class TwilioService { private readonly TwilioSetting _settings; private readonly IServiceProvider _services; + public readonly ILogger _logger; - public TwilioService(TwilioSetting settings, IServiceProvider services) + public TwilioService(TwilioSetting settings, IServiceProvider services, ILogger logger) { _settings = settings; _services = services; + _logger = logger; } public string GetAccessToken() @@ -216,6 +221,76 @@ public VoiceResponse ReturnBidirectionalMediaStreamsInstructions(ConversationalV return response; } + public async Task WaitingForAiResponse(ConversationalVoiceRequest request) + { + VoiceResponse response; + var sessionManager = _services.GetRequiredService(); + var fileStorage = _services.GetRequiredService(); + + var indication = await sessionManager.GetReplyIndicationAsync(request.ConversationId, request.SeqNum); + if (indication != null) + { + _logger.LogWarning($"Indication ({request.SeqNum}): {indication}"); + var speechPaths = new List(); + foreach (var text in indication.Split('|')) + { + var seg = text.Trim(); + if (seg.StartsWith('#')) + { + speechPaths.Add($"twilio/{seg.Substring(1)}.mp3"); + } + else + { + var hash = Utilities.HashTextMd5(seg); + var fileName = $"indication_{hash}.mp3"; + + var existing = fileStorage.GetSpeechFile(request.ConversationId, fileName); + if (existing == BinaryData.Empty) + { + var completion = CompletionProvider.GetAudioSynthesizer(_services); + var data = await completion.GenerateAudioAsync(seg); + fileStorage.SaveSpeechFile(request.ConversationId, fileName, data); + } + + speechPaths.Add($"twilio/voice/speeches/{request.ConversationId}/{fileName}"); + } + } + + var instruction = new ConversationalVoiceResponse + { + AgentId = request.AgentId, + ConversationId = request.ConversationId, + SpeechPaths = speechPaths, + CallbackPath = $"twilio/voice/reply/{request.SeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}&AIResponseWaitTime={++request.AIResponseWaitTime}", + ActionOnEmptyResult = true + }; + + response = ReturnInstructions(instruction); + + await sessionManager.RemoveReplyIndicationAsync(request.ConversationId, request.SeqNum); + } + else + { + var instruction = new ConversationalVoiceResponse + { + AgentId = request.AgentId, + ConversationId = request.ConversationId, + SpeechPaths = [], + CallbackPath = $"twilio/voice/reply/{request.SeqNum}?agent-id={request.AgentId}&conversation-id={request.ConversationId}&{GenerateStatesParameter(request.States)}&AIResponseWaitTime={++request.AIResponseWaitTime}", + ActionOnEmptyResult = true + }; + + await HookEmitter.Emit(_services, async hook => + { + await hook.OnWaitingAgentResponse(request, instruction); + }); + + response = ReturnInstructions(instruction); + } + + return response; + } + public string GetSpeechPath(string conversationId, string speechPath) { if (speechPath.StartsWith("twilio/")) @@ -231,4 +306,13 @@ public string GetSpeechPath(string conversationId, string speechPath) return $"{_settings.CallbackHost}/twilio/voice/speeches/{conversationId}/{speechPath}"; } } + + public string GenerateStatesParameter(List states) + { + if (states is null || states.Count == 0) + { + return null; + } + return string.Join("&", states.Select(x => $"states={x}")); + } }