This file was deleted (file name not shown).

15 changes: 15 additions & 0 deletions src/Infrastructure/BotSharp.Abstraction/MLTasks/IAudioSynthesis.cs
@@ -0,0 +1,15 @@
namespace BotSharp.Abstraction.MLTasks;

/// <summary>
/// Text to speech synthesis
/// </summary>
public interface IAudioSynthesis
{
string Provider { get; }

string Model { get; }

void SetModelName(string model);

Task<BinaryData> GenerateAudioAsync(string text, string? voice = "alloy", string? format = "mp3", string? instructions = null);
}
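
For orientation, a minimal caller sketch for the new synthesis interface. This is not part of the diff; the service-resolution pattern, model name, and output path are assumptions based on the OpenAI registration later in this PR.

```csharp
using System;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using BotSharp.Abstraction.MLTasks;
using Microsoft.Extensions.DependencyInjection;

public static class AudioSynthesisSample
{
    // Sketch only: assumes an OpenAI IAudioSynthesis implementation is registered in the container.
    public static async Task SynthesizeAsync(IServiceProvider services)
    {
        var synthesis = services.GetServices<IAudioSynthesis>()
            .First(x => x.Provider == "openai");

        synthesis.SetModelName("gpt-4o-mini-tts");

        // Defaults from the interface signature: voice "alloy", mp3 output.
        BinaryData audio = await synthesis.GenerateAudioAsync("Hello from BotSharp!");
        await File.WriteAllBytesAsync("hello.mp3", audio.ToArray());
    }
}
```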
@@ -0,0 +1,17 @@
using System.IO;

namespace BotSharp.Abstraction.MLTasks;

/// <summary>
/// Audio transcription service
/// </summary>
public interface IAudioTranscription
{
string Provider { get; }

string Model { get; }

Task<string> TranscriptTextAsync(Stream audio, string audioFileName, string? text = null);

void SetModelName(string model);
}
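
A matching transcription sketch under the same assumptions; the audio file name is illustrative, and the optional text parameter (a prompt/hint in the providers below) is omitted here.

```csharp
using System;
using System.IO;
using System.Linq;
using System.Threading.Tasks;
using BotSharp.Abstraction.MLTasks;
using Microsoft.Extensions.DependencyInjection;

public static class AudioTranscriptionSample
{
    // Sketch only: assumes an IAudioTranscription implementation (e.g. the OpenAI provider) is registered.
    public static async Task<string> TranscribeAsync(IServiceProvider services)
    {
        var transcriber = services.GetServices<IAudioTranscription>()
            .First(x => x.Provider == "openai");

        transcriber.SetModelName("gpt-4o-mini-transcribe");

        // "meeting.wav" is a placeholder input file.
        using var audio = File.OpenRead("meeting.wav");
        return await transcriber.TranscriptTextAsync(audio, "meeting.wav");
    }
}
```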
@@ -3,14 +3,14 @@ namespace BotSharp.Abstraction.MLTasks.Settings;
public class LlmModelSetting
{
/// <summary>
/// Model Id, like "gpt-3.5" and "gpt-4".
/// Model Id, like "gpt-4", "gpt-4o", "o1".
/// </summary>
- public string? Id { get; set; }
+ public string Id { get; set; } = null!;

/// <summary>
/// Deployment model name
/// </summary>
- public string Name { get; set; }
+ public string Name { get; set; } = null!;

/// <summary>
/// Model version
@@ -28,8 +28,8 @@ public class LlmModelSetting
/// </summary>
public string? Group { get; set; }

- public string ApiKey { get; set; }
- public string Endpoint { get; set; }
+ public string ApiKey { get; set; } = null!;
+ public string? Endpoint { get; set; }
public LlmModelType Type { get; set; } = LlmModelType.Chat;

/// <summary>
@@ -11,6 +11,6 @@ public class ModelTurnDetection

public class AudioTranscription
{
public string Model { get; set; } = "whisper-1";
public string Language { get; set; } = "en";
public string Model { get; set; } = "gpt-4o-mini-transcribe";
public string? Language { get; set; }
}
@@ -74,7 +74,7 @@ private async Task ConnectToModel(WebSocket userWebSocket)
if (!model.Contains("-realtime-"))
{
var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
model = llmProviderService.GetProviderModel("openai", "gpt-4", realTime: true).Name;
model = llmProviderService.GetProviderModel("openai", "gpt-4o", realTime: true).Name;
}

_completer.SetModelName(model);
@@ -6,14 +6,14 @@ public partial class FileInstructService
{
public async Task<string> SpeechToText(string? provider, string? model, InstructFileModel audio, string? text = null)
{
- var completion = CompletionProvider.GetAudioCompletion(_services, provider: provider ?? "openai", model: model ?? "whisper-1");
+ var completion = CompletionProvider.GetAudioTranscriber(_services, provider: provider, model: model);
var audioBytes = await DownloadFile(audio);
using var stream = new MemoryStream();
stream.Write(audioBytes, 0, audioBytes.Length);
stream.Position = 0;

var fileName = $"{audio.FileName ?? "audio"}.{audio.FileExtension ?? "wav"}";
- var content = await completion.GenerateTextFromAudioAsync(stream, fileName, text);
+ var content = await completion.TranscriptTextAsync(stream, fileName, text);
stream.Close();
return content;
}
@@ -27,7 +27,7 @@ public async Task<string> ReadPdf(string? provider, string? model, string? model

var innerAgentId = agentId ?? Guid.Empty.ToString();
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider ?? "openai",
model: model, modelId: modelId ?? "gpt-4", multiModal: true);
model: model, modelId: modelId ?? "gpt-4o", multiModal: true);
var message = await completion.GetChatCompletions(new Agent()
{
Id = innerAgentId,
@@ -93,7 +93,7 @@ private async Task<IEnumerable<MessageFileModel>> SelectFiles(IEnumerable<Messag
}

var providerName = options.Provider ?? "openai";
var modelId = options?.ModelId ?? "gpt-4";
var modelId = options?.ModelId ?? "gpt-4o";
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == providerName);
var model = llmProviderService.GetProviderModel(provider: provider, id: modelId);
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);
@@ -30,7 +30,7 @@ public static object GetCompletion(IServiceProvider services,
}
else if (settings.Type == LlmModelType.Audio)
{
- return GetAudioCompletion(services, provider: provider, model: model);
+ return GetAudioTranscriber(services, provider: provider, model: model);
}
else
{
@@ -126,20 +126,39 @@ public static ITextEmbedding GetTextEmbedding(IServiceProvider services,
return completer;
}

- public static IAudioCompletion GetAudioCompletion(
+ public static IAudioTranscription GetAudioTranscriber(
IServiceProvider services,
- string provider,
- string model)
+ string? provider = null,
+ string? model = null)
{
- var completions = services.GetServices<IAudioCompletion>();
- var completer = completions.FirstOrDefault(x => x.Provider == provider);
+ var completions = services.GetServices<IAudioTranscription>();
+ var completer = completions.FirstOrDefault(x => x.Provider == (provider ?? "openai"));
if (completer == null)
{
var logger = services.GetRequiredService<ILogger<CompletionProvider>>();
logger.LogError($"Can't resolve audio-completion provider by {provider}");
logger.LogError($"Can't resolve audio-transcriber provider by {provider}");
return default!;
}

- completer.SetModelName(model);
+ completer.SetModelName(model ?? "gpt-4o-mini-transcribe");
return completer;
}

+ public static IAudioSynthesis GetAudioSynthesizer(
+ IServiceProvider services,
+ string? provider = null,
+ string? model = null)
+ {
+ var completions = services.GetServices<IAudioSynthesis>();
+ var completer = completions.FirstOrDefault(x => x.Provider == (provider ?? "openai"));
+ if (completer == null)
+ {
+ var logger = services.GetRequiredService<ILogger<CompletionProvider>>();
+ logger.LogError($"Can't resolve audio-synthesizer provider by {provider}");
+ return default!;
+ }
+
+ completer.SetModelName(model ?? "gpt-4o-mini-tts");
+ return completer;
+ }
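
A quick illustration of the intended call sites (hypothetical fragment, not part of the diff; `_services` stands for an injected IServiceProvider). With the parameters now optional, callers can omit provider and model and fall back to the defaults above.

```csharp
// Hypothetical fragment: _services is an injected IServiceProvider.
var transcriber = CompletionProvider.GetAudioTranscriber(_services);                 // resolves "openai" / "gpt-4o-mini-transcribe"
var synthesizer = CompletionProvider.GetAudioSynthesizer(_services, model: "tts-1"); // resolves "openai" with an explicit model override
```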

@@ -499,8 +499,8 @@ public async Task<SpeechToTextViewModel> SpeechToText(IFormFile file, [FromForm]
file.CopyTo(stream);
stream.Position = 0;

var completion = CompletionProvider.GetAudioCompletion(_services, provider: provider ?? "openai", model: model ?? "whisper-1");
var content = await completion.GenerateTextFromAudioAsync(stream, file.FileName, text);
var completion = CompletionProvider.GetAudioTranscriber(_services, provider: provider, model: model);
var content = await completion.TranscriptTextAsync(stream, file.FileName, text);
viewModel.Content = content;
stream.Close();
return viewModel;
@@ -520,8 +520,8 @@ public async Task<IActionResult> TextToSpeech([FromBody] TextToSpeechRequest inp
var state = _services.GetRequiredService<IConversationStateService>();
input.States.ForEach(x => state.SetState(x.Key, x.Value, activeRounds: x.ActiveRounds, source: StateSource.External));

- var completion = CompletionProvider.GetAudioCompletion(_services, provider: input.Provider ?? "openai", model: input.Model ?? "tts-1");
- var binaryData = await completion.GenerateAudioFromTextAsync(input.Text);
+ var completion = CompletionProvider.GetAudioSynthesizer(_services, provider: input.Provider, model: input.Model);
+ var binaryData = await completion.GenerateAudioAsync(input.Text);
var stream = binaryData.ToStream();
stream.Position = 0;

@@ -22,7 +22,7 @@ public RealtimeController(IServiceProvider services)
[HttpGet("/agent/{agentId}/realtime/session")]
public async Task<RealtimeSession> CreateSession(string agentId)
{
- var completion = CompletionProvider.GetRealTimeCompletion(_services, provider: "openai", modelId: "gpt-4");
+ var completion = CompletionProvider.GetRealTimeCompletion(_services, provider: "openai", modelId: "gpt-4o");

var agentService = _services.GetRequiredService<IAgentService>();
var agent = await agentService.LoadAgent(agentId);
@@ -16,7 +16,7 @@ public void RegisterDI(IServiceCollection services, IConfiguration config)
return settingService.Bind<AudioHandlerSettings>("AudioHandler");
});

- services.AddScoped<IAudioCompletion, NativeWhisperProvider>();
+ services.AddScoped<IAudioTranscription, NativeWhisperProvider>();
services.AddScoped<IAgentUtilityHook, AudioHandlerUtilityHook>();
}
}
@@ -91,7 +91,7 @@ private async Task<string> GetResponeFromDialogs(List<RoleDialogModel> dialogs)
using var stream = new MemoryStream(bytes);
stream.Position = 0;

- var result = await audioCompletion.GenerateTextFromAudioAsync(stream, fileName);
+ var result = await audioCompletion.TranscriptTextAsync(stream, fileName);
transcripts.Add(result);
stream.Close();
}
@@ -104,9 +104,9 @@ private async Task<string> GetResponeFromDialogs(List<RoleDialogModel> dialogs)
return string.Join("\r\n\r\n", transcripts);
}

- private IAudioCompletion PrepareModel()
+ private IAudioTranscription PrepareModel()
{
- return CompletionProvider.GetAudioCompletion(_serviceProvider, provider: "openai", model: "whisper-1");
+ return CompletionProvider.GetAudioTranscriber(_serviceProvider);
}

private bool ParseAudioFileType(string fileName)
@@ -6,7 +6,7 @@ namespace BotSharp.Plugin.AudioHandler.Provider;
/// <summary>
/// Native Whisper provider for speech to text conversion
/// </summary>
- public class NativeWhisperProvider : IAudioCompletion
+ public class NativeWhisperProvider : IAudioTranscription
{
private static WhisperProcessor _whisperProcessor;

@@ -29,7 +29,7 @@ public NativeWhisperProvider(
_logger = logger;
}

- public async Task<string> GenerateTextFromAudioAsync(Stream audio, string audioFileName, string? text = null)
+ public async Task<string> TranscriptTextAsync(Stream audio, string audioFileName, string? text = null)
{
var textResult = new List<SegmentData>();

@@ -31,6 +31,6 @@ public void RegisterDI(IServiceCollection services, IConfiguration config)
services.AddScoped<IChatCompletion, ChatCompletionProvider>();
services.AddScoped<ITextEmbedding, TextEmbeddingProvider>();
services.AddScoped<IImageCompletion, ImageCompletionProvider>();
- services.AddScoped<IAudioCompletion, AudioCompletionProvider>();
+ services.AddScoped<IAudioTranscription, AudioCompletionProvider>();
}
}
@@ -4,7 +4,7 @@ namespace BotSharp.Plugin.AzureOpenAI.Providers.Audio;

public partial class AudioCompletionProvider
{
- public async Task<string> GenerateTextFromAudioAsync(Stream audio, string audioFileName, string? text = null)
+ public async Task<string> TranscriptTextAsync(Stream audio, string audioFileName, string? text = null)
{
var audioClient = ProviderHelper.GetClient(Provider, _model, _services)
.GetAudioClient(_model);
@@ -1,6 +1,6 @@
namespace BotSharp.Plugin.AzureOpenAI.Providers.Audio;

- public partial class AudioCompletionProvider : IAudioCompletion
+ public partial class AudioCompletionProvider : IAudioTranscription
{
private readonly IServiceProvider _services;

@@ -67,7 +67,7 @@ public async Task<bool> Execute(RoleDialogModel message)

var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai");
var model = llmProviderService.GetProviderModel(provider: provider ?? "openai", id: "gpt-4");
var model = llmProviderService.GetProviderModel(provider: provider ?? "openai", id: "gpt-4o");
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);
var convService = _services.GetRequiredService<IConversationService>();
var conversationId = convService.ConversationId;
@@ -100,7 +100,7 @@ private async Task<string> GetChatCompletion(Agent agent, List<RoleDialogModel>
{
var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai");
var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4", multiModal: true);
var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4o", multiModal: true);
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);
var response = await completion.GetChatCompletions(agent, dialogs);
return response.Content;
@@ -78,7 +78,7 @@ private async Task<string> GetChatCompletion(Agent agent, List<RoleDialogModel>
{
var llmProviderService = _services.GetRequiredService<ILlmProviderService>();
var provider = llmProviderService.GetProviders().FirstOrDefault(x => x == "openai");
var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4", multiModal: true);
var model = llmProviderService.GetProviderModel(provider: provider, id: "gpt-4o", multiModal: true);
var completion = CompletionProvider.GetChatCompletion(_services, provider: provider, model: model.Name);
var response = await completion.GetChatCompletions(agent, dialogs);
return response.Content;
@@ -61,26 +61,30 @@ public class RealtimeSessionTurnDetection
/// <summary>
/// Milliseconds
/// </summary>
[JsonPropertyName("prefix_padding_ms")]
/*[JsonPropertyName("prefix_padding_ms")]
public int PrefixPadding { get; set; } = 300;

[JsonPropertyName("silence_duration_ms")]
public int SilenceDuration { get; set; } = 500;

[JsonPropertyName("threshold")]
- public float Threshold { get; set; } = 0.5f;
+ public float Threshold { get; set; } = 0.5f;*/

[JsonPropertyName("type")]
public string Type { get; set; } = "server_vad";
public string Type { get; set; } = "semantic_vad";

[JsonPropertyName("eagerness")]
public string eagerness { get;set; } = "auto";
}

public class InputAudioTranscription
{
[JsonPropertyName("model")]
public string Model { get; set; } = "whisper-1";
public string Model { get; set; } = "gpt-4o-transcribe";

[JsonPropertyName("language")]
public string Language { get; set; } = "en";
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
public string? Language { get; set; }

[JsonPropertyName("prompt")]
[JsonIgnore(Condition = JsonIgnoreCondition.WhenWritingNull)]
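
To make the semantic VAD switch concrete, a sketch of what the new turn-detection defaults serialize to (illustrative only; assumes System.Text.Json and that the properties shown above are the only ones emitted):

```csharp
using System;
using System.Text.Json;

// Illustrative only: with the defaults above, the turn-detection block now requests
// semantic VAD with automatic eagerness instead of server VAD with fixed thresholds.
var json = JsonSerializer.Serialize(new RealtimeSessionTurnDetection());
Console.WriteLine(json); // e.g. {"type":"semantic_vad","eagerness":"auto"}
```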
3 changes: 2 additions & 1 deletion src/Plugins/BotSharp.Plugin.OpenAI/OpenAiPlugin.cs
@@ -33,7 +33,8 @@ public void RegisterDI(IServiceCollection services, IConfiguration config)
services.AddScoped<IChatCompletion, ChatCompletionProvider>();
services.AddScoped<ITextEmbedding, TextEmbeddingProvider>();
services.AddScoped<IImageCompletion, ImageCompletionProvider>();
- services.AddScoped<IAudioCompletion, AudioCompletionProvider>();
+ services.AddScoped<IAudioTranscription, AudioTranscriptionProvider>();
+ services.AddScoped<IAudioSynthesis, AudioSynthesisProvider>();
services.AddScoped<IRealTimeCompletion, RealTimeCompletionProvider>();

services.AddRefitClient<IOpenAiRealtimeApi>()

This file was deleted (file name not shown).
