-
-
Notifications
You must be signed in to change notification settings - Fork 580
add audio service to transcribe local mp3/wav file #566
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Oceania2018
merged 4 commits into
SciSharp:master
from
evan-cao-wb:features/add-audio-handler
Jul 30, 2024
Merged
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
14 changes: 14 additions & 0 deletions
14
src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.IO; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
|
||
namespace BotSharp.Abstraction.MLTasks; | ||
|
||
/// <summary>
/// Contract for services that turn an audio recording into text.
/// </summary>
public interface ISpeechToText
{
    /// <summary>
    /// Produces a text transcript for the audio file at <paramref name="filePath"/>.
    /// </summary>
    /// <param name="filePath">Path of the audio file to transcribe.</param>
    /// <returns>A task whose result is the transcript text.</returns>
    Task<string> AudioToTextTranscript(string filePath);

    // A Stream-based overload has been sketched but is not part of the contract yet:
    // Task<string> AudioToTextTranscript(Stream stream);
}
29 changes: 29 additions & 0 deletions
29
src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
using BotSharp.Plugin.AudioHandler.Settings; | ||
using BotSharp.Plugin.AudioHandler.Provider; | ||
using BotSharp.Abstraction.Settings; | ||
|
||
namespace BotSharp.Plugin.AudioHandler
{
    /// <summary>
    /// Plugin entry point that wires the audio-handling services into the
    /// dependency-injection container.
    /// </summary>
    public class AudioHandlerPlugin : IBotSharpPlugin
    {
        public string Id => "9d22014c-4f45-466a-9e82-a74e67983df8";

        public string Name => "Audio Handler";

        public string Description => "Process audio input and transform it into text output.";

        /// <summary>
        /// Registers the plugin settings, the speech-to-text provider and the
        /// audio conversion utilities, all with scoped lifetime.
        /// </summary>
        /// <param name="services">Service collection to add registrations to.</param>
        /// <param name="config">Application configuration (not read directly here).</param>
        public void RegisterDI(IServiceCollection services, IConfiguration config)
        {
            // Settings are resolved through ISettingService from the "AudioHandler" section.
            services.AddScoped(sp =>
                sp.GetRequiredService<ISettingService>().Bind<AudioHandlerSettings>("AudioHandler"));

            services.AddScoped<ISpeechToText, NativeWhisperProvider>();
            services.AddScoped<IAudioProcessUtilities, AudioProcessUtilities>();
        }
    }
}
|
26 changes: 26 additions & 0 deletions
26
src/Plugins/BotSharp.Plugin.AudioHandler/BotSharp.Plugin.AudioHandler.csproj
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
<Project Sdk="Microsoft.NET.Sdk"> | ||
|
||
<PropertyGroup> | ||
<TargetFramework>$(TargetFramework)</TargetFramework> | ||
<ImplicitUsings>enable</ImplicitUsings> | ||
<Nullable>enable</Nullable> | ||
<LangVersion>$(LangVersion)</LangVersion> | ||
<VersionPrefix>$(BotSharpVersion)</VersionPrefix> | ||
<GeneratePackageOnBuild>$(GeneratePackageOnBuild)</GeneratePackageOnBuild> | ||
<GenerateDocumentationFile>$(GenerateDocumentationFile)</GenerateDocumentationFile> | ||
<OutputPath>$(SolutionDir)packages</OutputPath> | ||
</PropertyGroup> | ||
|
||
<ItemGroup> | ||
<PackageReference Include="Microsoft.AspNetCore.Mvc" Version="2.2.0" /> | ||
<PackageReference Include="NAudio" Version="2.2.1" /> | ||
<PackageReference Include="NAudio.Core" Version="2.2.1" /> | ||
<PackageReference Include="Whisper.net" Version="1.5.0" /> | ||
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" /> | ||
</ItemGroup> | ||
|
||
<ItemGroup> | ||
<ProjectReference Include="..\..\Infrastructure\BotSharp.Core\BotSharp.Core.csproj" /> | ||
</ItemGroup> | ||
|
||
</Project> |
44 changes: 44 additions & 0 deletions
44
src/Plugins/BotSharp.Plugin.AudioHandler/Controllers/AudioController.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,44 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Diagnostics; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
using BotSharp.Plugin.AudioHandler.Models; | ||
using BotSharp.Plugin.AudioHandler.Provider; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Controllers
{
    /// <summary>
    /// HTTP endpoint that transcribes an audio file through the registered
    /// <see cref="ISpeechToText"/> service. Anonymous access is allowed in DEBUG builds only.
    /// </summary>
#if DEBUG
    [AllowAnonymous]
#endif
    [ApiController]
    public class AudioController : ControllerBase
    {
        private readonly ISpeechToText _nativeWhisperProvider;

        public AudioController(ISpeechToText audioService)
        {
            _nativeWhisperProvider = audioService;
        }

        /// <summary>
        /// Transcribes the audio file identified by <paramref name="audioInputString"/>.
        /// </summary>
        /// <param name="audioInputString">Path of the audio file, forwarded unchanged to the speech-to-text service.</param>
        /// <returns>200 with the transcript, or 400 when no path was supplied.</returns>
        [HttpGet("audio/transcript")]
        public async Task<IActionResult> GetTextFromAudioController(string audioInputString)
        {
            // Without this guard a missing query value fails later with a
            // NullReferenceException and surfaces as an HTTP 500.
            if (string.IsNullOrWhiteSpace(audioInputString))
            {
                return BadRequest("audioInputString is required.");
            }

            // NOTE(review): the value is passed on as a file path; consider restricting
            // it to an allowed directory before exposing this endpoint publicly.
#if DEBUG
            var stopWatch = Stopwatch.StartNew();
#endif
            var result = await _nativeWhisperProvider.AudioToTextTranscript(audioInputString);
#if DEBUG
            stopWatch.Stop();
            TimeSpan ts = stopWatch.Elapsed;
            // Same hh:mm:ss.cc layout as before (centiseconds = milliseconds / 10).
            Console.WriteLine($"RunTime {ts.Hours:00}:{ts.Minutes:00}:{ts.Seconds:00}.{ts.Milliseconds / 10:00}");
#endif
            return Ok(result);
        }
    }
}
22 changes: 22 additions & 0 deletions
22
src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Runtime.CompilerServices; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
using Whisper.net.Wave; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Enums
{
    /// <summary>
    /// Audio container formats the plugin recognises. Member names double as
    /// the file extension (without the leading dot).
    /// </summary>
    public enum AudioType
    {
        wav = 0,
        mp3 = 1,
    }

    /// <summary>
    /// Helpers for <see cref="AudioType"/>.
    /// </summary>
    public static class AudioTypeExtensions
    {
        /// <summary>
        /// Returns the member name prefixed with a dot, e.g. <c>.mp3</c>.
        /// </summary>
        public static string ToFileExtension(this AudioType audioType)
        {
            return "." + audioType.ToString();
        }
    }
}
|
68 changes: 68 additions & 0 deletions
68
src/Plugins/BotSharp.Plugin.AudioHandler/Functions/AudioProcessUtilities.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,68 @@ | ||
using BotSharp.Plugin.AudioHandler.Enums; | ||
using NAudio; | ||
using NAudio.Wave; | ||
using NAudio.Wave.SampleProviders; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Functions; | ||
|
||
/// <summary>
/// NAudio-based helpers that open a local MP3 or WAV file and return it as a
/// WAV stream sampled at 16 kHz, positioned at the start.
/// The caller owns the returned stream and must dispose it.
/// </summary>
public class AudioProcessUtilities : IAudioProcessUtilities
{
    /// <summary>Sample rate, in Hz, that returned audio is normalised to.</summary>
    private const int TargetSampleRate = 16000;

    public AudioProcessUtilities()
    {
    }

    /// <summary>
    /// Decodes an MP3 file into an in-memory 16 kHz WAV stream.
    /// </summary>
    /// <param name="mp3FileName">Path of the MP3 file.</param>
    /// <returns>A <see cref="MemoryStream"/> holding WAV data, positioned at the start.</returns>
    public Stream ConvertMp3ToStream(string mp3FileName)
    {
        // The result is always an independent in-memory copy, so the file handle can be
        // released here. MP3 data is decoded even when it is already at 16 kHz, because
        // the raw file bytes are MP3 frames rather than WAV.
        using var fileStream = File.OpenRead(mp3FileName);
        using var reader = new Mp3FileReader(fileStream);

        return reader.WaveFormat.SampleRate == TargetSampleRate
            ? WriteWav(reader)
            : WriteWav(Resample(reader));
    }

    /// <summary>
    /// Opens a WAV file, resampling it to 16 kHz when its sample rate differs.
    /// </summary>
    /// <param name="wavFileName">Path of the WAV file.</param>
    /// <returns>
    /// The file stream itself when no resampling is needed; otherwise a
    /// <see cref="MemoryStream"/> with the resampled WAV data. Positioned at the start either way.
    /// </returns>
    public Stream ConvertWavToStream(string wavFileName)
    {
        var fileStream = File.OpenRead(wavFileName);
        var handedToCaller = false;
        try
        {
            // WaveFileReader(Stream) does not take ownership of the stream, so disposing
            // the reader leaves fileStream usable for the pass-through case.
            using var reader = new WaveFileReader(fileStream);
            if (reader.WaveFormat.SampleRate == TargetSampleRate)
            {
                fileStream.Seek(0, SeekOrigin.Begin);
                handedToCaller = true;
                return fileStream;
            }

            return WriteWav(Resample(reader));
        }
        finally
        {
            // Close the file unless it is the stream being returned; this also covers
            // failures while parsing or resampling.
            if (!handedToCaller)
            {
                fileStream.Dispose();
            }
        }
    }

    /// <summary>
    /// Dispatches to the MP3 or WAV conversion based on the file extension.
    /// </summary>
    /// <param name="fileName">Path of the audio file.</param>
    /// <returns>A WAV stream positioned at the start.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null or empty.</exception>
    /// <exception cref="NotSupportedException">The extension is neither mp3 nor wav.</exception>
    public Stream ConvertToStream(string fileName)
    {
        if (string.IsNullOrEmpty(fileName))
        {
            throw new ArgumentNullException(nameof(fileName), "fileName is Null");
        }

        string fileExtension = Path.GetExtension(fileName).ToLowerInvariant().TrimStart('.');

        // Matching on the member names (rather than Enum.TryParse) rejects numeric
        // extensions such as ".0", which TryParse would accept as an enum value.
        return fileExtension switch
        {
            nameof(AudioType.mp3) => ConvertMp3ToStream(fileName),
            nameof(AudioType.wav) => ConvertWavToStream(fileName),
            _ => throw new NotSupportedException($"File extension: '{fileExtension}' not supported"),
        };
    }

    /// <summary>Wraps <paramref name="source"/> in a resampler targeting 16 kHz, 16-bit output.</summary>
    private static IWaveProvider Resample(IWaveProvider source)
    {
        var resampler = new WdlResamplingSampleProvider(source.ToSampleProvider(), TargetSampleRate);
        return resampler.ToWaveProvider16();
    }

    /// <summary>Writes <paramref name="source"/> as a WAV file into a new, rewound memory stream.</summary>
    private static MemoryStream WriteWav(IWaveProvider source)
    {
        var wavStream = new MemoryStream();
        WaveFileWriter.WriteWavFileToStream(wavStream, source);
        wavStream.Seek(0, SeekOrigin.Begin);
        return wavStream;
    }
}
10 changes: 10 additions & 0 deletions
10
src/Plugins/BotSharp.Plugin.AudioHandler/Functions/IAudioProcessUtilities.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Functions
{
    /// <summary>
    /// Operations that open an audio file on disk and expose its content as a stream.
    /// </summary>
    public interface IAudioProcessUtilities
    {
        /// <summary>Opens the MP3 file at <paramref name="mp3FileName"/> as a stream.</summary>
        Stream ConvertMp3ToStream(string mp3FileName);

        /// <summary>Opens the WAV file at <paramref name="wavFileName"/> as a stream.</summary>
        Stream ConvertWavToStream(string wavFileName);

        /// <summary>
        /// Opens the audio file at <paramref name="fileName"/> as a stream, choosing the
        /// conversion from the file's extension.
        /// </summary>
        Stream ConvertToStream(string fileName);
    }
}
19 changes: 19 additions & 0 deletions
19
src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Threading.Tasks; | ||
using Whisper.net; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Models
{
    /// <summary>
    /// Result of a transcription: the ordered list of recognised segments.
    /// </summary>
    public class AudioOutput
    {
        /// <summary>
        /// Recognised segments. Starts as an empty list so a freshly constructed
        /// instance is safe to use.
        /// </summary>
        public List<SegmentData> Segments { get; set; } = new();

        /// <summary>
        /// Joins the text of all segments with single spaces.
        /// </summary>
        /// <returns>The combined text, or an empty string when there are no segments.</returns>
        public override string ToString()
        {
            // The pattern also covers a Segments value that was explicitly set to null.
            return Segments is { Count: > 0 }
                ? string.Join(" ", Segments.Select(x => x.Text))
                : string.Empty;
        }
    }
}
82 changes: 82 additions & 0 deletions
82
src/Plugins/BotSharp.Plugin.AudioHandler/Provider/NativeWhisperProvider.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,82 @@ | ||
using Whisper.net; | ||
using Whisper.net.Ggml; | ||
|
||
namespace BotSharp.Plugin.AudioHandler.Provider; | ||
|
||
/// <summary>
/// Native Whisper provider for speech to text conversion.
/// The Whisper processor is created once on first use and shared by all instances.
/// </summary>
public class NativeWhisperProvider : ISpeechToText
{
    // Serialises the one-time model download and processor creation, since several
    // scoped instances can hit the shared static at the same time.
    private static readonly SemaphoreSlim _initLock = new(1, 1);
    private static WhisperProcessor? _processor;

    private readonly IAudioProcessUtilities _audioProcessUtilities;

    public NativeWhisperProvider(IAudioProcessUtilities audioProcessUtilities)
    {
        _audioProcessUtilities = audioProcessUtilities;
    }

    /// <summary>
    /// Transcribes the audio file at <paramref name="filePath"/>.
    /// </summary>
    /// <param name="filePath">Path of an mp3 or wav file.</param>
    /// <returns>The text of all recognised segments joined by spaces.</returns>
    /// <exception cref="ArgumentException"><paramref name="filePath"/> is null, empty or whitespace.</exception>
    /// <exception cref="NotSupportedException">The file extension is not a known <see cref="AudioType"/>.</exception>
    public async Task<string> AudioToTextTranscript(string filePath)
    {
        if (string.IsNullOrWhiteSpace(filePath))
        {
            throw new ArgumentException("Audio file path must not be empty", nameof(filePath));
        }

        string fileExtension = Path.GetExtension(filePath);
        if (!Enum.TryParse<AudioType>(fileExtension.TrimStart('.').ToLowerInvariant(), out _))
        {
            throw new NotSupportedException($"Unsupported audio type: {fileExtension}");
        }

        var processor = await InitModel();

        using var stream = _audioProcessUtilities.ConvertToStream(filePath);
        if (stream == null)
        {
            throw new InvalidOperationException($"Failed to convert {fileExtension} to stream");
        }

        var textResult = new List<SegmentData>();
        await foreach (var result in processor.ProcessAsync(stream).ConfigureAwait(false))
        {
            textResult.Add(result);
        }

        var audioOutput = new AudioOutput
        {
            Segments = textResult
        };
        return audioOutput.ToString();
    }

    /// <summary>
    /// Ensures the model file for <paramref name="modelType"/> exists locally,
    /// downloading it when missing.
    /// </summary>
    /// <returns>The path of the model file.</returns>
    private static async Task<string> LoadWhisperModel(GgmlType modelType)
    {
        try
        {
            var modelPath = $"ggml-{modelType}.bin";

            if (!File.Exists(modelPath))
            {
                // Download to a side file and rename once complete, so an interrupted
                // download never leaves a partial file under the final name.
                var tempPath = $"{modelPath}.download";
                using (var modelStream = await WhisperGgmlDownloader.GetGgmlModelAsync(modelType))
                using (var fileWriter = File.Create(tempPath))
                {
                    await modelStream.CopyToAsync(fileWriter);
                }
                File.Move(tempPath, modelPath, overwrite: true);
            }

            return modelPath;
        }
        catch (Exception ex)
        {
            // Keep the original exception attached for diagnostics.
            throw new Exception($"Failed to load whisper model: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Returns the shared processor, creating it on first use.
    /// </summary>
    /// <param name="modelType">Model to load when the processor has not been created yet.</param>
    private static async Task<WhisperProcessor> InitModel(GgmlType modelType = GgmlType.TinyEn)
    {
        if (_processor is not null)
        {
            return _processor;
        }

        await _initLock.WaitAsync();
        try
        {
            // Re-check: another caller may have finished initialisation while we waited.
            if (_processor is null)
            {
                var modelPath = await LoadWhisperModel(modelType);
                _processor = WhisperFactory
                    .FromPath(modelPath)
                    .CreateBuilder()
                    .WithLanguage("en")
                    .Build();
            }

            return _processor;
        }
        finally
        {
            _initLock.Release();
        }
    }
}
6 changes: 6 additions & 0 deletions
6
src/Plugins/BotSharp.Plugin.AudioHandler/Settings/AudioHandlerSettings.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
namespace BotSharp.Plugin.AudioHandler.Settings
{
    /// <summary>
    /// Settings object for the audio handler plugin, bound from the
    /// "AudioHandler" configuration section. Currently declares no options.
    /// </summary>
    public class AudioHandlerSettings
    {
    }
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
global using System; | ||
global using System.Collections.Generic; | ||
global using System.Text; | ||
global using System.Linq; | ||
global using System.Text.Json; | ||
global using System.Linq; | ||
global using System.Text; | ||
global using System.Threading.Tasks; | ||
global using System.Threading.Tasks; | ||
|
||
global using BotSharp.Abstraction.Plugins; | ||
global using BotSharp.Abstraction.MLTasks; | ||
global using BotSharp.Plugin.AudioHandler.Enums; | ||
global using BotSharp.Plugin.AudioHandler.Functions; | ||
global using BotSharp.Plugin.AudioHandler.Models; | ||
|
||
global using Microsoft.Extensions.Configuration; | ||
global using Microsoft.Extensions.DependencyInjection; | ||
global using Microsoft.AspNetCore.Http; | ||
global using Microsoft.AspNetCore.Authorization; | ||
global using Microsoft.AspNetCore.Mvc; |
9 changes: 9 additions & 0 deletions
9
src/Plugins/BotSharp.Plugin.OpenAI/Providers/Audio/SpeechToTextProvider.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,9 @@ | ||
namespace BotSharp.Plugin.OpenAI.Providers.Audio; | ||
|
||
/// <summary>
/// Placeholder <see cref="ISpeechToText"/> implementation for the OpenAI plugin.
/// Transcription is not implemented yet.
/// </summary>
public class SpeechToTextProvider : ISpeechToText
{
    /// <summary>Not implemented; always throws.</summary>
    /// <exception cref="NotImplementedException">Always thrown, synchronously.</exception>
    public Task<string> AudioToTextTranscript(string filePath)
        => throw new NotImplementedException();
}
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Change to
using var fileStream =
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I tested this locally. Declaring `fileStream` with `using` at the top of the method disposes it before it is returned when SampleRate is 16000, so the stream returned by this function can no longer be read.