Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions src/Infrastructure/BotSharp.Abstraction/MLTasks/ISpeechToText.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace BotSharp.Abstraction.MLTasks;

/// <summary>
/// Contract for services that turn an audio file into a text transcript.
/// </summary>
public interface ISpeechToText
{
/// <summary>
/// Produces the text transcript of the audio file at the given path.
/// </summary>
/// <param name="filePath">Path of the audio file to transcribe.</param>
/// <returns>A task whose result is the transcript text.</returns>
Task<string> AudioToTextTranscript(string filePath);
// Stream-based overload, currently disabled:
// Task<string> AudioToTextTranscript(Stream stream);
}
29 changes: 29 additions & 0 deletions src/Plugins/BotSharp.Plugin.AudioHandler/AudioHandlerPlugin.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
using BotSharp.Plugin.AudioHandler.Settings;
using BotSharp.Plugin.AudioHandler.Provider;
using BotSharp.Abstraction.Settings;

namespace BotSharp.Plugin.AudioHandler
{
    /// <summary>
    /// BotSharp plugin that wires up the audio-to-text services.
    /// </summary>
    public class AudioHandlerPlugin : IBotSharpPlugin
    {
        /// <summary>Unique identifier of this plugin.</summary>
        public string Id => "9d22014c-4f45-466a-9e82-a74e67983df8";

        /// <summary>Display name of this plugin.</summary>
        public string Name => "Audio Handler";

        /// <summary>Short description of what this plugin does.</summary>
        public string Description => "Process audio input and transform it into text output.";

        /// <summary>
        /// Registers the plugin's services, all with scoped lifetime: the audio helper,
        /// the speech-to-text provider and the settings bound from the "AudioHandler" section.
        /// </summary>
        /// <param name="services">Service collection to add the registrations to.</param>
        /// <param name="config">Application configuration (not read directly here).</param>
        public void RegisterDI(IServiceCollection services, IConfiguration config)
        {
            services.AddScoped<IAudioProcessUtilities, AudioProcessUtilities>();
            services.AddScoped<ISpeechToText, NativeWhisperProvider>();

            // Settings are resolved through ISettingService each time a scope asks for them.
            services.AddScoped(sp =>
                sp.GetRequiredService<ISettingService>().Bind<AudioHandlerSettings>("AudioHandler"));
        }
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
<Project Sdk="Microsoft.NET.Sdk">

<PropertyGroup>
<TargetFramework>$(TargetFramework)</TargetFramework>
<ImplicitUsings>enable</ImplicitUsings>
<Nullable>enable</Nullable>
<LangVersion>$(LangVersion)</LangVersion>
<VersionPrefix>$(BotSharpVersion)</VersionPrefix>
<GeneratePackageOnBuild>$(GeneratePackageOnBuild)</GeneratePackageOnBuild>
<GenerateDocumentationFile>$(GenerateDocumentationFile)</GenerateDocumentationFile>
<OutputPath>$(SolutionDir)packages</OutputPath>
</PropertyGroup>

<ItemGroup>
<PackageReference Include="Microsoft.AspNetCore.Mvc" Version="2.2.0" />
<PackageReference Include="NAudio" Version="2.2.1" />
<PackageReference Include="NAudio.Core" Version="2.2.1" />
<PackageReference Include="Whisper.net" Version="1.5.0" />
<PackageReference Include="Whisper.net.Runtime" Version="1.5.0" />
</ItemGroup>

<ItemGroup>
<ProjectReference Include="..\..\Infrastructure\BotSharp.Core\BotSharp.Core.csproj" />
</ItemGroup>

</Project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using BotSharp.Plugin.AudioHandler.Models;
using BotSharp.Plugin.AudioHandler.Provider;

namespace BotSharp.Plugin.AudioHandler.Controllers
{
#if DEBUG
    [AllowAnonymous]
#endif
    /// <summary>
    /// HTTP endpoint that exposes speech-to-text transcription.
    /// </summary>
    [ApiController]
    public class AudioController : ControllerBase
    {
        // Any ISpeechToText implementation may be injected, so the field is named after the contract.
        private readonly ISpeechToText _speechToText;

        /// <summary>
        /// Creates the controller.
        /// </summary>
        /// <param name="audioService">Speech-to-text service used to transcribe audio.</param>
        public AudioController(ISpeechToText audioService)
        {
            _speechToText = audioService;
        }

        /// <summary>
        /// Transcribes the audio file identified by <paramref name="audioInputString"/>.
        /// </summary>
        /// <param name="audioInputString">Path of the audio file, as understood by the speech-to-text service.</param>
        /// <returns>
        /// 200 with the transcript text, or 400 when <paramref name="audioInputString"/> is missing or blank.
        /// </returns>
        [HttpGet("audio/transcript")]
        public async Task<IActionResult> GetTextFromAudioController(string audioInputString)
        {
            // Reject blank input up front instead of letting it fail deep inside the provider.
            if (string.IsNullOrWhiteSpace(audioInputString))
            {
                return BadRequest("audioInputString is required.");
            }

            // NOTE(review): the value is passed on as a file path on the server and comes straight
            // from the query string; consider restricting it to a known directory before exposing
            // this endpoint outside of development.
#if DEBUG
            var stopWatch = Stopwatch.StartNew();
#endif
            var result = await _speechToText.AudioToTextTranscript(audioInputString);
#if DEBUG
            stopWatch.Stop();
            TimeSpan ts = stopWatch.Elapsed;
            // Hundredths of a second, hence Milliseconds / 10.
            string elapsedTime = $"{ts.Hours:00}:{ts.Minutes:00}:{ts.Seconds:00}.{ts.Milliseconds / 10:00}";
            Console.WriteLine("RunTime " + elapsedTime);
#endif
            return Ok(result);
        }
    }
}
22 changes: 22 additions & 0 deletions src/Plugins/BotSharp.Plugin.AudioHandler/Enums/AudioType.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Runtime.CompilerServices;
using System.Text;
using System.Threading.Tasks;
using Whisper.net.Wave;

namespace BotSharp.Plugin.AudioHandler.Enums
{
    /// <summary>
    /// Audio file types known to the plugin. Member names are lowercase on purpose:
    /// they are used as the file extension text.
    /// </summary>
    public enum AudioType
    {
        wav,
        mp3,
    }

    /// <summary>
    /// Helper methods for <see cref="AudioType"/>.
    /// </summary>
    public static class AudioTypeExtensions
    {
        /// <summary>
        /// Builds the file extension for an audio type: a dot followed by the member name.
        /// </summary>
        /// <param name="audioType">Audio type to convert.</param>
        /// <returns>The extension including the leading dot, e.g. ".wav".</returns>
        public static string ToFileExtension(this AudioType audioType)
        {
            string memberName = audioType.ToString();
            return "." + memberName;
        }
    }
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
using BotSharp.Plugin.AudioHandler.Enums;
using NAudio;
using NAudio.Wave;
using NAudio.Wave.SampleProviders;

namespace BotSharp.Plugin.AudioHandler.Functions;

/// <summary>
/// Opens MP3/WAV files and returns their audio as a stream at 16 kHz for speech-to-text processing.
/// </summary>
public class AudioProcessUtilities : IAudioProcessUtilities
{
    /// <summary>Sample rate, in Hz, that returned audio is normalised to.</summary>
    private const int TargetSampleRate = 16000;

    public AudioProcessUtilities()
    {
    }

    /// <summary>
    /// Decodes an MP3 file into an in-memory 16-bit PCM WAV stream at 16 kHz.
    /// </summary>
    /// <param name="mp3FileName">Path of the MP3 file.</param>
    /// <returns>A stream positioned at its start; the caller owns and disposes it.</returns>
    public Stream ConvertMp3ToStream(string mp3FileName)
    {
        // The file is always decoded into a MemoryStream, so the source file can be closed
        // before returning. Returning the raw file stream (as was done for 16 kHz input) would
        // hand compressed MP3 bytes to a consumer that reads WAV data.
        using var fileStream = File.OpenRead(mp3FileName);
        using var reader = new Mp3FileReader(fileStream);

        var wavStream = new MemoryStream();
        IWaveProvider pcm = reader.WaveFormat.SampleRate == TargetSampleRate
            ? reader
            : new WdlResamplingSampleProvider(reader.ToSampleProvider(), TargetSampleRate).ToWaveProvider16();
        WaveFileWriter.WriteWavFileToStream(wavStream, pcm);
        wavStream.Seek(0, SeekOrigin.Begin);
        return wavStream;
    }

    /// <summary>
    /// Returns the audio of a WAV file as a stream at 16 kHz. A file that is already at 16 kHz is
    /// returned as-is (the open file stream); otherwise it is resampled into memory.
    /// </summary>
    /// <param name="wavFileName">Path of the WAV file.</param>
    /// <returns>A stream positioned at its start; the caller owns and disposes it.</returns>
    public Stream ConvertWavToStream(string wavFileName)
    {
        // Not a `using` declaration: on the pass-through path this very stream is returned to
        // the caller, and disposing it here would hand back a closed, empty stream.
        var fileStream = File.OpenRead(wavFileName);
        var handedToCaller = false;
        try
        {
            using var reader = new WaveFileReader(fileStream);
            if (reader.WaveFormat.SampleRate != TargetSampleRate)
            {
                var wavStream = new MemoryStream();
                var resampler = new WdlResamplingSampleProvider(reader.ToSampleProvider(), TargetSampleRate);
                WaveFileWriter.WriteWavFileToStream(wavStream, resampler.ToWaveProvider16());
                wavStream.Seek(0, SeekOrigin.Begin);
                return wavStream;
            }

            fileStream.Seek(0, SeekOrigin.Begin);
            handedToCaller = true;
            return fileStream;
        }
        finally
        {
            // Close the file when it was only read for resampling or when reading failed.
            if (!handedToCaller)
            {
                fileStream.Dispose();
            }
        }
    }

    /// <summary>
    /// Picks the conversion matching the file extension and returns the resulting stream.
    /// </summary>
    /// <param name="fileName">Path of an audio file with a ".mp3" or ".wav" extension (case-insensitive).</param>
    /// <returns>A stream positioned at its start; the caller owns and disposes it.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fileName"/> is null or empty.</exception>
    /// <exception cref="NotSupportedException">The file extension is not a known <see cref="AudioType"/>.</exception>
    public Stream ConvertToStream(string fileName)
    {
        if (string.IsNullOrEmpty(fileName))
        {
            // Two-argument overload: the single-string overload treats its argument as the parameter name.
            throw new ArgumentNullException(nameof(fileName), "fileName is Null");
        }

        // Invariant lowering keeps the match independent of the server's culture.
        string fileExtension = Path.GetExtension(fileName).ToLowerInvariant().TrimStart('.');
        if (!Enum.TryParse<AudioType>(fileExtension, out AudioType fileType))
        {
            throw new NotSupportedException($"File extension: '{fileExtension}' not supported");
        }

        return fileType switch
        {
            AudioType.mp3 => ConvertMp3ToStream(fileName),
            AudioType.wav => ConvertWavToStream(fileName),
            _ => throw new NotSupportedException("File extension not supported"),
        };
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@

namespace BotSharp.Plugin.AudioHandler.Functions
{
/// <summary>
/// Helpers that open an audio file on disk and expose its audio as a stream.
/// </summary>
public interface IAudioProcessUtilities
{
/// <summary>
/// Opens an MP3 file and returns its audio as a stream, resampled to 16 kHz when needed.
/// </summary>
/// <param name="mp3FileName">Path of the MP3 file.</param>
/// <returns>A readable stream positioned at its start.</returns>
Stream ConvertMp3ToStream(string mp3FileName);
/// <summary>
/// Opens a WAV file and returns its audio as a stream, resampled to 16 kHz when needed.
/// </summary>
/// <param name="wavFileName">Path of the WAV file.</param>
/// <returns>A readable stream positioned at its start.</returns>
Stream ConvertWavToStream(string wavFileName);
/// <summary>
/// Chooses the MP3 or WAV conversion from the file extension and returns the resulting stream.
/// </summary>
/// <param name="fileName">Path of the audio file; its extension selects the conversion.</param>
/// <returns>A readable stream positioned at its start.</returns>
Stream ConvertToStream(string fileName);
}
}
19 changes: 19 additions & 0 deletions src/Plugins/BotSharp.Plugin.AudioHandler/Models/AudioOutput.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using Whisper.net;

namespace BotSharp.Plugin.AudioHandler.Models
{
    /// <summary>
    /// Result of a transcription: the list of recognised segments.
    /// </summary>
    public class AudioOutput
    {
        /// <summary>
        /// Recognised segments in the order they were produced. Starts out empty so a freshly
        /// created instance can be used without assigning it first.
        /// </summary>
        public List<SegmentData> Segments { get; set; } = new List<SegmentData>();

        /// <summary>
        /// Joins the text of all segments with single spaces.
        /// </summary>
        /// <returns>The combined text, or an empty string when there are no segments.</returns>
        public override string ToString()
        {
            // ToString must not throw: tolerate a caller having assigned null to Segments.
            if (Segments is null || Segments.Count == 0)
            {
                return string.Empty;
            }

            return string.Join(" ", Segments.Select(x => x.Text));
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
using Whisper.net;
using Whisper.net.Ggml;

namespace BotSharp.Plugin.AudioHandler.Provider;

/// <summary>
/// Native Whisper provider for speech to text conversion
/// </summary>
public class NativeWhisperProvider : ISpeechToText
{
    // Serialises the one-time creation of the shared processor across concurrent requests.
    private static readonly System.Threading.SemaphoreSlim _initLock = new System.Threading.SemaphoreSlim(1, 1);

    // Shared by all instances; created on first use by InitModel.
    private static WhisperProcessor _processor;

    private readonly IAudioProcessUtilities _audioProcessUtilities;

    private string _modelName;

    /// <summary>
    /// Creates the provider.
    /// </summary>
    /// <param name="audioProcessUtilities">Helper used to open the audio file as a stream.</param>
    public NativeWhisperProvider(IAudioProcessUtilities audioProcessUtilities)
    {
        _audioProcessUtilities = audioProcessUtilities;
    }

    /// <summary>
    /// Transcribes the audio file at <paramref name="filePath"/> and returns the recognised text.
    /// </summary>
    /// <param name="filePath">Path of a ".wav" or ".mp3" file (extension is case-insensitive).</param>
    /// <returns>The text of all recognised segments joined by spaces.</returns>
    /// <exception cref="ArgumentException"><paramref name="filePath"/> is null or blank.</exception>
    /// <exception cref="NotSupportedException">The file extension is not a known <see cref="AudioType"/>.</exception>
    public async Task<string> AudioToTextTranscript(string filePath)
    {
        if (string.IsNullOrWhiteSpace(filePath))
        {
            throw new ArgumentException("Audio file path must not be empty.", nameof(filePath));
        }

        string fileExtension = Path.GetExtension(filePath);
        if (!Enum.TryParse<AudioType>(fileExtension.TrimStart('.').ToLowerInvariant(), out _))
        {
            throw new NotSupportedException($"Unsupported audio type: {fileExtension}");
        }

        await InitModel();

        using var stream = _audioProcessUtilities.ConvertToStream(filePath);
        if (stream == null)
        {
            throw new InvalidOperationException($"Failed to convert {fileExtension} to stream");
        }

        // NOTE(review): the processor is shared between requests; confirm whether it tolerates
        // concurrent ProcessAsync calls or whether transcription needs to be serialised too.
        var textResult = new List<SegmentData>();
        await foreach (var result in _processor.ProcessAsync(stream).ConfigureAwait(false))
        {
            textResult.Add(result);
        }

        var audioOutput = new AudioOutput
        {
            Segments = textResult
        };
        return audioOutput.ToString();
    }

    /// <summary>
    /// Makes sure the model file for <paramref name="modelType"/> exists locally, downloading it if needed.
    /// Sets <c>_modelName</c> to the file name.
    /// </summary>
    /// <param name="modelType">Model to load; also determines the local file name.</param>
    /// <exception cref="Exception">The download or the file write failed; the cause is the inner exception.</exception>
    private async Task LoadWhisperModel(GgmlType modelType)
    {
        try
        {
            _modelName = $"ggml-{modelType}.bin";
            if (File.Exists(_modelName))
            {
                return;
            }

            // Download to a temporary file first so an interrupted download never leaves a
            // truncated file under the final name (which would pass the File.Exists check above).
            string tempName = $"{_modelName}.{Guid.NewGuid():N}.tmp";
            try
            {
                // Download the requested model, not a fixed one, so file name and content agree.
                using (var modelStream = await WhisperGgmlDownloader.GetGgmlModelAsync(modelType))
                using (var fileWriter = File.Create(tempName))
                {
                    await modelStream.CopyToAsync(fileWriter);
                }

                File.Move(tempName, _modelName, true);
            }
            catch
            {
                TryDelete(tempName);
                throw;
            }
        }
        catch (Exception ex)
        {
            // Keep the original exception as inner exception so the stack trace is not lost.
            throw new Exception($"Failed to load whisper model: {ex.Message}", ex);
        }
    }

    /// <summary>
    /// Creates the shared processor on first use; later calls return immediately.
    /// </summary>
    /// <param name="modelType">Model used when the processor has to be created.</param>
    private async Task InitModel(GgmlType modelType = GgmlType.TinyEn)
    {
        if (_processor != null)
        {
            return;
        }

        await _initLock.WaitAsync();
        try
        {
            // Another request may have finished the initialisation while this one was waiting.
            if (_processor != null)
            {
                return;
            }

            await LoadWhisperModel(modelType);
            _processor = WhisperFactory
                .FromPath(_modelName)
                .CreateBuilder()
                .WithLanguage("en")
                .Build();
        }
        finally
        {
            _initLock.Release();
        }
    }

    /// <summary>
    /// Best-effort removal of a leftover temporary file; failures are ignored on purpose so they
    /// do not hide the error that triggered the cleanup.
    /// </summary>
    private static void TryDelete(string path)
    {
        try
        {
            File.Delete(path);
        }
        catch (IOException)
        {
        }
        catch (UnauthorizedAccessException)
        {
        }
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
namespace BotSharp.Plugin.AudioHandler.Settings
{
/// <summary>
/// Settings type for the audio handler plugin. It currently declares no members.
/// </summary>
public class AudioHandlerSettings
{
}
}
21 changes: 21 additions & 0 deletions src/Plugins/BotSharp.Plugin.AudioHandler/Using.cs
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
global using System;
global using System.Collections.Generic;
global using System.Text;
global using System.Linq;
global using System.Text.Json;
global using System.Linq;
global using System.Text;
global using System.Threading.Tasks;
global using System.Threading.Tasks;

global using BotSharp.Abstraction.Plugins;
global using BotSharp.Abstraction.MLTasks;
global using BotSharp.Plugin.AudioHandler.Enums;
global using BotSharp.Plugin.AudioHandler.Functions;
global using BotSharp.Plugin.AudioHandler.Models;

global using Microsoft.Extensions.Configuration;
global using Microsoft.Extensions.DependencyInjection;
global using Microsoft.AspNetCore.Http;
global using Microsoft.AspNetCore.Authorization;
global using Microsoft.AspNetCore.Mvc;
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
namespace BotSharp.Plugin.OpenAI.Providers.Audio;

/// <summary>
/// Speech-to-text provider stub: transcription is not implemented yet.
/// </summary>
public class SpeechToTextProvider : ISpeechToText
{
    /// <summary>
    /// Not implemented.
    /// </summary>
    /// <param name="filePath">Path of the audio file (unused).</param>
    /// <exception cref="NotImplementedException">Always thrown.</exception>
    public Task<string> AudioToTextTranscript(string filePath)
        => throw new NotImplementedException();
}