diff --git a/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPaddleOcrConverter.cs b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPaddleOcrConverter.cs
new file mode 100644
index 000000000..6c9133480
--- /dev/null
+++ b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPaddleOcrConverter.cs
@@ -0,0 +1,20 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+
+namespace BotSharp.Abstraction.Knowledges
+{
+    /// <summary>
+    /// Extracts text from an image file.
+    /// </summary>
+    public interface IPaddleOcrConverter
+    {
+        // void LoadModel();
+        /// <summary>
+        /// Converts the image at <paramref name="loadPath"/> to text.
+        /// </summary>
+        /// <param name="loadPath">Path of the image file to read.</param>
+        /// <returns>The text produced for the image.</returns>
+        Task<string> ConvertImageToText(string loadPath);
+    }
+}
diff --git a/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPdf2TextConverter.cs b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPdf2TextConverter.cs
new file mode 100644
index 000000000..0ae85b303
--- /dev/null
+++ b/src/Infrastructure/BotSharp.Abstraction/Knowledges/IPdf2TextConverter.cs
@@ -0,0 +1,22 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Microsoft.AspNetCore.Http;
+
+namespace BotSharp.Abstraction.Knowledges
+{
+    /// <summary>
+    /// Extracts the text of an uploaded PDF file.
+    /// </summary>
+    public interface IPdf2TextConverter
+    {
+        /// <summary>
+        /// Converts the selected pages of <paramref name="formFile"/> to text.
+        /// </summary>
+        /// <param name="formFile">The uploaded PDF file.</param>
+        /// <param name="startPageNum">First page to convert (1-based); <c>null</c> starts at the first page.</param>
+        /// <param name="endPageNum">Last page to convert (inclusive); <c>null</c> ends at the last page.</param>
+        /// <returns>The concatenated text of the selected pages.</returns>
+        Task<string> ConvertPdfToText(IFormFile formFile, int? startPageNum, int? endPageNum);
+    }
+}
\ No newline at end of file
diff --git a/src/Infrastructure/BotSharp.Core/BotSharp.Core.csproj b/src/Infrastructure/BotSharp.Core/BotSharp.Core.csproj
index ab77723b1..3ffea2e6b 100644
--- a/src/Infrastructure/BotSharp.Core/BotSharp.Core.csproj
+++ b/src/Infrastructure/BotSharp.Core/BotSharp.Core.csproj
@@ -76,6 +76,7 @@
+
diff --git a/src/Infrastructure/BotSharp.Core/BotSharpServiceCollectionExtensions.cs b/src/Infrastructure/BotSharp.Core/BotSharpServiceCollectionExtensions.cs
index 1b38659c5..bb2122ee3 100644
--- a/src/Infrastructure/BotSharp.Core/BotSharpServiceCollectionExtensions.cs
+++ b/src/Infrastructure/BotSharp.Core/BotSharpServiceCollectionExtensions.cs
@@ -2,6 +2,7 @@
 using BotSharp.Core.Functions;
 using BotSharp.Core.Hooks;
 using BotSharp.Core.Templating;
+using BotSharp.Core.Plugins.Knowledges.Services;
 using Microsoft.AspNetCore.Builder;
 using Microsoft.Extensions.Configuration;
@@ -95,5 +96,7 @@ public static void RegisterPlugins(IServiceCollection services, IConfiguration c
         loader.Load();

         services.AddSingleton(loader);
+
+        services.AddSingleton<IPdf2TextConverter, PigPdf2TextConverter>();
     }
 }
diff --git a/src/Infrastructure/BotSharp.Core/Plugins/Knowledges/KnowledgeBaseSettings.cs b/src/Infrastructure/BotSharp.Core/Plugins/Knowledges/KnowledgeBaseSettings.cs
index a56767e92..0cb08a488 100644
--- a/src/Infrastructure/BotSharp.Core/Plugins/Knowledges/KnowledgeBaseSettings.cs
+++ b/src/Infrastructure/BotSharp.Core/Plugins/Knowledges/KnowledgeBaseSettings.cs
@@ -5,4 +5,6 @@ public class KnowledgeBaseSettings
     public string VectorDb { get; set; }
     public string TextEmbedding { get; set; }
     public string TextCompletion { get; set; }
+    /// <summary>Type-name suffix used to pick the registered <c>IPdf2TextConverter</c> implementation.</summary>
+    public string Pdf2TextConverter { get; set; }
 }
diff --git a/src/Infrastructure/BotSharp.Core/Plugins/Knowledges/Services/PigPdf2TextConverter.cs b/src/Infrastructure/BotSharp.Core/Plugins/Knowledges/Services/PigPdf2TextConverter.cs
new file mode 100644
index 000000000..c19ddc77a
--- /dev/null
+++ 
b/src/Infrastructure/BotSharp.Core/Plugins/Knowledges/Services/PigPdf2TextConverter.cs
@@ -0,0 +1,72 @@
+using System;
+using System.Collections.Generic;
+using System.IO;
+using System.Text;
+using Microsoft.AspNetCore.Http;
+using UglyToad.PdfPig;
+using UglyToad.PdfPig.Content;
+
+namespace BotSharp.Core.Plugins.Knowledges.Services;
+
+/// <summary>
+/// <see cref="IPdf2TextConverter"/> implementation that reads page text with PdfPig.
+/// </summary>
+public class PigPdf2TextConverter : IPdf2TextConverter
+{
+    /// <summary>
+    /// Returns the concatenated text of the selected pages of an uploaded PDF.
+    /// </summary>
+    /// <param name="formFile">The uploaded PDF file; an empty upload yields an empty string.</param>
+    /// <param name="startPageNum">Pages numbered below this value are skipped; <c>null</c> means no lower bound.</param>
+    /// <param name="endPageNum">Pages numbered above this value are skipped; <c>null</c> means no upper bound.</param>
+    /// <returns>The text of the selected pages, in enumeration order.</returns>
+    public async Task<string> ConvertPdfToText(IFormFile formFile, int? startPageNum, int? endPageNum)
+    {
+        return await OpenPdfDocumentAsync(formFile, startPageNum, endPageNum);
+    }
+
+    /// <summary>
+    /// Copies the upload to a temporary file, opens it with PdfPig and collects the page text.
+    /// </summary>
+    private async Task<string> OpenPdfDocumentAsync(IFormFile formFile, int? startPageNum, int? endPageNum)
+    {
+        if (formFile.Length <= 0)
+        {
+            return string.Empty;
+        }
+
+        var filePath = Path.GetTempFileName();
+        try
+        {
+            using (var stream = System.IO.File.Create(filePath))
+            {
+                await formFile.CopyToAsync(stream);
+            }
+
+            // PdfDocument is disposable; the using declaration releases it before the temp file is deleted.
+            using var document = PdfDocument.Open(filePath);
+            var content = new StringBuilder();
+            foreach (Page page in document.GetPages())
+            {
+                if (startPageNum.HasValue && page.Number < startPageNum.Value)
+                {
+                    continue;
+                }
+
+                if (endPageNum.HasValue && page.Number > endPageNum.Value)
+                {
+                    continue;
+                }
+
+                content.Append(page.Text);
+            }
+
+            return content.ToString();
+        }
+        finally
+        {
+            // Path.GetTempFileName creates the file on disk, so remove it when done.
+            System.IO.File.Delete(filePath);
+        }
+    }
+}
diff --git a/src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj b/src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj
index d1a16a68e..7b8290d63 100644
--- a/src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj
+++ b/src/Infrastructure/BotSharp.OpenAPI/BotSharp.OpenAPI.csproj
@@ -1,4 +1,4 @@
-
+
 net6.0
@@ -9,11 +9,11 @@
-
+
diff --git a/src/Infrastructure/BotSharp.OpenAPI/Controllers/KnowledgeController.cs b/src/Infrastructure/BotSharp.OpenAPI/Controllers/KnowledgeController.cs
index 4d63885e6..3824f4768 100644
--- a/src/Infrastructure/BotSharp.OpenAPI/Controllers/KnowledgeController.cs
+++ b/src/Infrastructure/BotSharp.OpenAPI/Controllers/KnowledgeController.cs
@@ -3,6 +3,8 @@
 using Microsoft.AspNetCore.Http;
 using
UglyToad.PdfPig.Content;
 using UglyToad.PdfPig;
+using BotSharp.Core.Plugins.Knowledges;
+
 namespace BotSharp.OpenAPI.Controllers;
@@ -11,11 +13,13 @@ namespace BotSharp.OpenAPI.Controllers;
 public class KnowledgeController : ControllerBase, IApiAdapter
 {
     private readonly IKnowledgeService _knowledgeService;
-    public KnowledgeController(IKnowledgeService knowledgeService)
+    private readonly IServiceProvider _services;
+
+    public KnowledgeController(IKnowledgeService knowledgeService, IServiceProvider services)
     {
         _knowledgeService = knowledgeService;
+        _services = services;
     }
-
     [HttpGet("/knowledge/{agentId}")]
     public async Task> RetrieveKnowledge([FromRoute] string agentId, [FromQuery(Name = "q")] string question)
     {
@@ -27,44 +31,31 @@ public async Task> RetrieveKnowledge([FromRoute] string ag
     }
     [HttpPost("/knowledge/{agentId}")]
-    public async Task FeedKnowledge([FromRoute] string agentId, List files, [FromQuery] int? startPageNum, [FromQuery] int? endPageNum)
+    public async Task FeedKnowledge([FromRoute] string agentId, List files, [FromQuery] int? startPageNum, [FromQuery] int? endPageNum, [FromQuery] bool? paddleModel)
     {
+        // NOTE(review): paddleModel is not read anywhere in the visible part of this method.
+        var settings = _services.GetRequiredService<KnowledgeBaseSettings>();
+        // Pick the registered converter whose full type name ends with the configured suffix.
+        var textConverter = _services.GetServices<IPdf2TextConverter>()
+            .FirstOrDefault(x => !string.IsNullOrEmpty(settings.Pdf2TextConverter)
+                && x.GetType().FullName?.EndsWith(settings.Pdf2TextConverter, StringComparison.Ordinal) == true);
+        if (textConverter == null)
+        {
+            throw new InvalidOperationException($"No IPdf2TextConverter implementation matches the configured Pdf2TextConverter '{settings.Pdf2TextConverter}'.");
+        }
+
         long size = files.Sum(f => f.Length);
         foreach (var formFile in files)
         {
-            if (formFile.Length <= 0)
-            {
-                continue;
-            }
-
-            var filePath = Path.GetTempFileName();
-
-            using (var stream = System.IO.File.Create(filePath))
-            {
-                await formFile.CopyToAsync(stream);
-            }
-
-            var document = PdfDocument.Open(filePath);
             var content = "";
-            foreach (Page page in document.GetPages())
-            {
-                if (startPageNum.HasValue && page.Number < startPageNum.Value)
-                {
-                    continue;
-                }
-
-                if (endPageNum.HasValue && page.Number > endPageNum.Value)
-                {
-                    continue;
-                }
-                content += page.Text;
-            }
+            content = await textConverter.ConvertPdfToText(formFile, startPageNum, endPageNum);
             // Process uploaded files
             // Don't rely on or trust the FileName property without validation.
+            // Add FeedWithMetaData
             await _knowledgeService.Feed(new KnowledgeFeedModel
             {
                 AgentId = agentId,
diff --git a/src/Plugins/BotSharp.Plugin.PaddleSharp/BotSharp.Plugin.PaddleSharp.csproj b/src/Plugins/BotSharp.Plugin.PaddleSharp/BotSharp.Plugin.PaddleSharp.csproj
index 0069f97f6..3ad42c16d 100644
--- a/src/Plugins/BotSharp.Plugin.PaddleSharp/BotSharp.Plugin.PaddleSharp.csproj
+++ b/src/Plugins/BotSharp.Plugin.PaddleSharp/BotSharp.Plugin.PaddleSharp.csproj
@@ -8,10 +8,15 @@
+
+
+
+
+
diff --git a/src/Plugins/BotSharp.Plugin.PaddleSharp/PaddleSharpPlugin.cs b/src/Plugins/BotSharp.Plugin.PaddleSharp/PaddleSharpPlugin.cs
index f679c4e7d..94796b641 100644
--- a/src/Plugins/BotSharp.Plugin.PaddleSharp/PaddleSharpPlugin.cs
+++ b/src/Plugins/BotSharp.Plugin.PaddleSharp/PaddleSharpPlugin.cs
@@ -1,4 +1,7 @@
+using BotSharp.Abstraction.Knowledges;
 using BotSharp.Abstraction.Plugins;
+using BotSharp.Plugin.PaddleSharp.Providers;
+using BotSharp.Plugin.PaddleSharp.Settings;
 using Microsoft.Extensions.Configuration;
 using
Microsoft.Extensions.DependencyInjection;
 using System;
@@ -9,6 +12,10 @@ public class PaddleSharpPlugin : IBotSharpPlugin
 {
     public void RegisterDI(IServiceCollection services, IConfiguration config)
     {
-
+        // Bind the "PaddleSharp" configuration section once and share that single settings instance.
+        var settings = new PaddleSharpSettings();
+        config.Bind("PaddleSharp", settings);
+        services.AddSingleton(x => settings);
+        services.AddSingleton<IPdf2TextConverter, Pdf2TextConverter>();
     }
 }
diff --git a/src/Plugins/BotSharp.Plugin.PaddleSharp/Providers/PaddleOcrConverter.cs b/src/Plugins/BotSharp.Plugin.PaddleSharp/Providers/PaddleOcrConverter.cs
new file mode 100644
index 000000000..4cf8edb79
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.PaddleSharp/Providers/PaddleOcrConverter.cs
@@ -0,0 +1,69 @@
+/*
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Sdcb.PaddleOCR;
+using Sdcb.PaddleOCR.Models;
+using Sdcb.PaddleInference;
+using Sdcb.PaddleOCR.Models.LocalV3;
+using OpenCvSharp;
+using System.Threading.Tasks;
+using BotSharp.Abstraction.Knowledges;
+using BotSharp.Plugin.PaddleSharp.Settings;
+
+namespace BotSharp.Plugin.PaddleSharp.Providers;
+
+public class PaddleOcrConverter : IPaddleOcrConverter
+{
+    private FullOcrModel _paddleFullOcrmodel;
+    private QueuedPaddleOcrAll _allModel;
+    private readonly PaddleSharpSettings _paddleSharpSettings;
+
+    public PaddleOcrConverter(FullOcrModel paddleFullOcrmodel, QueuedPaddleOcrAll allModel, PaddleSharpSettings paddleSharpSettings)
+    {
+        _paddleFullOcrmodel = paddleFullOcrmodel;
+        _allModel = allModel;
+        _paddleSharpSettings = paddleSharpSettings;
+    }
+
+    private void LoadModel()
+    {
+        _allModel = new(() => new PaddleOcrAll(_paddleFullOcrmodel, _paddleSharpSettings.device)
+        {
+            AllowRotateDetection = _paddleSharpSettings.allowRotateDetection,
+            Enable180Classification = _paddleSharpSettings.enable180Classification,
+        }, consumerCount: _paddleSharpSettings.consumerCount, boundedCapacity: _paddleSharpSettings.boundedCapacity);
+    }
+
+    private void DisposeModel()
+    {
+        _allModel.Dispose();
+    }
+
+    public async Task
ConvertImageToText(string loadPath)
+    {
+        _allModel = new(() => new PaddleOcrAll(_paddleFullOcrmodel, _paddleSharpSettings.device)
+        {
+            AllowRotateDetection = _paddleSharpSettings.allowRotateDetection,
+            Enable180Classification = _paddleSharpSettings.enable180Classification,
+        }, consumerCount: _paddleSharpSettings.consumerCount, boundedCapacity: _paddleSharpSettings.boundedCapacity);
+
+        var contents = "";
+        using (Mat src = Cv2.ImRead(loadPath))
+        {
+            PaddleOcrResult result = await _allModel.Run(src);
+
+            foreach (PaddleOcrResultRegion region in result.Regions)
+            {
+                if (region.Score > _paddleSharpSettings.acceptScore)
+                {
+                    contents += region.Text + " ";
+                }
+            }
+        }
+
+        _allModel.Dispose();
+        return contents;
+    }
+}
+*/
\ No newline at end of file
diff --git a/src/Plugins/BotSharp.Plugin.PaddleSharp/Providers/Pdf2TextConverter.cs b/src/Plugins/BotSharp.Plugin.PaddleSharp/Providers/Pdf2TextConverter.cs
new file mode 100644
index 000000000..ab9d6576c
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.PaddleSharp/Providers/Pdf2TextConverter.cs
@@ -0,0 +1,194 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using System.IO;
+using ImageMagick;
+using OpenCvSharp;
+using Microsoft.AspNetCore.Http;
+using Sdcb.PaddleInference;
+using Sdcb.PaddleOCR.Models;
+using Sdcb.PaddleOCR.Models.LocalV3;
+using Sdcb.PaddleOCR;
+using System.Threading.Tasks;
+using BotSharp.Abstraction.Knowledges;
+using System.Linq;
+using Docnet;
+using Docnet.Core.Models;
+using Docnet.Core;
+using Docnet.Core.Converters;
+using System.Drawing;
+using System.Drawing.Imaging;
+using System.Runtime.InteropServices;
+using BotSharp.Plugin.PaddleSharp.Settings;
+
+namespace BotSharp.Plugin.PaddleSharp.Providers;
+
+/// <summary>
+/// <see cref="IPdf2TextConverter"/> implementation that renders PDF pages to PNG images
+/// with ImageMagick and runs PaddleOCR on each image.
+/// </summary>
+public class Pdf2TextConverter : IPdf2TextConverter
+{
+    private FullOcrModel _model = LocalFullModels.EnglishV3;
+    private PaddleSharpSettings _paddleSharpSettings;
+
+    public Pdf2TextConverter(PaddleSharpSettings paddleSharpSettings)
+    {
+        _paddleSharpSettings = paddleSharpSettings;
+    }
+
+    /// <summary>
+    /// Renders the selected pages of the uploaded PDF to temporary images and returns their OCR text.
+    /// </summary>
+    /// <param name="formFile">The uploaded PDF file.</param>
+    /// <param name="startPageNum">First page to convert (1-based); defaults to the first page.</param>
+    /// <param name="endPageNum">Last page to convert (inclusive); defaults to the last page.</param>
+    /// <returns>The recognised text of the selected pages, in page order.</returns>
+    public async Task<string> ConvertPdfToText(IFormFile formFile, int? startPageNum, int? endPageNum)
+    {
+        // Per-call list instead of instance state: the converter is registered as a singleton,
+        // so page images from one upload must not leak into the next one.
+        var imagePaths = new List<string>();
+        try
+        {
+            await ConvertPdfToLocalImagesAsync(formFile, startPageNum, endPageNum, imagePaths);
+            return await LocalImageToTextsAsync(imagePaths);
+        }
+        finally
+        {
+            foreach (var imagePath in imagePaths)
+            {
+                System.IO.File.Delete(imagePath);
+            }
+        }
+    }
+
+    /// <summary>
+    /// Runs OCR on every image in <paramref name="imagePaths"/> and joins the accepted regions with spaces.
+    /// </summary>
+    private async Task<string> LocalImageToTextsAsync(IReadOnlyList<string> imagePaths)
+    {
+        var contents = new StringBuilder();
+
+        // QueuedPaddleOcrAll is disposable; release it once all pages have been processed.
+        using QueuedPaddleOcrAll all = new(() => new PaddleOcrAll(_model, PaddleDevice.Mkldnn())
+        {
+            AllowRotateDetection = true,
+            Enable180Classification = false,
+        }, consumerCount: _paddleSharpSettings.consumerCount, boundedCapacity: _paddleSharpSettings.boundedCapacity);
+
+        foreach (var imagePath in imagePaths)
+        {
+            using (Mat src = Cv2.ImRead(imagePath))
+            {
+                PaddleOcrResult result = await all.Run(src);
+
+                foreach (PaddleOcrResultRegion region in result.Regions)
+                {
+                    // Keep only regions above the configured acceptance score.
+                    if (region.Score > _paddleSharpSettings.acceptScore)
+                    {
+                        contents.Append(region.Text).Append(' ');
+                    }
+                }
+            }
+        }
+
+        return contents.ToString();
+    }
+
+    /// <summary>
+    /// Copies raw pixel bytes into the bitmap's pixel buffer.
+    /// </summary>
+    private static void AddBytes(Bitmap bmp, byte[] rawBytes)
+    {
+        var rect = new Rectangle(0, 0, bmp.Width, bmp.Height);
+
+        var bmpData = bmp.LockBits(rect, ImageLockMode.WriteOnly, bmp.PixelFormat);
+        var pNative = bmpData.Scan0;
+
+        Marshal.Copy(rawBytes, 0, pNative, rawBytes.Length);
+        bmp.UnlockBits(bmpData);
+    }
+
+    /// <summary>
+    /// Renders one page of the PDF at <paramref name="filePath"/> with Docnet and writes it back as a PNG.
+    /// </summary>
+    /// <param name="filePath">Path of the PDF to read; the PNG bytes are written to this same path.</param>
+    /// <param name="width">First dimension passed to <see cref="PageDimensions"/>.</param>
+    /// <param name="height">Second dimension passed to <see cref="PageDimensions"/>.</param>
+    /// <param name="pageIndex">Index handed to Docnet's page reader; defaults to the previously hard-coded 17.</param>
+    public void DocnetConverter(string filePath, int width = 1080, int height = 1920, int pageIndex = 17)
+    {
+        var pageSettings = new PageDimensions(width, height);
+
+        using (var docReader = DocLib.Instance.GetDocReader(filePath, pageSettings))
+        using (var pageReader = docReader.GetPageReader(pageIndex))
+        {
+            var rawBytes = pageReader.GetImage();
+            var pageWidth = pageReader.GetPageWidth();
+            var pageHeight = pageReader.GetPageHeight();
+
+            using (var bmp = new Bitmap(pageWidth, pageHeight, PixelFormat.Format32bppArgb))
+            using (var imageStream = new MemoryStream())
+            {
+                AddBytes(bmp, rawBytes);
+
+                // NOTE(review): this overwrites the source file at filePath with the PNG - confirm that is intended.
+                bmp.Save(imageStream, ImageFormat.Png);
+                System.IO.File.WriteAllBytes(filePath, imageStream.ToArray());
+            }
+        }
+    }
+
+    /// <summary>
+    /// Saves the upload to a temporary PDF, reads it with a density of 300x300 and writes the selected pages as PNG files.
+    /// </summary>
+    /// <param name="formFile">The uploaded PDF file.</param>
+    /// <param name="startPageNum">First page to render (1-based); defaults to 1.</param>
+    /// <param name="endPageNum">Last page to render (inclusive); defaults to the page count.</param>
+    /// <param name="imagePaths">Receives the path of every image file, in page order.</param>
+    private async Task ConvertPdfToLocalImagesAsync(IFormFile formFile, int? startPageNum, int? endPageNum, List<string> imagePaths)
+    {
+        var filePath = Path.GetTempFileName();
+        try
+        {
+            using (var stream = System.IO.File.Create(filePath))
+            {
+                await formFile.CopyToAsync(stream);
+            }
+
+            using var images = new MagickImageCollection();
+            images.Read(filePath, new MagickReadSettings
+            {
+                Density = new Density(300, 300)
+            });
+
+            if (images.Count == 0)
+            {
+                throw new InvalidDataException("PDF loading failed. Please check if the PDF format is correct!");
+            }
+
+            // Requested page numbers are 1-based; clamp them to the pages that were actually read.
+            int firstPage = Math.Max(startPageNum ?? 1, 1);
+            int lastPage = Math.Min(endPageNum ?? images.Count, images.Count);
+
+            Directory.CreateDirectory(_paddleSharpSettings.tempFolderPath);
+
+            for (int page = firstPage; page <= lastPage; page++)
+            {
+                string tempFileName = Path.ChangeExtension(Path.GetRandomFileName(), "png");
+                string imagePath = Path.Combine(_paddleSharpSettings.tempFolderPath, tempFileName);
+
+                // Track the path before writing so a failed write is still cleaned up; the collection is 0-based.
+                imagePaths.Add(imagePath);
+                images[page - 1].Write(imagePath);
+            }
+        }
+        finally
+        {
+            // Path.GetTempFileName creates the file on disk; remove it once the pages are rendered.
+            System.IO.File.Delete(filePath);
+        }
+    }
+}
diff --git a/src/Plugins/BotSharp.Plugin.PaddleSharp/Settings/PaddleSharpSettings.cs b/src/Plugins/BotSharp.Plugin.PaddleSharp/Settings/PaddleSharpSettings.cs
new file mode 100644
index 000000000..39c99343e
--- /dev/null
+++ b/src/Plugins/BotSharp.Plugin.PaddleSharp/Settings/PaddleSharpSettings.cs
@@ -0,0 +1,55 @@
+using System;
+using System.Collections.Generic;
+using System.Text;
+using Sdcb.PaddleOCR;
+using ImageMagick;
+using Sdcb.PaddleOCR.Models;
+using System.IO;
+using Sdcb.PaddleInference;
+
+namespace BotSharp.Plugin.PaddleSharp.Settings
+{
+    /// <summary>
+    /// Settings object bound from the "PaddleSharp" configuration section.
+    /// </summary>
+    public class PaddleSharpSettings
+    {
+        private MagickReadSettings _magicReadSettings;
+
+        public MagickReadSettings magickReadSettings { get; set; }
+        public PaddleOcrAll paddleOcrAll { get; set; }
+        /// <summary>Folder the rendered page images are written to.</summary>
+        public string tempFolderPath { get; set; } = Path.GetTempPath();
+        public PaddleOcrAll paddleSettings { get; set; }
+
+        /// <summary>
+        /// ImageMagick read settings whose density is always 300x300.
+        /// </summary>
+        /// <remarks>
+        /// Uses an explicit backing field; accessors that refer to the property itself would recurse without end.
+        /// </remarks>
+        public MagickReadSettings magicReadSettings
+        {
+            get
+            {
+                return _magicReadSettings ??= new MagickReadSettings { Density = new Density(300, 300) };
+            }
+            set
+            {
+                _magicReadSettings = value ?? new MagickReadSettings();
+                _magicReadSettings.Density = new Density(300, 300);
+            }
+        }
+
+        /// <summary>Passed as <c>consumerCount</c> when the OCR queue is created.</summary>
+        public int consumerCount { get; set; } = 1;
+        /// <summary>Passed as <c>boundedCapacity</c> when the OCR queue is created.</summary>
+        public int boundedCapacity { get; set; } = 64;
+        /// <summary>OCR regions are kept only when their score is greater than this value.</summary>
+        public double acceptScore { get; set; }
+        public Action<PaddleConfig> device { get; set; } = PaddleDevice.Mkldnn();
+        public bool allowRotateDetection { get; set; }
+        public bool enable180Classification { get; set; }
+        public bool paddleModel { get; set; } = true;
+    }
+}
diff --git a/src/WebStarter/appsettings.json b/src/WebStarter/appsettings.json
index 606ec0b26..9657f0724 100644
--- a/src/WebStarter/appsettings.json
+++ b/src/WebStarter/appsettings.json
@@ -81,14 +81,15 @@
"WeixinAppSecret": "#{WeixinAppSecret}#" }, - "KnowledgeBase": { - "VectorDb": "MemVectorDatabase", - // "VectorDb": "QdrantDb", - "TextEmbedding": "fastTextEmbeddingProvider", - // "TextEmbedding": "LLamaSharp.TextEmbeddingProvider", - "TextCompletion": "AzureOpenAI.Providers.TextCompletionProvider" - // "TextCompletion": "LLamaSharp.TextCompletionProvider" - }, + "KnowledgeBase": { + "VectorDb": "MemVectorDatabase", + // "VectorDb": "QdrantDb", + "TextEmbedding": "fastTextEmbeddingProvider", + // "TextEmbedding": "LLamaSharp.TextEmbeddingProvider", + "TextCompletion": "AzureOpenAI.Providers.TextCompletionProvider", + // "TextCompletion": "LLamaSharp.TextCompletionProvider", + "Pdf2TextConverter": "PaddleSharp.Providers.Pdf2TextConverter" + }, "PluginLoader": { "Assemblies": [