2 changes: 1 addition & 1 deletion LLama.KernelMemory/LLamaSharp.KernelMemory.csproj
@@ -27,7 +27,7 @@
</PropertyGroup>

<ItemGroup>
-    <PackageReference Include="Microsoft.KernelMemory.Abstractions" Version="0.66.240709.1" />
+    <PackageReference Include="Microsoft.KernelMemory.Abstractions" Version="0.68.240716.1" />
</ItemGroup>

<ItemGroup>
20 changes: 19 additions & 1 deletion LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -1,6 +1,5 @@
using LLama;
using LLama.Common;
-using LLama.Native;
using Microsoft.KernelMemory;
using Microsoft.KernelMemory.AI;

@@ -112,5 +111,24 @@ public async Task<Embedding> GenerateEmbeddingAsync(string text, CancellationTok

/// <inheritdoc/>
public int CountTokens(string text) => _embedder.Context.Tokenize(text, special: true).Length;

/// <summary>
/// Get the list of tokens for the input text
/// </summary>
/// <param name="text">Input string to be tokenized</param>
/// <returns>Read-only list of tokens for the input test</returns>
/// <remarks>
/// It throws if text is null and Includes empty stop token because addBos is left true to be consistent with the CountTokens implementation.</remarks>
/// <see cref="CountTokens(string)"/>
public IReadOnlyList<string> GetTokens(string text)
{
/* see relevant unit tests for important implementation notes regarding unicode */
var context = _embedder.Context;
var numericTokens = context.Tokenize(text, special: true);
var decoder = new StreamingTokenDecoder(context);
return numericTokens
.Select(x => { decoder.Add(x); return decoder.Read(); })
.ToList();
}
}
}
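
A minimal usage sketch of the new GetTokens method, assuming a placeholder model path ("path/to/model.gguf" is illustrative, not a file from this PR); construction mirrors the unit tests added below.

using LLamaSharp.KernelMemory;

// Sketch only: the model path below is a placeholder.
var config = new LLamaSharpConfig("path/to/model.gguf");
using var embedder = new LLamaSharpTextEmbeddingGenerator(config);

var tokens = embedder.GetTokens("The quick brown fox");
var count = embedder.CountTokens("The quick brown fox");

// Both methods tokenize with special: true, so count == tokens.Count.
Console.WriteLine($"{count} tokens: {string.Join("|", tokens)}");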
18 changes: 18 additions & 0 deletions LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -106,5 +106,23 @@ private static InferenceParams OptionsToParams(TextGenerationOptions options, In

/// <inheritdoc/>
public int CountTokens(string text) => _context.Tokenize(text, special: true).Length;

/// <summary>
/// Get the list of tokens for the input text
/// </summary>
/// <param name="text">Input string to be tokenized</param>
<returns>Read-only list of tokens for the input text</returns>
/// <remarks>
/// Throws if text is null. The result may include an empty token corresponding to BOS, because addBos is left true for consistency with the CountTokens implementation.</remarks>
/// <see cref="CountTokens(string)"/>
public IReadOnlyList<string> GetTokens(string text)
{
/* see relevant unit tests for important implementation notes regarding unicode */
var numericTokens = _context.Tokenize(text, special: true);
var decoder = new StreamingTokenDecoder(_context);
return numericTokens
.Select(x => { decoder.Add(x); return decoder.Read(); })
.ToList();
}
}
}
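
Both GetTokens implementations expand to the same tokenize-then-decode loop; below is a sketch of that loop written out in place of the LINQ Select, assuming a context built from a placeholder model path (the ModelParams/LLamaWeights setup is standard LLamaSharp usage, not part of this diff).

using System.Collections.Generic;
using LLama;
using LLama.Common;

var parameters = new ModelParams("path/to/model.gguf"); // placeholder path
using var weights = LLamaWeights.LoadFromFile(parameters);
using var context = weights.CreateContext(parameters);

var numericTokens = context.Tokenize("cat sat", special: true);
var decoder = new StreamingTokenDecoder(context);

var pieces = new List<string>(numericTokens.Length);
foreach (var token in numericTokens)
{
    decoder.Add(token);          // feed one numeric token
    pieces.Add(decoder.Read());  // drain whatever text is decodable so far
}
// Characters spanning several numeric tokens may produce empty or padded
// pieces, but concatenating the pieces round-trips the input (plus any
// BOS-related leading space); see ITextTokenizerTests below.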
117 changes: 117 additions & 0 deletions LLama.Unittest/KernelMemory/ITextTokenizerTests.cs
@@ -0,0 +1,117 @@
using LLama.Common;
using LLamaSharp.KernelMemory;
using Microsoft.KernelMemory.AI;
using Xunit.Abstractions;

namespace LLama.Unittest.KernelMemory
{

public abstract class ITextTokenizerTests
{
private readonly ITestOutputHelper _testOutputHelper;

#pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
protected ITextTokenizer? _generator;
#pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.

protected InferenceParams _infParams;
protected LLamaSharpConfig _lsConfig;

public ITextTokenizerTests(ITestOutputHelper testOutputHelper)
{
_testOutputHelper = testOutputHelper;

_infParams = new() { AntiPrompts = ["\n\n"] };
_lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams };

testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}");
}


[Theory]
[InlineData("The quick brown fox jumps over the lazy dog")]
[InlineData("Well, here're some special characters!!!")]
[InlineData("...___---")]
[InlineData("15 + 6 = 21 && 68 * 75 = 5100")]
[InlineData(" \n \r\n \t ")]
public void GetTokens_ShouldReturnListOfTokensForInputString(string? text)
{
var tokens = _generator!.GetTokens(text);
var tokensCount = _generator.CountTokens(text);

var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
var actual = string.Join("", tokens);

_testOutputHelper.WriteLine($"Tokens for '{text}':");
_testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));

Assert.Equal(expected, actual);
Assert.Equal(tokensCount, tokens.Count);
}

/* This is exactly the same test as the non-unicode cases. However, there are reasons why this
* should be made a special case and may deviate in the future:
*
* As of now there appears to be no final word as to how characters that consist of more than one
* numeric token should correspond to textual tokens, and results vary according to different
* models' tokenizers. For example, given a character 'Z' that corresponds to the numeric tokens {1,2,3},
* some tokenizers (llama-2) will pad the output with spaces (i.e. ' ', ' ', 'Z')
* while others (GPT4Tokenizer) will pad with the character itself (i.e. 'Z', 'Z', 'Z').
*
* This is very evident when tokenizing ideograms and emojis, but can arise with various unicode characters
* as well. See pull request for more relevant discussion https://github.com/SciSharp/LLamaSharp/pull/862
*
* Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning
* any redundant tokens will not be omitted as long as they are counted by CountTokens.
*
* StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing
* output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered
* an example of proper use.
*
* Note: if this message is removed, also remove references to it in LLamaSharpTextEmbeddingGenerator.GetTokens
* and LLamaSharpTextGenerator.GetTokens
*/
[Theory]
[InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")]
[InlineData("猫坐在垫子上 😀🤨🤐😏")]
public void GetTokens_Unicode_ShouldReturnListOfTokensForInputString(string? text)
{
var tokens = _generator!.GetTokens(text);
var tokensCount = _generator.CountTokens(text);

var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
var actual = string.Join("", tokens);

_testOutputHelper.WriteLine($"Tokens for '{text}':");
_testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));

Assert.Equal(expected, actual);
Assert.Equal(tokensCount, tokens.Count);
}

[Fact]
public void GetToken_ShouldThrowForNull()
{
string? text = null;

Assert.Throws<ArgumentNullException>(() => { _generator!.GetTokens(text!); });
}

[Fact]
public void GetToken_EmptyStringYieldsOneEmptyToken()
{
var text = "";
var expected = "";

var tokens = _generator!.GetTokens(text);
var tokensCount = _generator.CountTokens(text);
var actual = tokens.Single();

_testOutputHelper.WriteLine($"Tokens for '{text}':");
_testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));

Assert.Equal(expected, actual);
Assert.Equal(tokensCount, tokens.Count);
}
}
}
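
To make the padding discussion above concrete, a hypothetical example, where generator is any of the ITextTokenizer fixtures below and 'Z' stands in for a character that maps to multiple numeric tokens (common for ideograms and emoji); actual splits and pad characters vary by model tokenizer.

var tokens = generator.GetTokens("Z");
// llama-2 style:       " ", " ", "Z"   -- extra tokens decode to spaces
// GPT4Tokenizer style: "Z", "Z", "Z"   -- extra tokens repeat the character
// Either way, tokens.Count == generator.CountTokens("Z").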
30 changes: 30 additions & 0 deletions LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
@@ -0,0 +1,30 @@
using LLama.Common;
using LLamaSharp.KernelMemory;
using Microsoft.KernelMemory.AI;
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Xunit.Abstractions;

namespace LLama.Unittest.KernelMemory
{
public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable
{
private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator;

public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
{
_embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig);

_generator = _embeddingGenerator;
}

public void Dispose()
{
_embeddingGenerator.Dispose();
}
}
}
34 changes: 34 additions & 0 deletions LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
@@ -0,0 +1,34 @@
using LLama.Common;
using LLamaSharp.KernelMemory;
using Microsoft.KernelMemory.AI;
using System;
using System.Collections.Generic;
using System.Diagnostics;
using System.Linq;
using System.Reflection.Emit;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading.Tasks;
using Xunit.Abstractions;
using Xunit.Sdk;
using static System.Net.Mime.MediaTypeNames;

namespace LLama.Unittest.KernelMemory
{
public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable
{
private readonly LlamaSharpTextGenerator _textGenerator;

public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper)
{
_textGenerator = new LlamaSharpTextGenerator(_lsConfig);

_generator = _textGenerator;
}

public void Dispose()
{
_textGenerator.Dispose();
}
}
}
30 changes: 6 additions & 24 deletions LLama.Unittest/LLama.Unittest.csproj
@@ -1,4 +1,4 @@
-<Project Sdk="Microsoft.NET.Sdk">
+<Project Sdk="Microsoft.NET.Sdk">
<Import Project="..\LLama\LLamaSharp.Runtime.targets" />
<PropertyGroup>
<TargetFramework>net8.0</TargetFramework>
@@ -29,31 +29,16 @@

<Target Name="DownloadContentFilesInner">

-  <DownloadFile
-    SourceUrl="https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_S.gguf"
-    DestinationFolder="Models"
-    DestinationFileName="llama-2-7b-chat.Q3_K_S.gguf"
-    SkipUnchangedFiles="true">
+  <DownloadFile SourceUrl="https://huggingface.co/TheBloke/Llama-2-7b-Chat-GGUF/resolve/main/llama-2-7b-chat.Q3_K_S.gguf" DestinationFolder="Models" DestinationFileName="llama-2-7b-chat.Q3_K_S.gguf" SkipUnchangedFiles="true">
</DownloadFile>

-  <DownloadFile
-    SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf"
-    DestinationFolder="Models" DestinationFileName="llava-v1.6-mistral-7b.Q3_K_XS.gguf"
-    SkipUnchangedFiles="true">
+  <DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/llava-v1.6-mistral-7b.Q3_K_XS.gguf" DestinationFolder="Models" DestinationFileName="llava-v1.6-mistral-7b.Q3_K_XS.gguf" SkipUnchangedFiles="true">
</DownloadFile>

-  <DownloadFile
-    SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf"
-    DestinationFolder="Models"
-    DestinationFileName="mmproj-model-f16.gguf"
-    SkipUnchangedFiles="true">
+  <DownloadFile SourceUrl="https://huggingface.co/cjpais/llava-1.6-mistral-7b-gguf/resolve/main/mmproj-model-f16.gguf" DestinationFolder="Models" DestinationFileName="mmproj-model-f16.gguf" SkipUnchangedFiles="true">
</DownloadFile>

-  <DownloadFile
-    SourceUrl="https://huggingface.co/leliuga/all-MiniLM-L12-v2-GGUF/resolve/main/all-MiniLM-L12-v2.Q8_0.gguf"
-    DestinationFolder="Models"
-    DestinationFileName="all-MiniLM-L12-v2.Q8_0.gguf"
-    SkipUnchangedFiles="true">
+  <DownloadFile SourceUrl="https://huggingface.co/leliuga/all-MiniLM-L12-v2-GGUF/resolve/main/all-MiniLM-L12-v2.Q8_0.gguf" DestinationFolder="Models" DestinationFileName="all-MiniLM-L12-v2.Q8_0.gguf" SkipUnchangedFiles="true">
</DownloadFile>

</Target>
@@ -63,14 +48,11 @@
</Target>

<ItemGroup>
<ProjectReference Include="..\LLama.KernelMemory\LLamaSharp.KernelMemory.csproj" />
<ProjectReference Include="..\LLama.SemanticKernel\LLamaSharp.SemanticKernel.csproj" />
<ProjectReference Include="..\LLama\LLamaSharp.csproj" />
</ItemGroup>

-  <ItemGroup>
-    <Folder Include="Models\" />
-  </ItemGroup>

<ItemGroup>
<None Update="Models\all-MiniLM-L12-v2.Q8_0.gguf">
<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>