-
Notifications
You must be signed in to change notification settings - Fork 470
Microsoft.KernelMemory version 0.68+ compatibility fix #862
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
martindevans
merged 8 commits into
SciSharp:master
from
SpaceAntelope:kernel-memory-68-compatibility-fix
Jul 24, 2024
Merged
Changes from all commits
Commits
Show all changes
8 commits
Select commit
Hold shift + click to select a range
a018ea4
added ITextTokenizer.GetTokens implementation to affected generators
SpaceAntelope a2ff5fa
updated LLama.KernelMemory to use Microsoft.KernelMemory.Abstractions…
SpaceAntelope 578bfa7
updated LLama.Unittest with reference to LLama.KernelMemory
SpaceAntelope 4a9b822
added some unit tests for ITextTokenizer.GetTokens implementation
SpaceAntelope 2532afd
removed redundant .AsReadOnly, cleaned up usings
SpaceAntelope dd5ffa1
changed misleading variable name
SpaceAntelope 63b50f5
spun off unicode test cases and added short explanation of the issue …
SpaceAntelope 939d2b1
fixed spelling errors in comments
SpaceAntelope File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
using LLama.Common; | ||
using LLamaSharp.KernelMemory; | ||
using Microsoft.KernelMemory.AI; | ||
using Xunit.Abstractions; | ||
|
||
namespace LLama.Unittest.KernelMemory | ||
{ | ||
|
||
public abstract class ITextTokenizerTests | ||
{ | ||
private readonly ITestOutputHelper _testOutputHelper; | ||
|
||
#pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. | ||
protected ITextTokenizer? _generator; | ||
#pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed. | ||
|
||
protected InferenceParams _infParams; | ||
protected LLamaSharpConfig _lsConfig; | ||
|
||
public ITextTokenizerTests(ITestOutputHelper testOutputHelper) | ||
{ | ||
_testOutputHelper = testOutputHelper; | ||
|
||
_infParams = new() { AntiPrompts = ["\n\n"] }; | ||
_lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams }; | ||
|
||
testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}"); | ||
} | ||
|
||
|
||
[Theory] | ||
martindevans marked this conversation as resolved.
Show resolved
Hide resolved
|
||
[InlineData("The quick brown fox jumps over the lazy dog")] | ||
[InlineData("Well, here're some special characters!!!")] | ||
[InlineData("...___---")] | ||
[InlineData("15 + 6 = 21 && 68 * 75 = 5100")] | ||
[InlineData(" \n \r\n \t ")] | ||
public void GetTokens_ShouldReturnListOfTokensForInputString(string? text) | ||
{ | ||
var tokens = _generator!.GetTokens(text); | ||
var tokensCount = _generator.CountTokens(text); | ||
|
||
var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer | ||
var actual = string.Join("", tokens); | ||
|
||
_testOutputHelper.WriteLine($"Tokens for '{text}':"); | ||
_testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})"))); | ||
|
||
Assert.Equal(expected, actual); | ||
Assert.Equal(tokensCount, tokens.Count); | ||
} | ||
|
||
/* This is exactly the same test as the non-unicode cases. However, there are reasons why this | ||
* should be made a special case and may deviate in the future: | ||
* | ||
* As of now there appears to be no final word as to how characters that consist of more than one | ||
* numeric token should correspond to textual tokens, and results vary according to different | ||
* models' tokenizers. For example, given a character 'Z' that corresponds to the numeric tokens {1,2,3} | ||
* some (llama-2) will pad the length of the total number of tokens by returning spaces as tokens | ||
* (i.e. ' ', ' ', 'Z') while others (GPT4Tokenizer) will pad with the character itself (i.e. 'Z','Z','Z'). | ||
* | ||
* This is very evident when tokenizing ideograms and emojis, but can arise with various unicode characters | ||
* as well. See pull request for more relevant discussion https://github.com/SciSharp/LLamaSharp/pull/862 | ||
* | ||
* Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning | ||
* any redundant tokens will not be omitted as long as they are counted by CountTokens. | ||
* | ||
* StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing | ||
* output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered | ||
* an example of proper use. | ||
* | ||
* Note: if this message is removed, also remove references to it in LLamaSharpTextEmbeddingGenerator.GetTokens | ||
* and LLamaSharpTextGenerator.GetTokens | ||
*/ | ||
[Theory] | ||
[InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")] | ||
[InlineData("猫坐在垫子上 😀🤨🤐😏")] | ||
public void GetTokens_Unicode_ShouldReturnListOfTokensForInputString(string? text) | ||
{ | ||
var tokens = _generator!.GetTokens(text); | ||
var tokensCount = _generator.CountTokens(text); | ||
|
||
var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer | ||
var actual = string.Join("", tokens); | ||
|
||
_testOutputHelper.WriteLine($"Tokens for '{text}':"); | ||
_testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})"))); | ||
|
||
Assert.Equal(expected, actual); | ||
Assert.Equal(tokensCount, tokens.Count); | ||
} | ||
|
||
[Fact] | ||
public void GetToken_ShouldThrowForNull() | ||
{ | ||
string? text = null; | ||
|
||
Assert.Throws<ArgumentNullException>(() => { _generator!.GetTokens(text!); }); | ||
} | ||
|
||
[Fact] | ||
public void GetToken_EmptyStringYieldsOneEmptyToken() | ||
{ | ||
var text = ""; | ||
var expected = ""; | ||
|
||
var tokens = _generator!.GetTokens(text); | ||
var tokensCount = _generator.CountTokens(text); | ||
var actual = tokens.Single(); | ||
|
||
_testOutputHelper.WriteLine($"Tokens for '{text}':"); | ||
_testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})"))); | ||
|
||
Assert.Equal(expected, actual); | ||
Assert.Equal(tokensCount, tokens.Count); | ||
} | ||
} | ||
} |
30 changes: 30 additions & 0 deletions
30
LLama.Unittest/KernelMemory/LLamaSharpTextEmbeddingGeneratorTests.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
using LLama.Common; | ||
using LLamaSharp.KernelMemory; | ||
using Microsoft.KernelMemory.AI; | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Linq; | ||
using System.Text; | ||
using System.Text.RegularExpressions; | ||
using System.Threading.Tasks; | ||
using Xunit.Abstractions; | ||
|
||
namespace LLama.Unittest.KernelMemory | ||
{ | ||
public class LLamaSharpTextEmbeddingGeneratorTests : ITextTokenizerTests, IDisposable | ||
{ | ||
private readonly LLamaSharpTextEmbeddingGenerator _embeddingGenerator; | ||
|
||
public LLamaSharpTextEmbeddingGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) | ||
{ | ||
_embeddingGenerator = new LLamaSharpTextEmbeddingGenerator(_lsConfig); | ||
|
||
_generator = _embeddingGenerator; | ||
} | ||
|
||
public void Dispose() | ||
{ | ||
_embeddingGenerator.Dispose(); | ||
} | ||
} | ||
} |
34 changes: 34 additions & 0 deletions
34
LLama.Unittest/KernelMemory/LlamaSharpTextGeneratorTests.cs
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
using LLama.Common; | ||
using LLamaSharp.KernelMemory; | ||
using Microsoft.KernelMemory.AI; | ||
using System; | ||
using System.Collections.Generic; | ||
using System.Diagnostics; | ||
using System.Linq; | ||
using System.Reflection.Emit; | ||
using System.Text; | ||
using System.Text.RegularExpressions; | ||
using System.Threading.Tasks; | ||
using Xunit.Abstractions; | ||
using Xunit.Sdk; | ||
using static System.Net.Mime.MediaTypeNames; | ||
|
||
namespace LLama.Unittest.KernelMemory | ||
{ | ||
public class LlamaSharpTextGeneratorTests : ITextTokenizerTests, IDisposable | ||
{ | ||
private readonly LlamaSharpTextGenerator _textGenerator; | ||
|
||
public LlamaSharpTextGeneratorTests(ITestOutputHelper testOutputHelper) : base(testOutputHelper) | ||
{ | ||
_textGenerator = new LlamaSharpTextGenerator(_lsConfig); | ||
|
||
_generator = _textGenerator; | ||
} | ||
|
||
public void Dispose() | ||
{ | ||
_textGenerator.Dispose(); | ||
} | ||
} | ||
} |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.