| 
 | 1 | +using LLama.Common;  | 
 | 2 | +using LLamaSharp.KernelMemory;  | 
 | 3 | +using Microsoft.KernelMemory.AI;  | 
 | 4 | +using Xunit.Abstractions;  | 
 | 5 | + | 
 | 6 | +namespace LLama.Unittest.KernelMemory  | 
 | 7 | +{  | 
 | 8 | +      | 
 /// <summary>
 /// Shared xUnit test suite for <see cref="ITextTokenizer"/> implementations.
 /// </summary>
 /// <remarks>
 /// This base class never assigns <see cref="_generator"/>; every test dereferences it,
 /// so a concrete subclass is expected to set it before the tests run.
 /// </remarks>
 public abstract class ITextTokenizerTests
 {
     private readonly ITestOutputHelper _testOutputHelper;

 #pragma warning disable KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.
     /// <summary>Tokenizer under test. Not assigned by this base class.</summary>
     protected ITextTokenizer? _generator;
 #pragma warning restore KMEXP00 // Type is for evaluation purposes only and is subject to change or removal in future updates. Suppress this diagnostic to proceed.

     /// <summary>Inference parameters used as the defaults of <see cref="_lsConfig"/>.</summary>
     protected InferenceParams _infParams;

     /// <summary>LLamaSharp configuration pointing at <c>Constants.GenerativeModelPath</c>.</summary>
     protected LLamaSharpConfig _lsConfig;

     /// <summary>
     /// Builds the shared inference parameters and model configuration, and logs which model file is in use.
     /// </summary>
     /// <param name="testOutputHelper">xUnit sink that receives the diagnostic output of every test.</param>
     // Protected rather than public: an abstract type can only be constructed through a derived class.
     protected ITextTokenizerTests(ITestOutputHelper testOutputHelper)
     {
         _testOutputHelper = testOutputHelper;

         _infParams = new() { AntiPrompts = ["\n\n"] };
         _lsConfig = new(Constants.GenerativeModelPath) { DefaultInferenceParams = _infParams };

         _testOutputHelper.WriteLine($"Using model {Path.GetFileName(_lsConfig.ModelPath)}");
     }

     /// <summary>
     /// The tokens returned for plain ASCII input must concatenate back to the input text
     /// and agree in number with <c>CountTokens</c>.
     /// </summary>
     /// <param name="text">Input text to tokenize.</param>
     [Theory]
     [InlineData("The quick brown fox jumps over the lazy dog")]
     [InlineData("Well, here're some special characters!!!")]
     [InlineData("...___---")]
     [InlineData("15 + 6 = 21 && 68 * 75 = 5100")]
     [InlineData("  \n  \r\n  \t   ")]
     public void GetTokens_ShouldReturnListOfTokensForInputString(string text)
     {
         AssertTokensReproduceText(text);
     }

     /* This is exactly the same test as the non-unicode cases. However, there are reasons why this
      * should be made a special case and may deviate in the future:
      *
      * As of now there appears to be no final word as to how characters that consist of more than one
      * numeric token should correspond to textual tokens, and results vary according to different
      * models' tokenizers. For example, given a character 'Z' that corresponds to the numeric tokens {1,2,3}
      * some (llama-2) will pad the length of the total number of tokens by returning spaces as tokens
      * (i.e. ' ', ' ', 'Z') while others (GPT4Tokenizer) will pad with the character itself (i.e. 'Z','Z','Z').
      *
      * This is very evident when tokenizing ideograms and emojis, but can arise with various unicode characters
      * as well. See pull request for more relevant discussion https://github.com/SciSharp/LLamaSharp/pull/862
      *
      * Currently the method will remain consistent with the output of ITextTokenizer.CountTokens, meaning
      * any redundant tokens will not be omitted as long as they are counted by CountTokens.
      *
      * StreamingTokenDecoder, while sufficiently useful for this task, was not designed with producing
      * output for one numeric token at a time in mind, so ITextTokenizer.GetTokens should not be considered
      * an example of proper use.
      *
      * Note: if this message is removed, also remove references to it in LLamaSharpTextEmbeddingGenerator.GetTokens
      * and LLamaSharpTextGenerator.GetTokens
      */
     /// <summary>
     /// Same expectations as the ASCII theory, kept as a separate test so the Unicode case can diverge later.
     /// </summary>
     /// <param name="text">Input text containing multi-byte characters.</param>
     [Theory]
     [InlineData("And a little bit of unicode για να κρατήσουμε τα πράγματα ενδιαφέροντα")]
     [InlineData("猫坐在垫子上 😀🤨🤐😏")]
     public void GetTokens_Unicode_ShouldReturnListOfTokensForInputString(string text)
     {
         AssertTokensReproduceText(text);
     }

     /// <summary>
     /// Passing <c>null</c> to <c>GetTokens</c> must raise <see cref="ArgumentNullException"/>.
     /// </summary>
     [Fact]
     public void GetToken_ShouldThrowForNull()
     {
         string? text = null;

         Assert.Throws<ArgumentNullException>(() => { _generator!.GetTokens(text!); });
     }

     /// <summary>
     /// An empty input must yield exactly one token, and that token must itself be the empty string.
     /// </summary>
     [Fact]
     public void GetToken_EmptyStringYieldsOneEmptyToken()
     {
         var text = "";
         var expected = "";

         var tokens = _generator!.GetTokens(text);
         var tokensCount = _generator.CountTokens(text);

         // Log before asserting so the token dump is available when the test fails.
         WriteTokens(text, tokens);

         // Assert.Single reports the actual collection on failure instead of a bare InvalidOperationException.
         var actual = Assert.Single(tokens);
         Assert.Equal(expected, actual);
         Assert.Equal(tokensCount, tokens.Count);
     }

     /// <summary>
     /// Tokenizes <paramref name="text"/> and asserts that the concatenated tokens equal the text
     /// prefixed with a single space, and that the token count matches <c>CountTokens</c>.
     /// </summary>
     /// <param name="text">Input text to tokenize.</param>
     private void AssertTokensReproduceText(string text)
     {
         var tokens = _generator!.GetTokens(text);
         var tokensCount = _generator.CountTokens(text);

         var expected = " " + text; // the placement of the space corresponding to BOS will vary by model tokenizer
         var actual = string.Join("", tokens);

         WriteTokens(text, tokens);

         Assert.Equal(expected, actual);
         Assert.Equal(tokensCount, tokens.Count);
     }

     /// <summary>
     /// Writes the input text followed by every token wrapped in parentheses to the test output.
     /// </summary>
     /// <param name="text">Text that was tokenized.</param>
     /// <param name="tokens">Textual tokens produced for <paramref name="text"/>.</param>
     private void WriteTokens(string text, IEnumerable<string> tokens)
     {
         _testOutputHelper.WriteLine($"Tokens for '{text}':");
         _testOutputHelper.WriteLine(string.Join("", tokens.Select(x => $"({x})")));
     }
 }
 | 117 | +}  | 
0 commit comments