Merge pull request #1248 from krisbiradar/add-support-for-gemma-3n

martindevans · web-flow · commit 4bc90f44841e · 2025-11-13T15:32:40.000Z
Add support for gemma 3n
diff --git a/LLama.Unittest/LLamaContextTests.cs b/LLama.Unittest/LLamaContextTests.cs
@@ -13,10 +13,9 @@ public LLamaContextTests()
         {
             var @params = new ModelParams(Constants.GenerativeModelPath2)
             {
-                ContextSize = 128,
+                ContextSize = 512,
                 BatchSize = 8,
                 UBatchSize = 8,
-                SeqMax = 1,
                 VocabOnly = false,
                 GpuLayerCount = Constants.CIGpuLayerCount,
             };
@@ -33,7 +32,7 @@ public void Dispose()
         [Fact]
         public void CheckProperties()
         {
-            Assert.Equal(128u, _context.ContextSize);
+            Assert.Equal(_context.NativeHandle.MaxSeq * 256, _context.ContextSize);
             Assert.Equal(960, _context.EmbeddingSize);
             Assert.Equal(49152, _context.Vocab.Count);
         }
diff --git a/LLama.Unittest/LLamaContextWithCustomLoggerTests.cs b/LLama.Unittest/LLamaContextWithCustomLoggerTests.cs
@@ -30,7 +30,7 @@ public LLamaContextWithCustomLoggerTests()
         {
             var @params = new ModelParams(Constants.GenerativeModelPath2)
             {
-                ContextSize = 128,
+                ContextSize = 512,
                 GpuLayerCount = Constants.CIGpuLayerCount,
             };
 
@@ -55,7 +55,7 @@ public void Dispose()
         [Fact]
         public void CheckProperties()
         {
-            Assert.Equal(128u, _context.ContextSize);
+            Assert.Equal(_context.NativeHandle.MaxSeq * 256, _context.ContextSize);
             Assert.Equal(960, _context.EmbeddingSize);
             Assert.Equal(49152, _context.Vocab.Count);
         }
diff --git a/LLama.Unittest/LLamaRerankerTests.cs b/LLama.Unittest/LLamaRerankerTests.cs
@@ -20,7 +20,6 @@ public LLamaRerankerTests(ITestOutputHelper testOutputHelper)
             ContextSize = 0,
             PoolingType = LLamaPoolingType.Rank,
             GpuLayerCount = Constants.CIGpuLayerCount,
-
         };
         using var weights = LLamaWeights.LoadFromFile(@params);
         _reranker = new LLamaReranker(weights, @params);
diff --git a/LLama.Unittest/SamplingTests.cs b/LLama.Unittest/SamplingTests.cs
@@ -104,7 +104,7 @@ public void BatchedSampling()
                 }
             }
 
-            // Add " repeat" and test whether next tokens will be "this phrase forever.".
+            // Add " repeat" and test whether next tokens will be "this phrase forever."
             for (int i = 0; i < 4; i++)
             {
                 for (int b = 0; b < batch_count; b++)
diff --git a/LLama.Web/Common/ModelOptions.cs b/LLama.Web/Common/ModelOptions.cs
@@ -102,7 +102,7 @@ public class ModelOptions
         public bool NoKqvOffload { get; set; }
 
         /// <inheritdoc />
-        public bool FlashAttention { get; set; }
+        public bool? FlashAttention { get; set; }
 
         /// <inheritdoc />
         public Encoding Encoding { get; set; } = Encoding.UTF8;
diff --git a/LLama/Abstractions/IContextParams.cs b/LLama/Abstractions/IContextParams.cs
@@ -106,8 +106,8 @@ public interface IContextParams
     /// <summary>
     /// Whether to use flash attention
     /// </summary>
-    bool FlashAttention { get; }
-
+    bool? FlashAttention { get; }
+    
     /// <summary>
     /// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
     /// </summary>
diff --git a/LLama/Common/ModelParams.cs b/LLama/Common/ModelParams.cs
@@ -1,3 +1,4 @@
+using System;
 using LLama.Abstractions;
 using System.Text;
 using System.Text.Json.Serialization;
@@ -95,12 +96,12 @@ public record ModelParams
 
         /// <inheritdoc />
         public bool NoKqvOffload { get; set; }
-
+        
         /// <inheritdoc />
-
-        public bool FlashAttention { get; set; }
+        public bool? FlashAttention { get; set; }
 
         /// <inheritdoc />
+        [Obsolete]
         public float? DefragThreshold { get; set; }
 
         /// <inheritdoc />
diff --git a/LLama/Extensions/IContextParamsExtensions.cs b/LLama/Extensions/IContextParamsExtensions.cs
@@ -37,7 +37,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
             result.yarn_beta_slow = @params.YarnBetaSlow ?? 1f;
             result.yarn_orig_ctx = @params.YarnOriginalContext ?? 0;
             result.rope_scaling_type = @params.YarnScalingType ?? RopeScalingType.Unspecified;
-
+            
             result.defrag_threshold = @params.DefragThreshold ?? -1;
 
             result.cb_eval = IntPtr.Zero;
@@ -49,9 +49,16 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
             result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
             result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
             result.offload_kqv = !@params.NoKqvOffload;
-            result.flash_attention = @params.FlashAttention;
             result.llama_pooling_type = @params.PoolingType;
             result.attention_type = @params.AttentionType;
+            result.llama_flash_attn_type = @params.FlashAttention switch
+            {
+                true => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_ENABLED,
+                false => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_DISABLED,
+                null => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO
+            };
+            result.kv_unified = true;
+            result.n_seq_max = (uint)Math.Min(Math.Max(10,result.n_ctx/8),256);
 
             result.n_threads = Threads(@params.Threads);
             result.n_threads_batch = Threads(@params.BatchThreads);
diff --git a/LLama/LLamaSharp.csproj b/LLama/LLamaSharp.csproj
@@ -57,7 +57,7 @@
   </ItemGroup>
 
   <PropertyGroup>
-    <BinaryReleaseId>11dd5a44eb180e</BinaryReleaseId>
+    <BinaryReleaseId>86587da</BinaryReleaseId>
   </PropertyGroup>
 
   <PropertyGroup>
diff --git a/LLama/Native/LLamaContextParams.cs b/LLama/Native/LLamaContextParams.cs
@@ -64,6 +64,11 @@ public struct LLamaContextParams
         /// Attention type to use for embeddings
         /// </summary>
         public LLamaAttentionType attention_type;
+
+        /// <summary>
+        /// When to enable flash attention (auto, disabled or enabled)
+        /// </summary>
+        public LLamaFlashAttentionType llama_flash_attn_type;
         
         /// <summary>
         /// RoPE base frequency, 0 = from model
diff --git a/LLama/Native/LLamaFlashAttentionType.cs b/LLama/Native/LLamaFlashAttentionType.cs
@@ -0,0 +1,19 @@
+﻿namespace LLama.Native;
+/// <summary>
+/// Selects when flash attention is used (flash_attn_type)
+/// </summary>
+public enum LLamaFlashAttentionType
+{
+    /// <summary>
+    /// Automatically decide whether to use flash attention
+    /// </summary>
+    LLAMA_FLASH_ATTENTION_TYPE_AUTO = -1,
+    /// <summary>
+    /// Flash attention disabled
+    /// </summary>
+    LLAMA_FLASH_ATTENTION_TYPE_DISABLED = 0,
+    /// <summary>
+    /// Flash attention enabled
+    /// </summary>
+    LLAMA_FLASH_ATTENTION_TYPE_ENABLED = 1,
+}
diff --git a/LLama/Native/LLamaFtype.cs b/LLama/Native/LLamaFtype.cs
@@ -201,7 +201,12 @@ public enum LLamaFtype
         /// except 1d tensors
         /// </summary>
         LLAMA_FTYPE_MOSTLY_TQ2_0 = 37,
-
+        
+        /// <summary>
+        /// except 1d tensors 
+        /// </summary>
+        LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38,
+        
         /// <summary>
         /// File type was not specified
         /// </summary>
diff --git a/LLama/Native/LLamaModelParams.cs b/LLama/Native/LLamaModelParams.cs
@@ -100,7 +100,16 @@ public bool check_tensors
             set => _check_tensors = Convert.ToSByte(value);
         }
         private sbyte _check_tensors;
-
+        
+        /// <summary>
+        /// use extra buffer types (used for weight repacking) 
+        /// </summary>
+        public bool use_extra_bufts
+        {
+            readonly get => Convert.ToBoolean(_use_extra_bufts);
+            set => _use_extra_bufts = Convert.ToSByte(value);
+        }
+        private sbyte _use_extra_bufts;
         /// <summary>
         /// Create a LLamaModelParams with default values
         /// </summary>
diff --git a/LLama/Native/NativeApi.cs b/LLama/Native/NativeApi.cs
@@ -179,7 +179,7 @@ public static unsafe int llama_chat_apply_template(byte* tmpl, LLamaChatMessage*
         {
             return internal_llama_chat_apply_template(tmpl, chat, n_msg, add_ass, buf, length);
 
-            [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_chat_apply_template")]
+            [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl,EntryPoint = "llama_chat_apply_template")]
             static extern int internal_llama_chat_apply_template(byte* tmpl, LLamaChatMessage* chat, nuint n_msg, [MarshalAs(UnmanagedType.U1)] bool add_ass, byte* buf, int length);
         }
 
@@ -215,7 +215,8 @@ public static unsafe int llama_chat_apply_template(byte* tmpl, LLamaChatMessage*
         /// <param name="lstrip">User can skip up to 'lstrip' leading spaces before copying (useful when encoding/decoding multiple tokens with 'add_space_prefix')</param>
         /// <param name="special">If true, special tokens are rendered in the output</param>
         /// <returns>The length written, or if the buffer is too small a negative that indicates the length required</returns>
-        public static int llama_token_to_piece(SafeLlamaModelHandle.Vocabulary vocab, LLamaToken llamaToken, Span<byte> buffer, int lstrip, bool special)
+        public static int llama_token_to_piece(SafeLlamaModelHandle.Vocabulary vocab, LLamaToken llamaToken,
+            Span<byte> buffer, int lstrip, bool special)
         {
             // Handle invalid tokens
             if ((int)llamaToken < 0)
@@ -225,12 +226,14 @@ public static int llama_token_to_piece(SafeLlamaModelHandle.Vocabulary vocab, LL
             {
                 fixed (byte* bufferPtr = buffer)
                 {
-                    return llama_token_to_piece_native(vocab.VocabNative, llamaToken, bufferPtr, buffer.Length, lstrip, special);
+                    return llama_token_to_piece_native(vocab.VocabNative, llamaToken, bufferPtr, buffer.Length, lstrip,
+                        special);
                 }
             }
 
             [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl, EntryPoint = "llama_token_to_piece")]
-            static extern unsafe int llama_token_to_piece_native(LLamaVocabNative* model, LLamaToken llamaToken, byte* buffer, int length, int lstrip, [MarshalAs(UnmanagedType.U1)] bool special);
+            static extern unsafe int llama_token_to_piece_native(LLamaVocabNative* model, LLamaToken llamaToken,
+                byte* buffer, int length, int lstrip, [MarshalAs(UnmanagedType.U1)] bool special);
         }
 
         /// <summary>
@@ -247,7 +250,9 @@ public static int llama_token_to_piece(SafeLlamaModelHandle.Vocabulary vocab, LL
         /// Returns a negative number on failure - the number of tokens that would have been returned. Returns INT32_MIN on overflow (e.g., tokenization result size exceeds int32_t limit)
         /// </returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        internal static extern unsafe int llama_tokenize(LLamaVocabNative* model, byte* text, int text_len, LLamaToken* tokens, int n_max_tokens, [MarshalAs(UnmanagedType.U1)] bool add_special, [MarshalAs(UnmanagedType.U1)] bool parse_special);
+        internal static extern unsafe int llama_tokenize(LLamaVocabNative* model, byte* text, int text_len,
+            LLamaToken* tokens, int n_max_tokens, [MarshalAs(UnmanagedType.U1)] bool add_special,
+            [MarshalAs(UnmanagedType.U1)] bool parse_special);
 
         /// <summary>
         /// Convert the provided tokens into text (inverse of llama_tokenize()).
@@ -261,7 +266,8 @@ public static int llama_token_to_piece(SafeLlamaModelHandle.Vocabulary vocab, LL
         /// <param name="unparseSpecial">unparse_special If true, special tokens are rendered in the output.</param>
         /// <returns>Returns the number of chars/bytes on success, no more than textLengthMax. Returns a negative number on failure - the number of chars/bytes that would have been returned.</returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        internal static extern unsafe int llama_detokenize(LLamaVocabNative* model, LLamaToken* tokens, int nTokens, byte* textOut, int textLengthMax, bool removeSpecial, bool unparseSpecial);
+        internal static extern unsafe int llama_detokenize(LLamaVocabNative* model, LLamaToken* tokens, int nTokens,
+            byte* textOut, int textLengthMax, bool removeSpecial, bool unparseSpecial);
 
         /// <summary>
         /// Register a callback to receive llama log messages
@@ -272,7 +278,7 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
         {
             NativeLogConfig.llama_log_set(logCallback);
         }
-        
+
         /// <summary>
         /// Allocates a batch of tokens on the heap
         /// Each token can be assigned up to n_seq_max sequence ids
@@ -311,7 +317,8 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
         /// <param name="il_end"></param>
         /// <returns></returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern unsafe int llama_apply_adapter_cvec(SafeLLamaContextHandle ctx, float* data, nuint len, int n_embd, int il_start, int il_end);
+        public static extern unsafe int llama_apply_adapter_cvec(SafeLLamaContextHandle ctx, float* data, nuint len,
+            int n_embd, int il_start, int il_end);
 
         /// <summary>
         /// Build a split GGUF final path for this chunk.
@@ -324,7 +331,8 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
         /// <param name="split_count"></param>
         /// <returns>Returns the split_path length.</returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern int llama_split_path(string split_path, nuint maxlen, string path_prefix, int split_no, int split_count);
+        public static extern int llama_split_path(string split_path, nuint maxlen, string path_prefix, int split_no,
+            int split_count);
 
         /// <summary>
         /// Extract the path prefix from the split_path if and only if the split_no and split_count match.
@@ -337,7 +345,8 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
         /// <param name="split_count"></param>
         /// <returns>Returns the split_prefix length.</returns>
         [DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
-        public static extern int llama_split_prefix(string split_prefix, nuint maxlen, string split_path, int split_no, int split_count);
+        public static extern int llama_split_prefix(string split_prefix, nuint maxlen, string split_path, int split_no,
+            int split_count);
 
         //[DllImport(libraryName, CallingConvention = CallingConvention.Cdecl)]
         //todo: public static void llama_attach_threadpool(SafeLLamaContextHandle ctx, ggml_threadpool_t threadpool, ggml_threadpool_t threadpool_batch);
@@ -380,5 +389,41 @@ public static void llama_log_set(NativeLogConfig.LLamaLogCallback logCallback)
         /// <returns>Name of the buffer type</returns>
         [DllImport(ggmlBaseLibraryName, CallingConvention = CallingConvention.Cdecl)]
         public static extern IntPtr ggml_backend_buft_name(IntPtr buft);
+
+        /// <summary>
+        /// Get the size needed to store the state of a single sequence (extended variant taking flags)
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <param name="seq_id"></param>
+        /// <param name="flags"></param>
+        /// <returns></returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern UIntPtr llama_state_seq_get_size_ext(IntPtr ctx, int seq_id, uint flags);
+
+        /// <summary>
+        /// Copy the state of a single sequence into the destination buffer (extended variant taking flags)
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <param name="dst"></param>
+        /// <param name="size"></param>
+        /// <param name="seq_id"></param>
+        /// <param name="flags"></param>
+        /// <returns></returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern UIntPtr llama_state_seq_get_data_ext(IntPtr ctx, [Out] byte[] dst, UIntPtr size,
+            int seq_id, uint flags);
+
+        /// <summary>
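+        /// Load the state of a single sequence from the source buffer into the destination sequence (extended variant taking flags)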
+        /// </summary>
+        /// <param name="ctx"></param>
+        /// <param name="src"></param>
+        /// <param name="size"></param>
+        /// <param name="dest_seq_id"></param>
+        /// <param name="flags"></param>
+        /// <returns></returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        public static extern UIntPtr llama_state_seq_set_data_ext(IntPtr ctx, byte[] src, UIntPtr size, int dest_seq_id,
+            uint flags);
     }
-}
+}
diff --git a/LLama/Native/SafeLLamaContextHandle.cs b/LLama/Native/SafeLLamaContextHandle.cs
@@ -33,6 +33,11 @@ public sealed class SafeLLamaContextHandle
         /// Get the physical maximum batch size for this context
         /// </summary>
         public uint UBatchSize => llama_n_ubatch(this);
+        
+        /// <summary>
+        /// Get the maximum number of sequences allowed
+        /// </summary>
+        public uint MaxSeq => NativeApi.llama_n_seq_max(this);
 
         /// <summary>
         /// Get or set the number of threads used for generation of a single token.
@@ -341,6 +346,47 @@ static SafeLLamaContextHandle()
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern int llama_set_adapter_lora(SafeLLamaContextHandle context, IntPtr adapter, float scale);
 
+        /// <summary>
+        /// Get metadata value as a string by key name
+        /// </summary>
+        /// <param name="adapter"></param>
+        /// <param name="key"></param>
+        /// <param name="buf"></param>
+        /// <param name="buf_size"></param>
+        /// <returns>The length of the value string on success, -1 otherwise</returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern int llama_adapter_meta_val_str(IntPtr adapter, string key, StringBuilder buf, UIntPtr buf_size);
+        
+        /// <summary>
+        /// Get the number of metadata key value pairs
+        /// </summary>
+        /// <param name="adapter"></param>
+        /// <returns>The count of meta key value pairs</returns>
+        [DllImport(NativeApi.libraryName, CallingConvention =  CallingConvention.Cdecl)]
+        private static extern int llama_adapter_meta_count(IntPtr adapter);
+        
+        /// <summary>
+        /// Get metadata key name by index
+        /// </summary>
+        /// <param name="adapter"></param>
+        /// <param name="i"></param>
+        /// <param name="buf"></param>
+        /// <param name="buf_size"></param>
+        /// <returns>The length of the metadata key string on success, -1 otherwise</returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern int llama_adapter_meta_key_by_index(IntPtr adapter, int i, StringBuilder buf, UIntPtr buf_size);
+        
+        /// <summary>
+        /// Get metadata value as a string by index
+        /// </summary>
+        /// <param name="adapter"></param>
+        /// <param name="i"></param>
+        /// <param name="buf"></param>
+        /// <param name="buf_size"></param>
+        /// <returns>The length of the value string on success, -1 otherwise</returns>
+        [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
+        private static extern int llama_adapter_meta_val_by_index(IntPtr adapter, int i, StringBuilder buf,  UIntPtr buf_size);
+
         [DllImport(NativeApi.libraryName, CallingConvention = CallingConvention.Cdecl)]
         private static extern int llama_rm_adapter_lora(SafeLLamaContextHandle context, IntPtr adapter);
 
diff --git a/LLama/Native/SafeLLamaSamplerHandle.cs b/LLama/Native/SafeLLamaSamplerHandle.cs
diff --git a/LLama/Native/SafeLlamaModelHandle.cs b/LLama/Native/SafeLlamaModelHandle.cs
diff --git a/llama.cpp b/llama.cpp

Original file line number	Diff line number	Diff line change
`@@ -13,10 +13,9 @@ public LLamaContextTests()`
`13`	`13`	`{`
`14`	`14`	`var @params = new ModelParams(Constants.GenerativeModelPath2)`
`15`	`15`	`{`
`16`		`- ContextSize = 128,`
	`16`	`+ ContextSize = 512,`
`17`	`17`	`BatchSize = 8,`
`18`	`18`	`UBatchSize = 8,`
`19`		`- SeqMax = 1,`
`20`	`19`	`VocabOnly = false,`
`21`	`20`	`GpuLayerCount = Constants.CIGpuLayerCount,`
`22`	`21`	`};`
`@@ -33,7 +32,7 @@ public void Dispose()`
`33`	`32`	`[Fact]`
`34`	`33`	`public void CheckProperties()`
`35`	`34`	`{`
`36`		`- Assert.Equal(128u, _context.ContextSize);`
	`35`	`+ Assert.Equal(_context.NativeHandle.MaxSeq * 256, _context.ContextSize);`
`37`	`36`	`Assert.Equal(960, _context.EmbeddingSize);`
`38`	`37`	`Assert.Equal(49152, _context.Vocab.Count);`
`39`	`38`	`}`
Original file line number	Diff line number	Diff line change
`@@ -30,7 +30,7 @@ public LLamaContextWithCustomLoggerTests()`
`30`	`30`	`{`
`31`	`31`	`var @params = new ModelParams(Constants.GenerativeModelPath2)`
`32`	`32`	`{`
`33`		`- ContextSize = 128,`
	`33`	`+ ContextSize = 512,`
`34`	`34`	`GpuLayerCount = Constants.CIGpuLayerCount,`
`35`	`35`	`};`
`36`	`36`
`@@ -55,7 +55,7 @@ public void Dispose()`
`55`	`55`	`[Fact]`
`56`	`56`	`public void CheckProperties()`
`57`	`57`	`{`
`58`		`- Assert.Equal(128u, _context.ContextSize);`
	`58`	`+ Assert.Equal(_context.NativeHandle.MaxSeq * 256, _context.ContextSize);`
`59`	`59`	`Assert.Equal(960, _context.EmbeddingSize);`
`60`	`60`	`Assert.Equal(49152, _context.Vocab.Count);`
`61`	`61`	`}`
Original file line number	Diff line number	Diff line change
`@@ -104,7 +104,7 @@ public void BatchedSampling()`
`104`	`104`	`}`
`105`	`105`	`}`
`106`	`106`
`107`		`- // Add " repeat" and test whether next tokens will be "this phrase forever.".`
	`107`	`+ // Add " repeat" and test whether next tokens will be "this phrase forever."`
`108`	`108`	`for (int i = 0; i < 4; i++)`
`109`	`109`	`{`
`110`	`110`	`for (int b = 0; b < batch_count; b++)`