@@ -37,7 +37,7 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
3737 result . yarn_beta_slow = @params . YarnBetaSlow ?? 1f ;
3838 result . yarn_orig_ctx = @params . YarnOriginalContext ?? 0 ;
3939 result . rope_scaling_type = @params . YarnScalingType ?? RopeScalingType . Unspecified ;
40-
40+
4141 result . defrag_threshold = @params . DefragThreshold ?? - 1 ;
4242
4343 result . cb_eval = IntPtr . Zero ;
@@ -49,9 +49,16 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaCo
4949 result . type_k = @params . TypeK ?? GGMLType . GGML_TYPE_F16 ;
5050 result . type_v = @params . TypeV ?? GGMLType . GGML_TYPE_F16 ;
5151 result . offload_kqv = ! @params . NoKqvOffload ;
52- result . flash_attention = @params . FlashAttention ;
5352 result . llama_pooling_type = @params . PoolingType ;
5453 result . attention_type = @params . AttentionType ;
54+ result . llama_flash_attn_type = @params . FlashAttention switch
55+ {
56+ true => LLamaFlashAttentionType . LLAMA_FLASH_ATTENTION_TYPE_ENABLED ,
57+ false => LLamaFlashAttentionType . LLAMA_FLASH_ATTENTION_TYPE_DISABLED ,
58+ null => LLamaFlashAttentionType . LLAMA_FLASH_ATTENTION_TYPE_AUTO
59+ } ;
60+ result . kv_unified = true ;
61+ result . n_seq_max = ( uint ) Math . Min ( Math . Max ( 10 , result . n_ctx / 8 ) , 256 ) ;
5562
5663 result . n_threads = Threads ( @params . Threads ) ;
5764 result . n_threads_batch = Threads ( @params . BatchThreads ) ;
0 commit comments