1 change: 0 additions & 1 deletion LLama.KernelMemory/BuilderExtensions.cs
@@ -77,7 +77,6 @@ public static IKernelMemoryBuilder WithLLamaSharpDefaults(this IKernelMemoryBuilder
SplitMode = config.SplitMode,
BatchSize = 512,
UBatchSize = 512,
- FlashAttention = true,
UseMemorymap = true
};

2 changes: 0 additions & 2 deletions LLama.KernelMemory/LLamaSharpTextEmbeddingGenerator.cs
@@ -40,7 +40,6 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config)
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
- FlashAttention = true,
UseMemorymap = true,
PoolingType = LLamaPoolingType.Mean,
};
@@ -68,7 +67,6 @@ public LLamaSharpTextEmbeddingGenerator(LLamaSharpConfig config, LLamaWeights weights
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
- FlashAttention = true,
UseMemorymap = true,
PoolingType = LLamaPoolingType.Mean,
};
2 changes: 0 additions & 2 deletions LLama.KernelMemory/LlamaSharpTextGenerator.cs
@@ -38,7 +38,6 @@ public LlamaSharpTextGenerator(LLamaSharpConfig config)
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
- FlashAttention = true,
UseMemorymap = true
};
_weights = LLamaWeights.LoadFromFile(@params);
@@ -66,7 +65,6 @@ public LlamaSharpTextGenerator(LLamaWeights weights, LLamaSharpConfig config, St
SplitMode = config?.SplitMode ?? LLama.Native.GPUSplitMode.Layer,
BatchSize = 512,
UBatchSize = 512,
- FlashAttention = true,
UseMemorymap = true
};
_executor = executor ?? new StatelessExecutor(_weights, @params);
4 changes: 2 additions & 2 deletions LLama.Unittest/LLamaContextTests.cs
@@ -13,7 +13,7 @@ public LLamaContextTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath2)
{
- ContextSize = 128,
+ ContextSize = 512,
BatchSize = 8,
UBatchSize = 8,
SeqMax = 1,
@@ -33,7 +33,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
- Assert.Equal(128u, _context.ContextSize);
+ Assert.Equal(512u, _context.ContextSize);
Assert.Equal(960, _context.EmbeddingSize);
Assert.Equal(49152, _context.Vocab.Count);
}
4 changes: 2 additions & 2 deletions LLama.Unittest/LLamaContextWithCustomLoggerTests.cs
@@ -30,7 +30,7 @@ public LLamaContextWithCustomLoggerTests()
{
var @params = new ModelParams(Constants.GenerativeModelPath2)
{
- ContextSize = 128,
+ ContextSize = 512,
GpuLayerCount = Constants.CIGpuLayerCount,
};

@@ -55,7 +55,7 @@ public void Dispose()
[Fact]
public void CheckProperties()
{
- Assert.Equal(128u, _context.ContextSize);
+ Assert.Equal(512u, _context.ContextSize);
Assert.Equal(960, _context.EmbeddingSize);
Assert.Equal(49152, _context.Vocab.Count);
}
2 changes: 1 addition & 1 deletion LLama.Unittest/SamplingTests.cs
@@ -104,7 +104,7 @@ public void BatchedSampling()
}
}

// Add " repeat" and test whether next tokens will be "this phrase forever.".
// Add " repeat" and test whether next tokens will be "this phrase forever."
for (int i = 0; i < 4; i++)
{
for (int b = 0; b < batch_count; b++)
5 changes: 0 additions & 5 deletions LLama/Abstractions/IContextParams.cs
@@ -103,11 +103,6 @@ public interface IContextParams
/// </summary>
bool NoKqvOffload { get; }

- /// <summary>
- /// Whether to use flash attention
- /// </summary>
- bool FlashAttention { get; }

/// <summary>
/// defragment the KV cache if holes/size &gt; defrag_threshold, Set to &lt;= 0 to disable (default)
/// </summary>
6 changes: 2 additions & 4 deletions LLama/Common/ModelParams.cs
@@ -1,3 +1,4 @@
+ using System;
using LLama.Abstractions;
using System.Text;
using System.Text.Json.Serialization;
@@ -97,10 +98,7 @@ public record ModelParams
public bool NoKqvOffload { get; set; }

/// <inheritdoc />

- public bool FlashAttention { get; set; }
-
- /// <inheritdoc />
+ [Obsolete]
public float? DefragThreshold { get; set; }

/// <inheritdoc />
1 change: 0 additions & 1 deletion LLama/Extensions/IContextParamsExtensions.cs
@@ -49,7 +49,6 @@ public static void ToLlamaContextParams(this IContextParams @params, out LLamaContextParams
result.type_k = @params.TypeK ?? GGMLType.GGML_TYPE_F16;
result.type_v = @params.TypeV ?? GGMLType.GGML_TYPE_F16;
result.offload_kqv = !@params.NoKqvOffload;
- result.flash_attention = @params.FlashAttention;
Review comment from @Lyrcaxis (Contributor), Oct 10, 2025:

Instead of completely removing the option to use flash attention, can you pass it through to `llama_flash_attn_type`? I would suggest keeping the previous `FlashAttention` bool as it was -- but turning it nullable, so `null` == Auto.

```csharp
result.llama_flash_attn_type = @params.FlashAttention switch
{
    true => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_ENABLED,
    false => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_DISABLED,
    null => LLamaFlashAttentionType.LLAMA_FLASH_ATTENTION_TYPE_AUTO
};
result.kv_unified = true; // if we wanna hardcode it here instead of in `Default()`.
```

result.llama_pooling_type = @params.PoolingType;
result.attention_type = @params.AttentionType;

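For illustration, here is a minimal sketch of what the reviewer's suggestion could look like on the public parameter surface, assuming `FlashAttention` is reinstated as a nullable `bool?`; this snippet is hypothetical and not part of the diff:

```csharp
// Sketch only, not part of this PR: tri-state flash attention on the public
// params, to be mapped to LLamaFlashAttentionType as in the snippet above.
public interface IContextParams
{
    /// <summary>
    /// Whether to use flash attention:
    /// true = force on, false = force off, null = let llama.cpp decide (Auto).
    /// </summary>
    bool? FlashAttention { get; }
}
```

With that shape, existing callers that set `FlashAttention = true` keep compiling, while the default `null` defers to llama.cpp's auto heuristic.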
2 changes: 1 addition & 1 deletion LLama/LLamaSharp.csproj
@@ -57,7 +57,7 @@
</ItemGroup>

<PropertyGroup>
- <BinaryReleaseId>11dd5a44eb180e</BinaryReleaseId>
+ <BinaryReleaseId>86587da</BinaryReleaseId>
</PropertyGroup>

<PropertyGroup>
5 changes: 5 additions & 0 deletions LLama/Native/LLamaContextParams.cs
@@ -64,6 +64,11 @@ public struct LLamaContextParams
/// Attention type to use for embeddings
/// </summary>
public LLamaAttentionType attention_type;

+ /// <summary>
+ /// When to enable flash attention
+ /// </summary>
+ public LLamaFlashAttentionType llama_flash_attn_type;

/// <summary>
/// RoPE base frequency, 0 = from model
19 changes: 19 additions & 0 deletions LLama/Native/LLamaFlashAttentionType.cs
@@ -0,0 +1,19 @@
+ namespace LLama.Native;
+ /// <summary>
+ /// Flash attention mode (llama_flash_attn_type)
+ /// </summary>
+ public enum LLamaFlashAttentionType
+ {
+     /// <summary>
+     /// Let llama.cpp decide whether to use flash attention
+     /// </summary>
+     LLAMA_FLASH_ATTENTION_TYPE_AUTO = -1,
+     /// <summary>
+     /// Flash attention disabled
+     /// </summary>
+     LLAMA_FLASH_ATTENTION_TYPE_DISABLED = 0,
+     /// <summary>
+     /// Flash attention enabled
+     /// </summary>
+     LLAMA_FLASH_ATTENTION_TYPE_ENABLED = 1,
+ }
7 changes: 6 additions & 1 deletion LLama/Native/LLamaFtype.cs
@@ -201,7 +201,12 @@ public enum LLamaFtype
/// except 1d tensors
/// </summary>
LLAMA_FTYPE_MOSTLY_TQ2_0 = 37,


+ /// <summary>
+ /// except 1d tensors
+ /// </summary>
+ LLAMA_FTYPE_MOSTLY_MXFP4_MOE = 38,

/// <summary>
/// File type was not specified
/// </summary>
11 changes: 10 additions & 1 deletion LLama/Native/LLamaModelParams.cs
@@ -100,7 +100,16 @@ public bool check_tensors
set => _check_tensors = Convert.ToSByte(value);
}
private sbyte _check_tensors;


+ /// <summary>
+ /// Use extra buffer types (used for weight repacking)
+ /// </summary>
+ public bool use_extra_bufts
+ {
+     readonly get => Convert.ToBoolean(_use_extra_bufts);
+     set => _use_extra_bufts = Convert.ToSByte(value);
+ }
+ private sbyte _use_extra_bufts;
/// <summary>
/// Create a LLamaModelParams with default values
/// </summary>
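As with `check_tensors` just above it, the new property wraps a private `sbyte` field because the native struct marshals C bools as single bytes. A minimal usage sketch follows; `LLamaModelParams.Default()` is an assumption, inferred from the "Create a LLamaModelParams with default values" doc comment that follows in the file:

```csharp
// Sketch: toggle extra buffer types (weight repacking) on the model params.
var modelParams = LLamaModelParams.Default();   // assumed factory, see doc comment above
modelParams.use_extra_bufts = false;            // setter stores Convert.ToSByte(false), i.e. 0
bool useExtra = modelParams.use_extra_bufts;    // getter reads Convert.ToBoolean(_use_extra_bufts)
```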