diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 2e9ac1e86..cc95273d2 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -172,7 +172,85 @@ jobs:
./build/libllama.so
# ./build/libclblast.so
name: llama-bin-linux-clblast-x64.so
-
+
+  compile-vulkan:
+    name: Compile (vulkan) - ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [
+          ubuntu-22.04,
+          windows-latest
+        ]
+    env:
+      VULKAN_VERSION: 1.3.261.1
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          repository: ggerganov/llama.cpp
+
+      - name: Download dependencies - Linux
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        run: |
+          wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt update
+          sudo apt install -y vulkan-sdk
+
+      - name: Download dependencies - Windows
+        id: get_vulkan
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: |
+          mkdir build
+          cd build
+          cmake .. ${{ env.COMMON_DEFINE }} -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          ls -R
+
+      - name: Build
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        run: |
+          mkdir build
+          cd build
+          cmake .. ${{ env.COMMON_DEFINE }} -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON
+          cmake --build . --config Release -j $(nproc)
+          # if we ever want to pull libvulkan.so back into the packages, uncomment this line, and the matching upload below
+          # cp $(ldconfig -p | grep libvulkan.so | tail -n 1 | cut -d ' ' -f 4) ./
+          ls -R
+
+      - name: Upload artifacts (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: |
+            .\build\bin\Release\llama.dll
+          name: llama-bin-win-vulkan-x64.dll
+
+      - name: Upload artifacts (Linux)
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: |
+            ./build/libllama.so
+          name: llama-bin-linux-vulkan-x64.so
+
compile-cublas:
name: Compile (cublas)
strategy:
@@ -277,7 +355,8 @@ jobs:
"compile-macos",
"compile-windows",
"compile-cublas",
- "compile-clblast"
+ "compile-clblast",
+ "compile-vulkan"
]
steps:
- uses: actions/download-artifact@v4
@@ -288,7 +367,7 @@ jobs:
- name: Rearrange Files
run: |
# Make all directories at once
- mkdir --parents deps/{avx,avx2,avx512,osx-arm64,osx-x64,cu11.7.1,cu12.1.0,clblast}
+ mkdir --parents deps/{avx,avx2,avx512,osx-arm64,osx-x64,cu11.7.1,cu12.1.0,clblast,vulkan}
cp artifacts/llama-bin-linux-noavx-x64.so/libllama.so deps/libllama.so
cp artifacts/llama-bin-linux-avx-x64.so/libllama.so deps/avx/libllama.so
@@ -312,6 +391,9 @@ jobs:
cp artifacts/llama-bin-win-clblast-x64.dll/{llama,clblast}.dll deps/clblast/
cp artifacts/llama-bin-linux-clblast-x64.so/libllama.so deps/clblast/
+ cp artifacts/llama-bin-win-vulkan-x64.dll/llama.dll deps/vulkan/
+ cp artifacts/llama-bin-linux-vulkan-x64.so/libllama.so deps/vulkan/
+
- name: Upload artifacts
uses: actions/upload-artifact@v4
with:
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index f26ad24e0..98e580bf5 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -28,6 +28,10 @@
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Link>runtimes/win-x64/native/cuda12/llama.dll</Link>
     </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/vulkan/llama.dll">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/win-x64/native/vulkan/llama.dll</Link>
+    </None>
     <None Include="$(MSBuildThisFileDirectory)runtimes/deps/clblast/llama.dll">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
@@ -53,6 +57,10 @@
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
       <Link>runtimes/linux-x64/native/cuda12/libllama.so</Link>
     </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/vulkan/libllama.so">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/linux-x64/native/vulkan/libllama.so</Link>
+    </None>
     <None Include="$(MSBuildThisFileDirectory)runtimes/deps/clblast/libllama.so">
       <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
diff --git a/LLama/Native/NativeApi.Load.cs b/LLama/Native/NativeApi.Load.cs
index 8a02d8a56..7264c65b5 100644
--- a/LLama/Native/NativeApi.Load.cs
+++ b/LLama/Native/NativeApi.Load.cs
@@ -4,6 +4,7 @@
using System.Collections.Generic;
using System.Diagnostics;
using System.IO;
+using System.Linq;
using System.Runtime.InteropServices;
using System.Text.Json;
@@ -59,6 +60,51 @@ private static void Log(string message, LogLevel level)
Console.ResetColor();
}
+        private static string GetVulkanVersion()
+        {
+            var apiVersionString = string.Empty;
+            try
+            {
+                ProcessStartInfo start = new()
+                {
+                    FileName = "vulkaninfo",
+                    Arguments = "--summary",
+                    RedirectStandardOutput = true,
+                    UseShellExecute = false,
+                    CreateNoWindow = true
+                };
+
+                using Process process = new()
+                {
+                    StartInfo = start
+                };
+                process.Start();
+
+                string output = process.StandardOutput.ReadToEnd();
+                process.WaitForExit();
+
+                // Find the first "apiVersion" line. Two output shapes are handled:
+                //   "apiVersion = 1.3.260"
+                //   "apiVersion = 4206830 (1.3.238)"
+                var lines = output.Split('\n');
+                int apiVersionLineIndex = lines.ToList().FindIndex(line => line.Contains("apiVersion"));
+                if (apiVersionLineIndex >= 0)
+                {
+                    var apiVersionLine = lines[apiVersionLineIndex];
+                    if (apiVersionLine.Contains('=') && apiVersionLine.Length > apiVersionLine.IndexOf('=') + 1)
+                    {
+                        apiVersionString = apiVersionLine.Substring(apiVersionLine.IndexOf('=') + 1).Trim();
+                        // Prefer the human-readable version inside the parentheses, when present.
+                        if (apiVersionString.Contains('(') && apiVersionString.Contains(')') && apiVersionString.IndexOf(')') > apiVersionString.IndexOf('('))
+                        {
+                            apiVersionString = apiVersionString.Substring(apiVersionString.IndexOf('(') + 1, apiVersionString.IndexOf(')') - apiVersionString.IndexOf('(') - 1);
+                        }
+                        return apiVersionString;
+                    }
+                }
+            }
+            catch
+            {
+                // vulkaninfo is unavailable or failed; treat as "no Vulkan available".
+            }
+            return apiVersionString;
+        }
+
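
A quick reviewer aid: the method above targets the two `vulkaninfo --summary` line shapes quoted in its comments. A self-contained C# sketch of just that extraction, with the sample lines treated as hypothetical input:

```csharp
using System;

Console.WriteLine(ParseApiVersion("        apiVersion = 1.3.260"));           // -> 1.3.260
Console.WriteLine(ParseApiVersion("        apiVersion = 4206830 (1.3.238)")); // -> 1.3.238

// Standalone rendition of the extraction performed by GetVulkanVersion.
static string ParseApiVersion(string line)
{
    // Take everything after '=' ...
    var value = line.Substring(line.IndexOf('=') + 1).Trim(); // "1.3.260" or "4206830 (1.3.238)"
    int open = value.IndexOf('('), close = value.IndexOf(')');
    // ... and prefer the human-readable version inside parentheses, when present.
    return (open >= 0 && close > open) ? value.Substring(open + 1, close - open - 1) : value;
}
```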
private static int GetCudaMajorVersion()
{
string? cudaPath;
@@ -209,6 +255,17 @@ private static List<string> GetLibraryTryOrder(NativeLibraryConfig.Description configuration)
// otherwise no cuda detected but allow fallback
}
+            if (configuration.UseVulkan && (platform == OSPlatform.Windows || platform == OSPlatform.Linux)) // no Vulkan backend on macOS
+            {
+                string vulkanVersion = GetVulkanVersion();
+                if (!string.IsNullOrEmpty(vulkanVersion))
+                {
+                    Log($"Detected device supporting Vulkan version {vulkanVersion}.", LogLevel.Information);
+                    string vulkanLibraryPath = $"{prefix}vulkan/{libraryNamePrefix}{libraryName}{suffix}";
+                    result.Add(vulkanLibraryPath);
+                }
+            }
+
// use cpu (or mac possibly with metal)
if (!configuration.AllowFallback && platform != OSPlatform.OSX)
{
diff --git a/LLama/Native/NativeLibraryConfig.cs b/LLama/Native/NativeLibraryConfig.cs
index ad52fc816..2bc88c667 100644
--- a/LLama/Native/NativeLibraryConfig.cs
+++ b/LLama/Native/NativeLibraryConfig.cs
@@ -25,6 +25,7 @@ public sealed class NativeLibraryConfig
private string _libraryPath = string.Empty;
private bool _useCuda = true;
+ private bool _useVulkan = true;
private AvxLevel _avxLevel;
private bool _allowFallback = true;
private bool _skipCheck = false;
@@ -69,6 +70,20 @@ public NativeLibraryConfig WithCuda(bool enable = true)
return this;
}
+    /// <summary>
+    /// Configure whether to use the Vulkan backend if possible.
+    /// </summary>
+    /// <param name="enable"></param>
+    /// <returns></returns>
+    /// <exception cref="InvalidOperationException">Thrown if `LibraryHasLoaded` is true.</exception>
+    public NativeLibraryConfig WithVulkan(bool enable = true)
+    {
+        ThrowIfLoaded();
+
+        _useVulkan = enable;
+        return this;
+    }
+
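
Usage mirrors the existing `WithCuda` switch. A minimal sketch, assuming the usual `NativeLibraryConfig.Instance` entry point, for a consumer that wants CUDA probing but not the new Vulkan probe:

```csharp
using LLama.Native;

// Must run before the first native call; once the library has loaded,
// WithVulkan (like WithCuda) throws via ThrowIfLoaded().
NativeLibraryConfig.Instance
    .WithCuda(true)
    .WithVulkan(false);
```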
     /// <summary>
     /// Configure the preferred AVX support level of the backend.
     /// </summary>
@@ -164,7 +179,8 @@ internal static Description CheckAndGatherDescription()
return new Description(
Instance._libraryPath,
             Instance._useCuda,
+            Instance._useVulkan,
Instance._avxLevel,
Instance._allowFallback,
Instance._skipCheck,
@@ -250,7 +266,7 @@ public enum AvxLevel
Avx512,
}
- internal record Description(string Path, bool UseCuda, AvxLevel AvxLevel, bool AllowFallback, bool SkipCheck, bool Logging, string[] SearchDirectories)
+    internal record Description(string Path, bool UseCuda, bool UseVulkan, AvxLevel AvxLevel, bool AllowFallback, bool SkipCheck, bool Logging, string[] SearchDirectories)
{
public override string ToString()
{
@@ -268,6 +284,7 @@ public override string ToString()
return $"NativeLibraryConfig Description:\n" +
$"- Path: {Path}\n" +
$"- PreferCuda: {UseCuda}\n" +
+ $"- PreferVulkan: {UseVulkan}\n" +
$"- PreferredAvxLevel: {avxLevelString}\n" +
$"- AllowFallback: {AllowFallback}\n" +
$"- SkipCheck: {SkipCheck}\n" +
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
new file mode 100644
index 000000000..3e45b249a
--- /dev/null
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<package >
+  <metadata>
+    <id>LLamaSharp.Backend.Vulkan</id>
+    <version>$version$</version>
+    <title>LLamaSharp.Backend.Vulkan - Vulkan Backend for LLamaSharp</title>
+    <authors>llama.cpp Authors</authors>
+    <requireLicenseAcceptance>false</requireLicenseAcceptance>
+    <license type="expression">MIT</license>
+    <icon>icon512.png</icon>
+    <projectUrl>https://github.com/SciSharp/LLamaSharp</projectUrl>
+    <description>LLamaSharp.Backend.Vulkan is a backend for LLamaSharp to use with Vulkan.</description>
+    <releaseNotes></releaseNotes>
+    <copyright>Copyright 2023 The llama.cpp Authors. All rights reserved.</copyright>
+    <tags>LLamaSharp LLama LLM GPT AI ChatBot SciSharp</tags>
+  </metadata>
+
+  <files>
+    <file src="runtimes/deps/vulkan/llama.dll" target="runtimes\win-x64\native\vulkan\llama.dll" />
+    <file src="runtimes/deps/vulkan/libllama.so" target="runtimes\linux-x64\native\vulkan\libllama.so" />
+
+    <file src="icon512.png" target="icon512.png" />
+  </files>
+</package>
diff --git a/LLama/runtimes/build/temp.csproj b/LLama/runtimes/build/temp.csproj
deleted file mode 100644
index 9f5c4f4ab..000000000
--- a/LLama/runtimes/build/temp.csproj
+++ /dev/null
@@ -1,7 +0,0 @@
-<Project Sdk="Microsoft.NET.Sdk">
-
-  <PropertyGroup>
-    <TargetFramework>netstandard2.0</TargetFramework>
-  </PropertyGroup>
-
-</Project>
diff --git a/LLama/runtimes/deps/avx/libllama.so b/LLama/runtimes/deps/avx/libllama.so
index 49a80191e..9c1e586b0 100644
Binary files a/LLama/runtimes/deps/avx/libllama.so and b/LLama/runtimes/deps/avx/libllama.so differ
diff --git a/LLama/runtimes/deps/avx/llama.dll b/LLama/runtimes/deps/avx/llama.dll
index e9924272f..6192e5395 100644
Binary files a/LLama/runtimes/deps/avx/llama.dll and b/LLama/runtimes/deps/avx/llama.dll differ
diff --git a/LLama/runtimes/deps/avx2/libllama.so b/LLama/runtimes/deps/avx2/libllama.so
index ffa59a499..ff9e73846 100644
Binary files a/LLama/runtimes/deps/avx2/libllama.so and b/LLama/runtimes/deps/avx2/libllama.so differ
diff --git a/LLama/runtimes/deps/avx2/llama.dll b/LLama/runtimes/deps/avx2/llama.dll
index 996ee0a1a..08e5e0365 100644
Binary files a/LLama/runtimes/deps/avx2/llama.dll and b/LLama/runtimes/deps/avx2/llama.dll differ
diff --git a/LLama/runtimes/deps/avx512/libllama.so b/LLama/runtimes/deps/avx512/libllama.so
index baee01e64..4f3510eb8 100644
Binary files a/LLama/runtimes/deps/avx512/libllama.so and b/LLama/runtimes/deps/avx512/libllama.so differ
diff --git a/LLama/runtimes/deps/avx512/llama.dll b/LLama/runtimes/deps/avx512/llama.dll
index 754d5cfb4..9d92e60fa 100644
Binary files a/LLama/runtimes/deps/avx512/llama.dll and b/LLama/runtimes/deps/avx512/llama.dll differ
diff --git a/LLama/runtimes/deps/clblast/libllama.so b/LLama/runtimes/deps/clblast/libllama.so
index bf2ca9e1e..8bb61ecb2 100644
Binary files a/LLama/runtimes/deps/clblast/libllama.so and b/LLama/runtimes/deps/clblast/libllama.so differ
diff --git a/LLama/runtimes/deps/clblast/llama.dll b/LLama/runtimes/deps/clblast/llama.dll
index 55d4a8517..6e45a4e5a 100644
Binary files a/LLama/runtimes/deps/clblast/llama.dll and b/LLama/runtimes/deps/clblast/llama.dll differ
diff --git a/LLama/runtimes/deps/cu11.7.1/libllama.so b/LLama/runtimes/deps/cu11.7.1/libllama.so
index e6eee3cd7..52ac0c0ac 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/libllama.so and b/LLama/runtimes/deps/cu11.7.1/libllama.so differ
diff --git a/LLama/runtimes/deps/cu11.7.1/llama.dll b/LLama/runtimes/deps/cu11.7.1/llama.dll
index e75a353ea..e3ba94e70 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/llama.dll and b/LLama/runtimes/deps/cu11.7.1/llama.dll differ
diff --git a/LLama/runtimes/deps/cu12.1.0/libllama.so b/LLama/runtimes/deps/cu12.1.0/libllama.so
index dbc7d066a..13949b5d2 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/libllama.so and b/LLama/runtimes/deps/cu12.1.0/libllama.so differ
diff --git a/LLama/runtimes/deps/cu12.1.0/llama.dll b/LLama/runtimes/deps/cu12.1.0/llama.dll
index 88ea37f83..dec5278fe 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/llama.dll and b/LLama/runtimes/deps/cu12.1.0/llama.dll differ
diff --git a/LLama/runtimes/deps/libllama.so b/LLama/runtimes/deps/libllama.so
index b9f6a8193..c97dff7fd 100644
Binary files a/LLama/runtimes/deps/libllama.so and b/LLama/runtimes/deps/libllama.so differ
diff --git a/LLama/runtimes/deps/llama.dll b/LLama/runtimes/deps/llama.dll
index 9325dadf6..5c2d6e96b 100644
Binary files a/LLama/runtimes/deps/llama.dll and b/LLama/runtimes/deps/llama.dll differ
diff --git a/LLama/runtimes/deps/osx-arm64/ggml-metal.metal b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
index efed6ad46..09ebcc9e3 100644
--- a/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
+++ b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
@@ -351,12 +351,17 @@ kernel void kernel_sum_rows(
kernel void kernel_soft_max(
device const float * src0,
device const float * src1,
+ device const float * src2,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant float & scale,
- threadgroup float * buf [[threadgroup(0)]],
+ constant float & max_bias,
+ constant float & m0,
+ constant float & m1,
+ constant uint32_t & n_head_log2,
+ threadgroup float * buf [[threadgroup(0)]],
uint tgpig[[threadgroup_position_in_grid]],
uint tpitg[[thread_position_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]],
@@ -368,13 +373,26 @@ kernel void kernel_soft_max(
device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr;
+ device const float * ppos = src2 != src0 ? src2 : nullptr;
device float * pdst = dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
+ float slope = 0.0f;
+
+ // ALiBi
+ if (max_bias > 0.0f) {
+ const int64_t h = i02;
+
+ const float base = h < n_head_log2 ? m0 : m1;
+ const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+ slope = pow(base, exp);
+ }
+
// parallel max
float lmax = -INFINITY;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+ lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
}
// find the max value in the block
@@ -399,7 +417,7 @@ kernel void kernel_soft_max(
// parallel sum
float lsum = 0.0f;
for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
- const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+ const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
lsum += exp_psrc0;
pdst[i00] = exp_psrc0;
}
@@ -437,12 +455,17 @@ kernel void kernel_soft_max(
kernel void kernel_soft_max_4(
device const float * src0,
device const float * src1,
+ device const float * src2,
device float * dst,
constant int64_t & ne00,
constant int64_t & ne01,
constant int64_t & ne02,
constant float & scale,
- threadgroup float * buf [[threadgroup(0)]],
+ constant float & max_bias,
+ constant float & m0,
+ constant float & m1,
+ constant uint32_t & n_head_log2,
+ threadgroup float * buf [[threadgroup(0)]],
uint tgpig[[threadgroup_position_in_grid]],
uint tpitg[[thread_position_in_threadgroup]],
uint sgitg[[simdgroup_index_in_threadgroup]],
@@ -454,13 +477,25 @@ kernel void kernel_soft_max_4(
device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
+ device const float4 * ppos = src2 != src0 ? (device const float4 *)(src2) : nullptr;
device float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
+ float slope = 0.0f;
+
+ if (max_bias > 0.0f) {
+ const int64_t h = i02;
+
+ const float base = h < n_head_log2 ? m0 : m1;
+ const int exp = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+ slope = pow(base, exp);
+ }
+
// parallel max
float4 lmax4 = -INFINITY;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
- lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+ lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
}
const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -486,7 +521,7 @@ kernel void kernel_soft_max_4(
// parallel sum
float4 lsum4 = 0.0f;
for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
- const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+ const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
lsum4 += exp_psrc4;
pdst4[i00] = exp_psrc4;
}
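
For reviewers of this vendored upstream change: the two kernels now fold an ALiBi term into the soft-max. Restating the code above in math (no new behaviour), with head $h = i02$ and $\mathrm{pos} = src2$, each row element fed to the soft-max becomes

$$z_i = \mathrm{scale} \cdot x_i + \mathrm{mask}_i + \mathrm{slope}(h) \cdot \mathrm{pos}_i, \qquad \mathrm{slope}(h) = \begin{cases} m_0^{\,h+1} & h < n\_head\_log2 \\ m_1^{\,2(h - n\_head\_log2) + 1} & \text{otherwise,} \end{cases}$$

and $\mathrm{slope}(h) = 0$ whenever $\mathrm{max\_bias} \le 0$, which reduces to the previous `scale*x + mask` behaviour.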
diff --git a/LLama/runtimes/deps/osx-arm64/libllama.dylib b/LLama/runtimes/deps/osx-arm64/libllama.dylib
index 853998a7b..1cf020462 100644
Binary files a/LLama/runtimes/deps/osx-arm64/libllama.dylib and b/LLama/runtimes/deps/osx-arm64/libllama.dylib differ
diff --git a/LLama/runtimes/deps/osx-x64/libllama.dylib b/LLama/runtimes/deps/osx-x64/libllama.dylib
index 208bfe846..53aa7980c 100644
Binary files a/LLama/runtimes/deps/osx-x64/libllama.dylib and b/LLama/runtimes/deps/osx-x64/libllama.dylib differ
diff --git a/LLama/runtimes/deps/vulkan/libllama.so b/LLama/runtimes/deps/vulkan/libllama.so
new file mode 100644
index 000000000..2aee9edd9
Binary files /dev/null and b/LLama/runtimes/deps/vulkan/libllama.so differ
diff --git a/LLama/runtimes/deps/vulkan/llama.dll b/LLama/runtimes/deps/vulkan/llama.dll
new file mode 100644
index 000000000..e03fdcdf9
Binary files /dev/null and b/LLama/runtimes/deps/vulkan/llama.dll differ