diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 2e9ac1e86..cc95273d2 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -172,7 +172,85 @@ jobs:
             ./build/libllama.so
             # ./build/libclblast.so
           name: llama-bin-linux-clblast-x64.so
-
+
+  compile-vulkan:
+    name: Compile (vulkan) - ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [
+          ubuntu-22.04,
+          windows-latest
+        ]
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      OPENCL_VERSION: 2023.04.17
+      CLBLAST_VERSION: 1.6.0
+      VULKAN_VERSION: 1.3.261.1
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          repository: ggerganov/llama.cpp
+
+      - name: Download dependencies - Linux
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        run: |
+          wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt update
+          sudo apt install vulkan-sdk
+
+      - name: Download dependencies - Windows
+        id: get_vulkan
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: |
+          mkdir build
+          cd build
+          cmake .. ${{ env.COMMON_DEFINE }} -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/vulkan"
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          #copy $env:RUNNER_TEMP/clblast/lib/clblast.dll .\bin\Release\clblast.dll
+          # # We should probably generate a sha256 sum in a file, and use that.
+          # echo "78a8c98bcb2efe1a63318d901ab204d9ba96c3b29707b4ce0c4240bdcdc698d6 ./bin/Release/clblast.dll" >> tmp
+          # sha256sum -c tmp || exit 255
+          # rm tmp
+          ls -R
+      - name: Build
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        run: |
+          mkdir build
+          cd build
+          cmake .. ${{ env.COMMON_DEFINE }} -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON
+          cmake --build . --config Release -j $(nproc)
+          # if we ever want to pull libvulkan.so back into the packages, just uncomment this line, and the one below for the upload
+          # cp $(ldconfig -p | grep libvulkan.so | tail -n 1 | cut -d ' ' -f 4) ./
+          ls -R
+      - name: Upload artifacts (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: |
+            .\build\bin\Release\llama.dll
+          name: llama-bin-win-vulkan-x64.dll
+      - name: Upload artifacts (Linux)
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: |
+            ./build/libllama.so
+          name: llama-bin-linux-vulkan-x64.so
+
   compile-cublas:
     name: Compile (cublas)
     strategy:
@@ -277,7 +355,8 @@ jobs:
         "compile-macos",
         "compile-windows",
         "compile-cublas",
-        "compile-clblast"
+        "compile-clblast",
+        "compile-vulkan"
       ]
     steps:
       - uses: actions/download-artifact@v4
@@ -288,7 +367,7 @@ jobs:
       - name: Rearrange Files
        run: |
          # Make all directories at once
-         mkdir --parents deps/{avx,avx2,avx512,osx-arm64,osx-x64,cu11.7.1,cu12.1.0,clblast}
+         mkdir --parents deps/{avx,avx2,avx512,osx-arm64,osx-x64,cu11.7.1,cu12.1.0,clblast,vulkan}

          cp artifacts/llama-bin-linux-noavx-x64.so/libllama.so deps/libllama.so
          cp artifacts/llama-bin-linux-avx-x64.so/libllama.so deps/avx/libllama.so
@@ -312,6 +391,9 @@ jobs:
          cp artifacts/llama-bin-win-clblast-x64.dll/{llama,clblast}.dll deps/clblast/
          cp artifacts/llama-bin-linux-clblast-x64.so/libllama.so deps/clblast/

+         cp artifacts/llama-bin-win-vulkan-x64.dll/llama.dll deps/vulkan/
+         cp artifacts/llama-bin-linux-vulkan-x64.so/libllama.so deps/vulkan/
+
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
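The new job only builds and uploads; the artifact names (`llama-bin-win-vulkan-x64.dll`, `llama-bin-linux-vulkan-x64.so`) feed the `Rearrange Files` step above, which flattens everything into a `deps/<backend>/` tree that the runtime loader later walks. A minimal C# sketch of that naming convention (a hypothetical helper for illustration, not code from this PR; Vulkan binaries are only produced for Windows and Linux):

```csharp
using System;
using System.Runtime.InteropServices;

// Mirrors the deps/<backend>/ layout produced by the "Rearrange Files" step:
// backend "" (noavx), "avx", "avx2", "avx512", "cu11.7.1", "cu12.1.0",
// "clblast" and, with this PR, "vulkan".
static string GetDepsPath(string backend)
{
    string lib = RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
        ? "llama.dll"
        : "libllama.so";
    return backend.Length == 0 ? $"deps/{lib}" : $"deps/{backend}/{lib}";
}

Console.WriteLine(GetDepsPath("vulkan")); // deps/vulkan/llama.dll on Windows
```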
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index f26ad24e0..98e580bf5 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -28,6 +28,10 @@
     <None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu12.1.0/llama.dll">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
      <Link>runtimes/win-x64/native/cuda12/llama.dll</Link>
    </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/vulkan/llama.dll">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/win-x64/native/vulkan/llama.dll</Link>
+    </None>
    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/clblast/llama.dll">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
@@ -53,6 +57,10 @@
    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu12.1.0/libllama.so">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
      <Link>runtimes/linux-x64/native/cuda12/libllama.so</Link>
    </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/vulkan/libllama.so">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/linux-x64/native/vulkan/libllama.so</Link>
+    </None>
    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/clblast/libllama.so">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
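These `<None>` entries are what copy the packaged Vulkan binaries into a consumer's output directory under `runtimes/<rid>/native/vulkan/`. A quick way to sanity-check the result after building a consuming project (an illustrative snippet, not part of the PR):

```csharp
using System;
using System.IO;
using System.Runtime.InteropServices;

// After a build, the Vulkan native library should sit next to the app,
// under the runtimes/<rid>/native/vulkan/ path declared in the targets file.
string relative = RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
    ? Path.Combine("runtimes", "win-x64", "native", "vulkan", "llama.dll")
    : Path.Combine("runtimes", "linux-x64", "native", "vulkan", "libllama.so");

string full = Path.Combine(AppContext.BaseDirectory, relative);
Console.WriteLine(File.Exists(full)
    ? $"vulkan native library found: {full}"
    : $"vulkan native library missing: {full}");
```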
diff --git a/LLama/Native/NativeApi.Load.cs b/LLama/Native/NativeApi.Load.cs
index 8a02d8a56..7264c65b5 100644
--- a/LLama/Native/NativeApi.Load.cs
+++ b/LLama/Native/NativeApi.Load.cs
@@ -4,6 +4,7 @@
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.IO;
+using System.Linq;
 using System.Runtime.InteropServices;
 using System.Text.Json;

@@ -59,6 +60,51 @@ private static void Log(string message, LogLevel level)
             Console.ResetColor();
         }

+        private static string GetVulkanVersion()
+        {
+            var apiVersionString = string.Empty;
+            try
+            {
+                ProcessStartInfo start = new()
+                {
+                    FileName = "vulkaninfo",
+                    Arguments = "--summary",
+                    RedirectStandardOutput = true,
+                    UseShellExecute = false,
+                    CreateNoWindow = true
+                };
+
+                Process process = new()
+                {
+                    StartInfo = start
+                };
+                process.Start();
+
+                string output = process.StandardOutput.ReadToEnd();
+                process.WaitForExit();
+
+                var lines = output.Split('\n');
+                int apiVersionLineIndex = lines.ToList().FindIndex(line => line.Contains("apiVersion"));
+                if (apiVersionLineIndex >= 0)
+                {
+                    var apiVersionline = lines[apiVersionLineIndex];
+                    // the line looks like "    apiVersion = 1.3.260"
+                    // or, on some drivers, "    apiVersion = 4206830 (1.3.238)"
+                    if (apiVersionline.Contains('=') && apiVersionline.Length > apiVersionline.IndexOf('=') + 1)
+                    {
+                        apiVersionString = apiVersionline.Substring(apiVersionline.IndexOf('=') + 1).Trim();
+                        if (apiVersionString.Contains('(') && apiVersionString.Contains(')') && apiVersionString.IndexOf(')') > apiVersionString.IndexOf('('))
+                        {
+                            apiVersionString = apiVersionString.Substring(apiVersionString.IndexOf('(') + 1, apiVersionString.IndexOf(')') - apiVersionString.IndexOf('(') - 1);
+                        }
+                        return apiVersionString;
+                    }
+                }
+            }
+            catch { /* vulkaninfo not found or failed to run - treat as "no Vulkan" */ }
+            return apiVersionString;
+        }
+
         private static int GetCudaMajorVersion()
         {
             string? cudaPath;
@@ -209,6 +255,17 @@ private static List<string> GetLibraryTryOrder(NativeLibraryConfig.Description configuration)
                 // otherwise no cuda detected but allow fallback
             }

+            if (configuration.UseVulkan && (platform == OSPlatform.Windows || platform == OSPlatform.Linux)) // no vulkan on macos
+            {
+                string vulkanVersion = GetVulkanVersion();
+                if (!string.IsNullOrEmpty(vulkanVersion))
+                {
+                    Log($"Detected device supporting vulkan version {vulkanVersion}.", LogLevel.Information);
+                    string vulkanLibraryPath = $"{prefix}vulkan/{libraryNamePrefix}{libraryName}{suffix}";
+                    result.Add(vulkanLibraryPath);
+                }
+            }
+
             // use cpu (or mac possibly with metal)
             if (!configuration.AllowFallback && platform != OSPlatform.OSX)
             {
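To make the parsing in `GetVulkanVersion` concrete: stripped of the process plumbing, it reduces to finding the `apiVersion` line and pulling out the dotted version, preferring the parenthesised form when the driver prints both encodings. A self-contained sketch of the same extraction, exercised against both known line shapes (`ExtractApiVersion` is an illustrative name, not part of the PR):

```csharp
using System;

// Standalone illustration of the string extraction used by GetVulkanVersion().
static string ExtractApiVersion(string line)
{
    int eq = line.IndexOf('=');
    if (eq < 0 || eq + 1 >= line.Length)
        return string.Empty;

    string value = line.Substring(eq + 1).Trim();

    // "4206830 (1.3.238)" -> prefer the dotted form inside the parentheses
    int open = value.IndexOf('(');
    int close = value.IndexOf(')');
    if (open >= 0 && close > open)
        value = value.Substring(open + 1, close - open - 1);

    return value;
}

Console.WriteLine(ExtractApiVersion("  apiVersion = 1.3.260"));           // 1.3.260
Console.WriteLine(ExtractApiVersion("  apiVersion = 4206830 (1.3.238)")); // 1.3.238
```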
diff --git a/LLama/Native/NativeLibraryConfig.cs b/LLama/Native/NativeLibraryConfig.cs
index ad52fc816..2bc88c667 100644
--- a/LLama/Native/NativeLibraryConfig.cs
+++ b/LLama/Native/NativeLibraryConfig.cs
@@ -25,6 +25,7 @@ public sealed class NativeLibraryConfig
         private string _libraryPath = string.Empty;

         private bool _useCuda = true;
+        private bool _useVulkan = true;
         private AvxLevel _avxLevel;
         private bool _allowFallback = true;
         private bool _skipCheck = false;
@@ -69,6 +70,20 @@ public NativeLibraryConfig WithCuda(bool enable = true)
             return this;
         }

+        /// <summary>
+        /// Configure whether to use the Vulkan backend if possible.
+        /// </summary>
+        /// <param name="enable"></param>
+        /// <returns></returns>
+        /// <exception cref="InvalidOperationException">Thrown if `LibraryHasLoaded` is true.</exception>
+        public NativeLibraryConfig WithVulkan(bool enable = true)
+        {
+            ThrowIfLoaded();
+
+            _useVulkan = enable;
+            return this;
+        }
+
         /// <summary>
         /// Configure the preferred avx support level of the backend.
         /// </summary>
@@ -164,7 +179,8 @@ internal static Description CheckAndGatherDescription()
             return new Description(
                 Instance._libraryPath,
-                Instance._useCuda,
+                Instance._useCuda,
+                Instance._useVulkan,
                 Instance._avxLevel,
                 Instance._allowFallback,
                 Instance._skipCheck,
@@ -250,7 +266,7 @@ public enum AvxLevel
             Avx512,
         }

-        internal record Description(string Path, bool UseCuda, AvxLevel AvxLevel, bool AllowFallback, bool SkipCheck, bool Logging, string[] SearchDirectories)
+        internal record Description(string Path, bool UseCuda, bool UseVulkan, AvxLevel AvxLevel, bool AllowFallback, bool SkipCheck, bool Logging, string[] SearchDirectories)
         {
             public override string ToString()
             {
@@ -268,6 +284,7 @@ public override string ToString()
             return $"NativeLibraryConfig Description:\n" +
                    $"- Path: {Path}\n" +
                    $"- PreferCuda: {UseCuda}\n" +
+                   $"- PreferVulkan: {UseVulkan}\n" +
                    $"- PreferredAvxLevel: {avxLevelString}\n" +
                    $"- AllowFallback: {AllowFallback}\n" +
                    $"- SkipCheck: {SkipCheck}\n" +
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
new file mode 100644
index 000000000..3e45b249a
--- /dev/null
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<package >
+  <metadata>
+    <id>LLamaSharp.Backend.Vulkan</id>
+    <version>$version$</version>
+    <title>LLamaSharp.Backend.Vulkan - Vulkan Backend for LLamaSharp</title>
+    <authors>llama.cpp Authors</authors>
+    <requireLicenseAcceptance>false</requireLicenseAcceptance>
+    <license type="expression">MIT</license>
+    <icon>icon512.png</icon>
+    <projectUrl>https://github.com/SciSharp/LLamaSharp</projectUrl>
+    <description>LLamaSharp.Backend.Vulkan is a backend for LLamaSharp to use with Vulkan.</description>
+    <releaseNotes></releaseNotes>
+    <copyright>Copyright 2023 The llama.cpp Authors. All rights reserved.</copyright>
+    <tags>LLamaSharp LLama LLM GPT AI ChatBot SciSharp</tags>
+  </metadata>
+
+  <files>
+    <file src="LLamaSharpBackend.props" target="build/netstandard2.0/LLamaSharp.Backend.Vulkan.props" />
+    <file src="runtimes/deps/vulkan/llama.dll" target="runtimes\win-x64\native\vulkan\llama.dll" />
+    <file src="runtimes/deps/vulkan/libllama.so" target="runtimes\linux-x64\native\vulkan\libllama.so" />
+    <file src="icon512.png" target="icon512.png" />
+  </files>
+</package>
diff --git a/LLama/runtimes/build/temp.csproj b/LLama/runtimes/build/temp.csproj
deleted file mode 100644
index 9f5c4f4ab..000000000
--- a/LLama/runtimes/build/temp.csproj
+++ /dev/null
@@ -1,7 +0,0 @@
-<Project Sdk="Microsoft.NET.Sdk">
-
-  <PropertyGroup>
-    <TargetFramework>netstandard2.0</TargetFramework>
-  </PropertyGroup>
-
-</Project>
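Consumers opt in or out of the new backend through the fluent config. A usage sketch based on the API added above (`WithVulkan(true)` is the default; the chain must run before the first native call, otherwise `ThrowIfLoaded()` fires):

```csharp
using LLama.Native;

// Prefer the vulkan/ native library when vulkaninfo reports a usable version,
// skip the CUDA probe entirely, and fall back to the plain CPU build if the
// Vulkan library fails to load.
NativeLibraryConfig.Instance
    .WithVulkan()
    .WithCuda(false)
    .WithAvx(NativeLibraryConfig.AvxLevel.Avx2)
    .WithAutoFallback();
```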
diff --git a/LLama/runtimes/deps/avx/libllama.so b/LLama/runtimes/deps/avx/libllama.so
index 49a80191e..9c1e586b0 100644
Binary files a/LLama/runtimes/deps/avx/libllama.so and b/LLama/runtimes/deps/avx/libllama.so differ
diff --git a/LLama/runtimes/deps/avx/llama.dll b/LLama/runtimes/deps/avx/llama.dll
index e9924272f..6192e5395 100644
Binary files a/LLama/runtimes/deps/avx/llama.dll and b/LLama/runtimes/deps/avx/llama.dll differ
diff --git a/LLama/runtimes/deps/avx2/libllama.so b/LLama/runtimes/deps/avx2/libllama.so
index ffa59a499..ff9e73846 100644
Binary files a/LLama/runtimes/deps/avx2/libllama.so and b/LLama/runtimes/deps/avx2/libllama.so differ
diff --git a/LLama/runtimes/deps/avx2/llama.dll b/LLama/runtimes/deps/avx2/llama.dll
index 996ee0a1a..08e5e0365 100644
Binary files a/LLama/runtimes/deps/avx2/llama.dll and b/LLama/runtimes/deps/avx2/llama.dll differ
diff --git a/LLama/runtimes/deps/avx512/libllama.so b/LLama/runtimes/deps/avx512/libllama.so
index baee01e64..4f3510eb8 100644
Binary files a/LLama/runtimes/deps/avx512/libllama.so and b/LLama/runtimes/deps/avx512/libllama.so differ
diff --git a/LLama/runtimes/deps/avx512/llama.dll b/LLama/runtimes/deps/avx512/llama.dll
index 754d5cfb4..9d92e60fa 100644
Binary files a/LLama/runtimes/deps/avx512/llama.dll and b/LLama/runtimes/deps/avx512/llama.dll differ
diff --git a/LLama/runtimes/deps/clblast/libllama.so b/LLama/runtimes/deps/clblast/libllama.so
index bf2ca9e1e..8bb61ecb2 100644
Binary files a/LLama/runtimes/deps/clblast/libllama.so and b/LLama/runtimes/deps/clblast/libllama.so differ
diff --git a/LLama/runtimes/deps/clblast/llama.dll b/LLama/runtimes/deps/clblast/llama.dll
index 55d4a8517..6e45a4e5a 100644
Binary files a/LLama/runtimes/deps/clblast/llama.dll and b/LLama/runtimes/deps/clblast/llama.dll differ
diff --git a/LLama/runtimes/deps/cu11.7.1/libllama.so b/LLama/runtimes/deps/cu11.7.1/libllama.so
index e6eee3cd7..52ac0c0ac 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/libllama.so and b/LLama/runtimes/deps/cu11.7.1/libllama.so differ
diff --git a/LLama/runtimes/deps/cu11.7.1/llama.dll b/LLama/runtimes/deps/cu11.7.1/llama.dll
index e75a353ea..e3ba94e70 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/llama.dll and b/LLama/runtimes/deps/cu11.7.1/llama.dll differ
diff --git a/LLama/runtimes/deps/cu12.1.0/libllama.so b/LLama/runtimes/deps/cu12.1.0/libllama.so
index dbc7d066a..13949b5d2 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/libllama.so and b/LLama/runtimes/deps/cu12.1.0/libllama.so differ
diff --git a/LLama/runtimes/deps/cu12.1.0/llama.dll b/LLama/runtimes/deps/cu12.1.0/llama.dll
index 88ea37f83..dec5278fe 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/llama.dll and b/LLama/runtimes/deps/cu12.1.0/llama.dll differ
diff --git a/LLama/runtimes/deps/libllama.so b/LLama/runtimes/deps/libllama.so
index b9f6a8193..c97dff7fd 100644
Binary files a/LLama/runtimes/deps/libllama.so and b/LLama/runtimes/deps/libllama.so differ
diff --git a/LLama/runtimes/deps/llama.dll b/LLama/runtimes/deps/llama.dll
index 9325dadf6..5c2d6e96b 100644
Binary files a/LLama/runtimes/deps/llama.dll and b/LLama/runtimes/deps/llama.dll differ
diff --git a/LLama/runtimes/deps/osx-arm64/ggml-metal.metal b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
index efed6ad46..09ebcc9e3 100644
--- a/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
+++ b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
@@ -351,12 +351,17 @@ kernel void kernel_sum_rows(
 kernel void kernel_soft_max(
         device const float * src0,
         device const float * src1,
+        device const float * src2,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
         constant     float & scale,
-        threadgroup  float * buf [[threadgroup(0)]],
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]],
@@ -368,13 +373,26 @@ kernel void kernel_soft_max(
     device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
     device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr;
+    device const float * ppos  = src2 != src0 ? src2            : nullptr;
     device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

+    float slope = 0.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
     // parallel max
     float lmax = -INFINITY;

     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
     }

     // find the max value in the block
@@ -399,7 +417,7 @@ kernel void kernel_soft_max(
     // parallel sum
     float lsum = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
         lsum += exp_psrc0;
         pdst[i00] = exp_psrc0;
     }
@@ -437,12 +455,17 @@ kernel void kernel_soft_max(
 kernel void kernel_soft_max_4(
         device const float * src0,
         device const float * src1,
+        device const float * src2,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
         constant     float & scale,
-        threadgroup  float * buf [[threadgroup(0)]],
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]],
@@ -454,13 +477,25 @@ kernel void kernel_soft_max_4(
     device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
     device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
+    device const float4 * ppos  = src2 != src0 ? (device const float4 *)(src2)            : nullptr;
     device       float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

+    float slope = 0.0f;
+
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
     // parallel max
     float4 lmax4 = -INFINITY;

     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
     }

     const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -486,7 +521,7 @@ kernel void kernel_soft_max_4(
     // parallel sum
     float4 lsum4 = 0.0f;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }
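The new `max_bias`/`m0`/`m1`/`n_head_log2` kernel arguments implement ALiBi: each attention head `h` gets a fixed slope, and `slope * pos[i]` is added to the logits before the softmax. A C# rendering of the per-head slope rule used by both kernels above (illustrative; `m0` and `m1` are passed in from the host, and the values below assume the conventional `max_bias = 8` setup):

```csharp
using System;

// slope(h) = m0^(h+1)                     for h <  n_head_log2
//          = m1^(2*(h - n_head_log2) + 1) for h >= n_head_log2
static float AlibiSlope(int h, int nHeadLog2, float m0, float m1)
{
    float b = h < nHeadLog2 ? m0 : m1;
    int exp = h < nHeadLog2 ? h + 1 : 2 * (h - nHeadLog2) + 1;
    return MathF.Pow(b, exp);
}

// Host-side values, assuming max_bias = 8.0f and n_head = 32;
// n_head_log2 is the largest power of two <= n_head.
int nHeadLog2 = 32;
float m0 = MathF.Pow(2.0f, -8.0f / nHeadLog2);       // 2^(-max_bias     / n_head_log2)
float m1 = MathF.Pow(2.0f, -8.0f / (2 * nHeadLog2)); // 2^(-(max_bias/2) / n_head_log2)

for (int h = 0; h < 4; h++)
    Console.WriteLine($"head {h}: slope = {AlibiSlope(h, nHeadLog2, m0, m1)}");
```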
diff --git a/LLama/runtimes/deps/osx-arm64/libllama.dylib b/LLama/runtimes/deps/osx-arm64/libllama.dylib
index 853998a7b..1cf020462 100644
Binary files a/LLama/runtimes/deps/osx-arm64/libllama.dylib and b/LLama/runtimes/deps/osx-arm64/libllama.dylib differ
diff --git a/LLama/runtimes/deps/osx-x64/libllama.dylib b/LLama/runtimes/deps/osx-x64/libllama.dylib
index 208bfe846..53aa7980c 100644
Binary files a/LLama/runtimes/deps/osx-x64/libllama.dylib and b/LLama/runtimes/deps/osx-x64/libllama.dylib differ
diff --git a/LLama/runtimes/deps/vulkan/libllama.so b/LLama/runtimes/deps/vulkan/libllama.so
new file mode 100644
index 000000000..2aee9edd9
Binary files /dev/null and b/LLama/runtimes/deps/vulkan/libllama.so differ
diff --git a/LLama/runtimes/deps/vulkan/llama.dll b/LLama/runtimes/deps/vulkan/llama.dll
new file mode 100644
index 000000000..e03fdcdf9
Binary files /dev/null and b/LLama/runtimes/deps/vulkan/llama.dll differ
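Putting the two Metal kernels in plain terms: each row is a numerically stable softmax over `z[i] = src0[i]*scale + mask[i] + slope*pos[i]`, with the row maximum subtracted before exponentiation so `exp` never overflows. A scalar C# reference of that computation (illustrative; the kernels parallelize the max/sum reductions across the threadgroup, and `kernel_soft_max_4` additionally vectorizes by four):

```csharp
using System;

// dst[i] = exp(z[i] - max z) / sum_j exp(z[j] - max z),
// with z[i] = src0[i]*scale + mask[i] + slope*pos[i] (mask/pos optional).
static float[] SoftMax(float[] src0, float[] mask, float[] pos, float scale, float slope)
{
    var z = new float[src0.Length];
    var max = float.NegativeInfinity;
    for (int i = 0; i < src0.Length; i++)
    {
        z[i] = src0[i] * scale
             + (mask != null ? mask[i] : 0.0f)
             + slope * (pos != null ? pos[i] : 0.0f);
        max = MathF.Max(max, z[i]);
    }

    var dst = new float[src0.Length];
    var sum = 0.0f;
    for (int i = 0; i < src0.Length; i++)
    {
        dst[i] = MathF.Exp(z[i] - max); // max subtracted for numerical stability
        sum += dst[i];
    }
    for (int i = 0; i < dst.Length; i++)
        dst[i] /= sum;

    return dst;
}

// e.g. plain softmax over three logits: ~0.09, 0.24, 0.67
Console.WriteLine(string.Join(", ", SoftMax(new[] { 1f, 2f, 3f }, null, null, 1f, 0f)));
```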