diff --git a/.github/workflows/compile.yml b/.github/workflows/compile.yml
index 2e9ac1e86..cc95273d2 100644
--- a/.github/workflows/compile.yml
+++ b/.github/workflows/compile.yml
@@ -172,7 +172,85 @@ jobs:
             ./build/libllama.so
             # ./build/libclblast.so
           name: llama-bin-linux-clblast-x64.so
-
+
+  compile-vulkan:
+    name: Compile (vulkan) - ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [
+          ubuntu-22.04,
+          windows-latest
+        ]
+    env:
+      OPENBLAS_VERSION: 0.3.23
+      OPENCL_VERSION: 2023.04.17
+      CLBLAST_VERSION: 1.6.0
+      VULKAN_VERSION: 1.3.261.1
+    runs-on: ${{ matrix.os }}
+    steps:
+      - name: Clone
+        id: checkout
+        uses: actions/checkout@v4
+        with:
+          repository: ggerganov/llama.cpp
+
+      - name: Download dependencies - Linux
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        run: |
+          wget -qO- https://packages.lunarg.com/lunarg-signing-key-pub.asc | sudo tee /etc/apt/trusted.gpg.d/lunarg.asc
+          sudo wget -qO /etc/apt/sources.list.d/lunarg-vulkan-jammy.list http://packages.lunarg.com/vulkan/lunarg-vulkan-jammy.list
+          sudo apt update
+          sudo apt install vulkan-sdk
+
+      - name: Download dependencies - Windows
+        id: get_vulkan
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: |
+          curl.exe -o $env:RUNNER_TEMP/VulkanSDK-Installer.exe -L "https://sdk.lunarg.com/sdk/download/${env:VULKAN_VERSION}/windows/VulkanSDK-${env:VULKAN_VERSION}-Installer.exe"
+          & "$env:RUNNER_TEMP\VulkanSDK-Installer.exe" --accept-licenses --default-answer --confirm-command install
+          Add-Content $env:GITHUB_ENV "VULKAN_SDK=C:\VulkanSDK\${env:VULKAN_VERSION}"
+          Add-Content $env:GITHUB_PATH "C:\VulkanSDK\${env:VULKAN_VERSION}\bin"
+
+      - name: Build
+        id: cmake_build
+        if: ${{ matrix.os == 'windows-latest' }}
+        run: |
+          mkdir build
+          cd build
+          cmake .. ${{ env.COMMON_DEFINE }} -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/vulkan"
+          cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS}
+          #copy $env:RUNNER_TEMP/clblast/lib/clblast.dll .\bin\Release\clblast.dll
+          # # We should probably generate a sha256 sum in a file, and use that.
+          # echo "78a8c98bcb2efe1a63318d901ab204d9ba96c3b29707b4ce0c4240bdcdc698d6 ./bin/Release/clblast.dll" >> tmp
+          # sha256sum -c tmp || exit 255
+          # rm tmp
+          ls -R
+      - name: Build
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        run: |
+          mkdir build
+          cd build
+          cmake .. ${{ env.COMMON_DEFINE }} -DLLAMA_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DLLAMA_VULKAN=ON -DBUILD_SHARED_LIBS=ON
+          cmake --build . --config Release -j $(nproc)
+          # if we ever want to pull libvulkan.so back into the packages, just uncomment this line, and the one below for the upload
+          # cp $(ldconfig -p | grep libvulkan.so | tail -n 1 | cut -d ' ' -f 4) ./
+          ls -R
+      - name: Upload artifacts (Windows)
+        if: ${{ matrix.os == 'windows-latest' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: |
+            .\build\bin\Release\llama.dll
+          name: llama-bin-win-vulkan-x64.dll
+      - name: Upload artifacts (Linux)
+        if: ${{ matrix.os == 'ubuntu-22.04' }}
+        uses: actions/upload-artifact@v4
+        with:
+          path: |
+            ./build/libllama.so
+          name: llama-bin-linux-vulkan-x64.so
+
   compile-cublas:
     name: Compile (cublas)
     strategy:
@@ -277,7 +355,8 @@ jobs:
         "compile-macos",
         "compile-windows",
         "compile-cublas",
-        "compile-clblast"
+        "compile-clblast",
+        "compile-vulkan"
       ]
     steps:
       - uses: actions/download-artifact@v4
@@ -288,7 +367,7 @@ jobs:
       - name: Rearrange Files
        run: |
          # Make all directories at once
-         mkdir --parents deps/{avx,avx2,avx512,osx-arm64,osx-x64,cu11.7.1,cu12.1.0,clblast}
+         mkdir --parents deps/{avx,avx2,avx512,osx-arm64,osx-x64,cu11.7.1,cu12.1.0,clblast,vulkan}

          cp artifacts/llama-bin-linux-noavx-x64.so/libllama.so deps/libllama.so
          cp artifacts/llama-bin-linux-avx-x64.so/libllama.so deps/avx/libllama.so
@@ -312,6 +391,9 @@ jobs:
          cp artifacts/llama-bin-win-clblast-x64.dll/{llama,clblast}.dll deps/clblast/
          cp artifacts/llama-bin-linux-clblast-x64.so/libllama.so deps/clblast/

+         cp artifacts/llama-bin-win-vulkan-x64.dll/llama.dll deps/vulkan/
+         cp artifacts/llama-bin-linux-vulkan-x64.so/libllama.so deps/vulkan/
+
       - name: Upload artifacts
         uses: actions/upload-artifact@v4
         with:
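The new job only builds and uploads; the artifact names (`llama-bin-win-vulkan-x64.dll`, `llama-bin-linux-vulkan-x64.so`) feed the `Rearrange Files` step above, which flattens everything into a `deps/<backend>/` tree that the runtime loader later walks. A minimal C# sketch of that naming convention (a hypothetical helper for illustration, not code from this PR; Vulkan binaries are only produced for Windows and Linux):

```csharp
using System;
using System.Runtime.InteropServices;

// Mirrors the deps/<backend>/ layout produced by the "Rearrange Files" step:
// backend "" (noavx), "avx", "avx2", "avx512", "cu11.7.1", "cu12.1.0",
// "clblast" and, with this PR, "vulkan".
static string GetDepsPath(string backend)
{
    string lib = RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
        ? "llama.dll"
        : "libllama.so";
    return backend.Length == 0 ? $"deps/{lib}" : $"deps/{backend}/{lib}";
}

Console.WriteLine(GetDepsPath("vulkan")); // deps/vulkan/llama.dll on Windows
```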
diff --git a/LLama/LLamaSharp.Runtime.targets b/LLama/LLamaSharp.Runtime.targets
index f26ad24e0..98e580bf5 100644
--- a/LLama/LLamaSharp.Runtime.targets
+++ b/LLama/LLamaSharp.Runtime.targets
@@ -28,6 +28,10 @@
     <None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu12.1.0/llama.dll">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
      <Link>runtimes/win-x64/native/cuda12/llama.dll</Link>
    </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/vulkan/llama.dll">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/win-x64/native/vulkan/llama.dll</Link>
+    </None>
    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/clblast/llama.dll">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
@@ -53,6 +57,10 @@
    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/cu12.1.0/libllama.so">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
      <Link>runtimes/linux-x64/native/cuda12/libllama.so</Link>
    </None>
+    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/vulkan/libllama.so">
+      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
+      <Link>runtimes/linux-x64/native/vulkan/libllama.so</Link>
+    </None>
    <None Include="$(MSBuildThisFileDirectory)runtimes/deps/clblast/libllama.so">
      <CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
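These `<None>` entries are what copy the packaged Vulkan binaries into a consumer's output directory under `runtimes/<rid>/native/vulkan/`. A quick way to sanity-check the result after building a consuming project (an illustrative snippet, not part of the PR):

```csharp
using System;
using System.IO;
using System.Runtime.InteropServices;

// After a build, the Vulkan native library should sit next to the app,
// under the runtimes/<rid>/native/vulkan/ path declared in the targets file.
string relative = RuntimeInformation.IsOSPlatform(OSPlatform.Windows)
    ? Path.Combine("runtimes", "win-x64", "native", "vulkan", "llama.dll")
    : Path.Combine("runtimes", "linux-x64", "native", "vulkan", "libllama.so");

string full = Path.Combine(AppContext.BaseDirectory, relative);
Console.WriteLine(File.Exists(full)
    ? $"vulkan native library found: {full}"
    : $"vulkan native library missing: {full}");
```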
diff --git a/LLama/Native/NativeApi.Load.cs b/LLama/Native/NativeApi.Load.cs
index 8a02d8a56..7264c65b5 100644
--- a/LLama/Native/NativeApi.Load.cs
+++ b/LLama/Native/NativeApi.Load.cs
@@ -4,6 +4,7 @@
 using System.Collections.Generic;
 using System.Diagnostics;
 using System.IO;
+using System.Linq;
 using System.Runtime.InteropServices;
 using System.Text.Json;

@@ -59,6 +60,51 @@ private static void Log(string message, LogLevel level)
             Console.ResetColor();
         }

+        private static string GetVulkanVersion()
+        {
+            var apiVersionString = string.Empty;
+            try
+            {
+                ProcessStartInfo start = new()
+                {
+                    FileName = "vulkaninfo",
+                    Arguments = "--summary",
+                    RedirectStandardOutput = true,
+                    UseShellExecute = false,
+                    CreateNoWindow = true
+                };
+
+                Process process = new()
+                {
+                    StartInfo = start
+                };
+                process.Start();
+
+                string output = process.StandardOutput.ReadToEnd();
+                process.WaitForExit();
+
+                var lines = output.Split('\n');
+                int apiVersionLineIndex = lines.ToList().FindIndex(line => line.Contains("apiVersion"));
+                if (apiVersionLineIndex >= 0)
+                {
+                    var apiVersionline = lines[apiVersionLineIndex];
+                    // the line looks like "    apiVersion = 1.3.260"
+                    // or, on some drivers, "    apiVersion = 4206830 (1.3.238)"
+                    if (apiVersionline.Contains('=') && apiVersionline.Length > apiVersionline.IndexOf('=') + 1)
+                    {
+                        apiVersionString = apiVersionline.Substring(apiVersionline.IndexOf('=') + 1).Trim();
+                        if (apiVersionString.Contains('(') && apiVersionString.Contains(')') && apiVersionString.IndexOf(')') > apiVersionString.IndexOf('('))
+                        {
+                            apiVersionString = apiVersionString.Substring(apiVersionString.IndexOf('(') + 1, apiVersionString.IndexOf(')') - apiVersionString.IndexOf('(') - 1);
+                        }
+                        return apiVersionString;
+                    }
+                }
+            }
+            catch { /* vulkaninfo not found or failed to run - treat as "no Vulkan" */ }
+            return apiVersionString;
+        }
+
         private static int GetCudaMajorVersion()
         {
             string? cudaPath;
@@ -209,6 +255,17 @@ private static List<string> GetLibraryTryOrder(NativeLibraryConfig.Description configuration)
                 // otherwise no cuda detected but allow fallback
             }

+            if (configuration.UseVulkan && (platform == OSPlatform.Windows || platform == OSPlatform.Linux)) // no vulkan on macos
+            {
+                string vulkanVersion = GetVulkanVersion();
+                if (!string.IsNullOrEmpty(vulkanVersion))
+                {
+                    Log($"Detected device supporting vulkan version {vulkanVersion}.", LogLevel.Information);
+                    string vulkanLibraryPath = $"{prefix}vulkan/{libraryNamePrefix}{libraryName}{suffix}";
+                    result.Add(vulkanLibraryPath);
+                }
+            }
+
             // use cpu (or mac possibly with metal)
             if (!configuration.AllowFallback && platform != OSPlatform.OSX)
             {
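To make the parsing in `GetVulkanVersion` concrete: stripped of the process plumbing, it reduces to finding the `apiVersion` line and pulling out the dotted version, preferring the parenthesised form when the driver prints both encodings. A self-contained sketch of the same extraction, exercised against both known line shapes (`ExtractApiVersion` is an illustrative name, not part of the PR):

```csharp
using System;

// Standalone illustration of the string extraction used by GetVulkanVersion().
static string ExtractApiVersion(string line)
{
    int eq = line.IndexOf('=');
    if (eq < 0 || eq + 1 >= line.Length)
        return string.Empty;

    string value = line.Substring(eq + 1).Trim();

    // "4206830 (1.3.238)" -> prefer the dotted form inside the parentheses
    int open = value.IndexOf('(');
    int close = value.IndexOf(')');
    if (open >= 0 && close > open)
        value = value.Substring(open + 1, close - open - 1);

    return value;
}

Console.WriteLine(ExtractApiVersion("  apiVersion = 1.3.260"));           // 1.3.260
Console.WriteLine(ExtractApiVersion("  apiVersion = 4206830 (1.3.238)")); // 1.3.238
```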
diff --git a/LLama/Native/NativeLibraryConfig.cs b/LLama/Native/NativeLibraryConfig.cs
index ad52fc816..2bc88c667 100644
--- a/LLama/Native/NativeLibraryConfig.cs
+++ b/LLama/Native/NativeLibraryConfig.cs
@@ -25,6 +25,7 @@ public sealed class NativeLibraryConfig
         private string _libraryPath = string.Empty;

         private bool _useCuda = true;
+        private bool _useVulkan = true;
         private AvxLevel _avxLevel;
         private bool _allowFallback = true;
         private bool _skipCheck = false;
@@ -69,6 +70,20 @@ public NativeLibraryConfig WithCuda(bool enable = true)
             return this;
         }

+        /// <summary>
+        /// Configure whether to use the Vulkan backend if possible.
+        /// </summary>
+        /// <param name="enable"></param>
+        /// <returns></returns>
+        /// <exception cref="InvalidOperationException">Thrown if `LibraryHasLoaded` is true.</exception>
+        public NativeLibraryConfig WithVulkan(bool enable = true)
+        {
+            ThrowIfLoaded();
+
+            _useVulkan = enable;
+            return this;
+        }
+
         /// <summary>
         /// Configure the preferred avx support level of the backend.
         /// </summary>
@@ -164,7 +179,8 @@ internal static Description CheckAndGatherDescription()
             return new Description(
                 Instance._libraryPath,
-                Instance._useCuda,
+                Instance._useCuda,
+                Instance._useVulkan,
                 Instance._avxLevel,
                 Instance._allowFallback,
                 Instance._skipCheck,
@@ -250,7 +266,7 @@ public enum AvxLevel
             Avx512,
         }

-        internal record Description(string Path, bool UseCuda, AvxLevel AvxLevel, bool AllowFallback, bool SkipCheck, bool Logging, string[] SearchDirectories)
+        internal record Description(string Path, bool UseCuda, bool UseVulkan, AvxLevel AvxLevel, bool AllowFallback, bool SkipCheck, bool Logging, string[] SearchDirectories)
         {
             public override string ToString()
             {
@@ -268,6 +284,7 @@ public override string ToString()
             return $"NativeLibraryConfig Description:\n" +
                    $"- Path: {Path}\n" +
                    $"- PreferCuda: {UseCuda}\n" +
+                   $"- PreferVulkan: {UseVulkan}\n" +
                    $"- PreferredAvxLevel: {avxLevelString}\n" +
                    $"- AllowFallback: {AllowFallback}\n" +
                    $"- SkipCheck: {SkipCheck}\n" +
diff --git a/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
new file mode 100644
index 000000000..3e45b249a
--- /dev/null
+++ b/LLama/runtimes/build/LLamaSharp.Backend.Vulkan.nuspec
@@ -0,0 +1,24 @@
+<?xml version="1.0" encoding="utf-8"?>
+<package >
+  <metadata>
+    <id>LLamaSharp.Backend.Vulkan</id>
+    <version>$version$</version>
+    <title>LLamaSharp.Backend.Vulkan - Vulkan Backend for LLamaSharp</title>
+    <authors>llama.cpp Authors</authors>
+    <requireLicenseAcceptance>false</requireLicenseAcceptance>
+    <license type="expression">MIT</license>
+    <icon>icon512.png</icon>
+    <projectUrl>https://github.com/SciSharp/LLamaSharp</projectUrl>
+    <description>LLamaSharp.Backend.Vulkan is a backend for LLamaSharp to use with Vulkan.</description>
+    <releaseNotes></releaseNotes>
+    <copyright>Copyright 2023 The llama.cpp Authors. All rights reserved.</copyright>
+    <tags>LLamaSharp LLama LLM GPT AI ChatBot SciSharp</tags>
+  </metadata>
+
+  <files>
+    <file src="LLamaSharpBackend.props" target="build/netstandard2.0/LLamaSharp.Backend.Vulkan.props" />
+    <file src="runtimes/deps/vulkan/llama.dll" target="runtimes\win-x64\native\vulkan\llama.dll" />
+    <file src="runtimes/deps/vulkan/libllama.so" target="runtimes\linux-x64\native\vulkan\libllama.so" />
+    <file src="icon512.png" target="icon512.png" />
+  </files>
+</package>
diff --git a/LLama/runtimes/build/temp.csproj b/LLama/runtimes/build/temp.csproj
deleted file mode 100644
index 9f5c4f4ab..000000000
--- a/LLama/runtimes/build/temp.csproj
+++ /dev/null
@@ -1,7 +0,0 @@
-<Project Sdk="Microsoft.NET.Sdk">
-
-  <PropertyGroup>
-    <TargetFramework>netstandard2.0</TargetFramework>
-  </PropertyGroup>
-
-</Project>
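Consumers opt in or out of the new backend through the fluent config. A usage sketch based on the API added above (`WithVulkan(true)` is the default; the chain must run before the first native call, otherwise `ThrowIfLoaded()` fires):

```csharp
using LLama.Native;

// Prefer the vulkan/ native library when vulkaninfo reports a usable version,
// skip the CUDA probe entirely, and fall back to the plain CPU build if the
// Vulkan library fails to load.
NativeLibraryConfig.Instance
    .WithVulkan()
    .WithCuda(false)
    .WithAvx(NativeLibraryConfig.AvxLevel.Avx2)
    .WithAutoFallback();
```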
diff --git a/LLama/runtimes/deps/avx/libllama.so b/LLama/runtimes/deps/avx/libllama.so
index 49a80191e..9c1e586b0 100644
Binary files a/LLama/runtimes/deps/avx/libllama.so and b/LLama/runtimes/deps/avx/libllama.so differ
diff --git a/LLama/runtimes/deps/avx/llama.dll b/LLama/runtimes/deps/avx/llama.dll
index e9924272f..6192e5395 100644
Binary files a/LLama/runtimes/deps/avx/llama.dll and b/LLama/runtimes/deps/avx/llama.dll differ
diff --git a/LLama/runtimes/deps/avx2/libllama.so b/LLama/runtimes/deps/avx2/libllama.so
index ffa59a499..ff9e73846 100644
Binary files a/LLama/runtimes/deps/avx2/libllama.so and b/LLama/runtimes/deps/avx2/libllama.so differ
diff --git a/LLama/runtimes/deps/avx2/llama.dll b/LLama/runtimes/deps/avx2/llama.dll
index 996ee0a1a..08e5e0365 100644
Binary files a/LLama/runtimes/deps/avx2/llama.dll and b/LLama/runtimes/deps/avx2/llama.dll differ
diff --git a/LLama/runtimes/deps/avx512/libllama.so b/LLama/runtimes/deps/avx512/libllama.so
index baee01e64..4f3510eb8 100644
Binary files a/LLama/runtimes/deps/avx512/libllama.so and b/LLama/runtimes/deps/avx512/libllama.so differ
diff --git a/LLama/runtimes/deps/avx512/llama.dll b/LLama/runtimes/deps/avx512/llama.dll
index 754d5cfb4..9d92e60fa 100644
Binary files a/LLama/runtimes/deps/avx512/llama.dll and b/LLama/runtimes/deps/avx512/llama.dll differ
diff --git a/LLama/runtimes/deps/clblast/libllama.so b/LLama/runtimes/deps/clblast/libllama.so
index bf2ca9e1e..8bb61ecb2 100644
Binary files a/LLama/runtimes/deps/clblast/libllama.so and b/LLama/runtimes/deps/clblast/libllama.so differ
diff --git a/LLama/runtimes/deps/clblast/llama.dll b/LLama/runtimes/deps/clblast/llama.dll
index 55d4a8517..6e45a4e5a 100644
Binary files a/LLama/runtimes/deps/clblast/llama.dll and b/LLama/runtimes/deps/clblast/llama.dll differ
diff --git a/LLama/runtimes/deps/cu11.7.1/libllama.so b/LLama/runtimes/deps/cu11.7.1/libllama.so
index e6eee3cd7..52ac0c0ac 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/libllama.so and b/LLama/runtimes/deps/cu11.7.1/libllama.so differ
diff --git a/LLama/runtimes/deps/cu11.7.1/llama.dll b/LLama/runtimes/deps/cu11.7.1/llama.dll
index e75a353ea..e3ba94e70 100644
Binary files a/LLama/runtimes/deps/cu11.7.1/llama.dll and b/LLama/runtimes/deps/cu11.7.1/llama.dll differ
diff --git a/LLama/runtimes/deps/cu12.1.0/libllama.so b/LLama/runtimes/deps/cu12.1.0/libllama.so
index dbc7d066a..13949b5d2 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/libllama.so and b/LLama/runtimes/deps/cu12.1.0/libllama.so differ
diff --git a/LLama/runtimes/deps/cu12.1.0/llama.dll b/LLama/runtimes/deps/cu12.1.0/llama.dll
index 88ea37f83..dec5278fe 100644
Binary files a/LLama/runtimes/deps/cu12.1.0/llama.dll and b/LLama/runtimes/deps/cu12.1.0/llama.dll differ
diff --git a/LLama/runtimes/deps/libllama.so b/LLama/runtimes/deps/libllama.so
index b9f6a8193..c97dff7fd 100644
Binary files a/LLama/runtimes/deps/libllama.so and b/LLama/runtimes/deps/libllama.so differ
diff --git a/LLama/runtimes/deps/llama.dll b/LLama/runtimes/deps/llama.dll
index 9325dadf6..5c2d6e96b 100644
Binary files a/LLama/runtimes/deps/llama.dll and b/LLama/runtimes/deps/llama.dll differ
diff --git a/LLama/runtimes/deps/osx-arm64/ggml-metal.metal b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
index efed6ad46..09ebcc9e3 100644
--- a/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
+++ b/LLama/runtimes/deps/osx-arm64/ggml-metal.metal
@@ -351,12 +351,17 @@ kernel void kernel_sum_rows(
 kernel void kernel_soft_max(
         device const float * src0,
         device const float * src1,
+        device const float * src2,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
         constant     float & scale,
-        threadgroup  float * buf [[threadgroup(0)]],
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]],
@@ -368,13 +373,26 @@ kernel void kernel_soft_max(
     device const float * psrc0 = src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;
     device const float * pmask = src1 != src0 ? src1 + i01*ne00 : nullptr;
+    device const float * ppos  = src2 != src0 ? src2            : nullptr;
     device       float * pdst  = dst  + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00;

+    float slope = 0.0f;
+
+    // ALiBi
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
     // parallel max
     float lmax = -INFINITY;

     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+        lmax = MAX(lmax, psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
     }

     // find the max value in the block
@@ -399,7 +417,7 @@ kernel void kernel_soft_max(
     // parallel sum
     float lsum = 0.0f;
     for (int i00 = tpitg; i00 < ne00; i00 += ntg) {
-        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+        const float exp_psrc0 = exp((psrc0[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
         lsum += exp_psrc0;
         pdst[i00] = exp_psrc0;
     }
@@ -437,12 +455,17 @@ kernel void kernel_soft_max(
 kernel void kernel_soft_max_4(
         device const float * src0,
         device const float * src1,
+        device const float * src2,
         device       float * dst,
         constant   int64_t & ne00,
         constant   int64_t & ne01,
         constant   int64_t & ne02,
         constant     float & scale,
-        threadgroup  float * buf [[threadgroup(0)]],
+        constant     float & max_bias,
+        constant     float & m0,
+        constant     float & m1,
+        constant  uint32_t & n_head_log2,
+        threadgroup  float * buf [[threadgroup(0)]],
         uint  tgpig[[threadgroup_position_in_grid]],
         uint  tpitg[[thread_position_in_threadgroup]],
         uint  sgitg[[simdgroup_index_in_threadgroup]],
@@ -454,13 +477,25 @@ kernel void kernel_soft_max_4(
     device const float4 * psrc4 = (device const float4 *)(src0 + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);
     device const float4 * pmask = src1 != src0 ? (device const float4 *)(src1 + i01*ne00) : nullptr;
+    device const float4 * ppos  = src2 != src0 ? (device const float4 *)(src2)            : nullptr;
     device       float4 * pdst4 = (device float4 *)(dst + i03*ne02*ne01*ne00 + i02*ne01*ne00 + i01*ne00);

+    float slope = 0.0f;
+
+    if (max_bias > 0.0f) {
+        const int64_t h = i02;
+
+        const float base = h < n_head_log2 ? m0 : m1;
+        const int   exp  = h < n_head_log2 ? h + 1 : 2*(h - n_head_log2) + 1;
+
+        slope = pow(base, exp);
+    }
+
     // parallel max
     float4 lmax4 = -INFINITY;

     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f));
+        lmax4 = fmax(lmax4, psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]);
     }

     const float lmax = MAX(MAX(lmax4[0], lmax4[1]), MAX(lmax4[2], lmax4[3]));
@@ -486,7 +521,7 @@ kernel void kernel_soft_max_4(
     // parallel sum
     float4 lsum4 = 0.0f;
     for (int i00 = tpitg; i00 < ne00/4; i00 += ntg) {
-        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f)) - max_val);
+        const float4 exp_psrc4 = exp((psrc4[i00]*scale + (pmask ? pmask[i00] : 0.0f) + slope*ppos[i00]) - max_val);
         lsum4 += exp_psrc4;
         pdst4[i00] = exp_psrc4;
     }
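The new `max_bias`/`m0`/`m1`/`n_head_log2` kernel arguments implement ALiBi: each attention head `h` gets a fixed slope, and `slope * pos[i]` is added to the logits before the softmax. A C# rendering of the per-head slope rule used by both kernels above (illustrative; `m0` and `m1` are passed in from the host, and the values below assume the conventional `max_bias = 8` setup):

```csharp
using System;

// slope(h) = m0^(h+1)                     for h <  n_head_log2
//          = m1^(2*(h - n_head_log2) + 1) for h >= n_head_log2
static float AlibiSlope(int h, int nHeadLog2, float m0, float m1)
{
    float b = h < nHeadLog2 ? m0 : m1;
    int exp = h < nHeadLog2 ? h + 1 : 2 * (h - nHeadLog2) + 1;
    return MathF.Pow(b, exp);
}

// Host-side values, assuming max_bias = 8.0f and n_head = 32;
// n_head_log2 is the largest power of two <= n_head.
int nHeadLog2 = 32;
float m0 = MathF.Pow(2.0f, -8.0f / nHeadLog2);       // 2^(-max_bias     / n_head_log2)
float m1 = MathF.Pow(2.0f, -8.0f / (2 * nHeadLog2)); // 2^(-(max_bias/2) / n_head_log2)

for (int h = 0; h < 4; h++)
    Console.WriteLine($"head {h}: slope = {AlibiSlope(h, nHeadLog2, m0, m1)}");
```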
diff --git a/LLama/runtimes/deps/osx-arm64/libllama.dylib b/LLama/runtimes/deps/osx-arm64/libllama.dylib
index 853998a7b..1cf020462 100644
Binary files a/LLama/runtimes/deps/osx-arm64/libllama.dylib and b/LLama/runtimes/deps/osx-arm64/libllama.dylib differ
diff --git a/LLama/runtimes/deps/osx-x64/libllama.dylib b/LLama/runtimes/deps/osx-x64/libllama.dylib
index 208bfe846..53aa7980c 100644
Binary files a/LLama/runtimes/deps/osx-x64/libllama.dylib and b/LLama/runtimes/deps/osx-x64/libllama.dylib differ
diff --git a/LLama/runtimes/deps/vulkan/libllama.so b/LLama/runtimes/deps/vulkan/libllama.so
new file mode 100644
index 000000000..2aee9edd9
Binary files /dev/null and b/LLama/runtimes/deps/vulkan/libllama.so differ
diff --git a/LLama/runtimes/deps/vulkan/llama.dll b/LLama/runtimes/deps/vulkan/llama.dll
new file mode 100644
index 000000000..e03fdcdf9
Binary files /dev/null and b/LLama/runtimes/deps/vulkan/llama.dll differ
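Putting the two Metal kernels in plain terms: each row is a numerically stable softmax over `z[i] = src0[i]*scale + mask[i] + slope*pos[i]`, with the row maximum subtracted before exponentiation so `exp` never overflows. A scalar C# reference of that computation (illustrative; the kernels parallelize the max/sum reductions across the threadgroup, and `kernel_soft_max_4` additionally vectorizes by four):

```csharp
using System;

// dst[i] = exp(z[i] - max z) / sum_j exp(z[j] - max z),
// with z[i] = src0[i]*scale + mask[i] + slope*pos[i] (mask/pos optional).
static float[] SoftMax(float[] src0, float[] mask, float[] pos, float scale, float slope)
{
    var z = new float[src0.Length];
    var max = float.NegativeInfinity;
    for (int i = 0; i < src0.Length; i++)
    {
        z[i] = src0[i] * scale
             + (mask != null ? mask[i] : 0.0f)
             + slope * (pos != null ? pos[i] : 0.0f);
        max = MathF.Max(max, z[i]);
    }

    var dst = new float[src0.Length];
    var sum = 0.0f;
    for (int i = 0; i < src0.Length; i++)
    {
        dst[i] = MathF.Exp(z[i] - max); // max subtracted for numerical stability
        sum += dst[i];
    }
    for (int i = 0; i < dst.Length; i++)
        dst[i] /= sum;

    return dst;
}

// e.g. plain softmax over three logits: ~0.09, 0.24, 0.67
Console.WriteLine(string.Join(", ", SoftMax(new[] { 1f, 2f, 3f }, null, null, 1f, 0f)));
```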