Skip to content

Commit d9fec81

Browse files
committed
vLLM v0.8.5
Signed-off-by: Javier <[email protected]>
1 parent 997e9e0 commit d9fec81

File tree

4 files changed

+141
-8
lines changed

4 files changed

+141
-8
lines changed

cmake/external_projects/flashmla.cmake

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,12 @@ endif()
3030
FetchContent_MakeAvailable(flashmla)
3131
message(STATUS "FlashMLA is available at ${flashmla_SOURCE_DIR}")
3232

if(WIN32)
  # The CUTLASS headers vendored inside the fetched FlashMLA tree guard C++17
  # code with a bare `#if (201703L <= __cplusplus)` check, which MSVC fails
  # unless /Zc:__cplusplus is passed; fix_cutlass_msvc.py rewrites that guard
  # to also accept _MSC_VER.  The script is idempotent (it checks for the
  # un-patched pattern first), so re-running configure is safe.
  #
  # NOTE(review): assumes this file is include()d from the top-level
  # CMakeLists so CMAKE_CURRENT_SOURCE_DIR is the repository root where
  # fix_cutlass_msvc.py lives — confirm.
  #
  # FindPythonInterp is deprecated since CMake 3.12; use FindPython3 and make
  # the interpreter REQUIRED instead of failing later with an empty variable.
  find_package(Python3 COMPONENTS Interpreter REQUIRED)
  execute_process(
    COMMAND ${Python3_EXECUTABLE}
            ${CMAKE_CURRENT_SOURCE_DIR}/fix_cutlass_msvc.py
            ${flashmla_SOURCE_DIR}
    RESULT_VARIABLE _flashmla_msvc_fix_result)
  # The original invocation ignored failures; surface them at configure time.
  if(NOT _flashmla_msvc_fix_result EQUAL 0)
    message(FATAL_ERROR
            "fix_cutlass_msvc.py failed for ${flashmla_SOURCE_DIR}")
  endif()
endif()
3339
# The FlashMLA kernels only work on hopper and require CUDA 12.3 or later.
3440
# Only build FlashMLA kernels if we are building for something compatible with
3541
# sm90a

cmake/external_projects/vllm_flash_attn.cmake

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,12 @@ endif()
5050
FetchContent_MakeAvailable(vllm-flash-attn)
5151
message(STATUS "vllm-flash-attn is available at ${vllm-flash-attn_SOURCE_DIR}")
5252

if(WIN32)
  # Patch the CUTLASS headers fetched with vllm-flash-attn so their C++17
  # `__cplusplus` guard also passes under MSVC (which reports an old value
  # without /Zc:__cplusplus).  fix_cutlass_msvc.py only rewrites the header
  # when the un-patched pattern is present, so this step is idempotent.
  #
  # NOTE(review): assumes CMAKE_CURRENT_SOURCE_DIR is the repo root where
  # fix_cutlass_msvc.py lives (this file being include()d from the top-level
  # CMakeLists) — confirm.
  #
  # Replaces the deprecated find_package(PythonInterp) / redundant
  # find_package(Python) pair with FindPython3, and checks the exit status
  # instead of silently ignoring a failed patch.
  find_package(Python3 COMPONENTS Interpreter REQUIRED)
  execute_process(
    COMMAND ${Python3_EXECUTABLE}
            ${CMAKE_CURRENT_SOURCE_DIR}/fix_cutlass_msvc.py
            ${vllm-flash-attn_SOURCE_DIR}
    RESULT_VARIABLE _vllm_fa_msvc_fix_result)
  if(NOT _vllm_fa_msvc_fix_result EQUAL 0)
    message(FATAL_ERROR
            "fix_cutlass_msvc.py failed for ${vllm-flash-attn_SOURCE_DIR}")
  endif()
endif()
5359
# Copy over the vllm-flash-attn python files (duplicated for fa2 and fa3, in
5460
# case only one is built, in the case both are built redundant work is done)
5561
install(

csrc/moe/marlin_moe_wna16/ops.cu

Lines changed: 116 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -405,33 +405,141 @@ bool is_valid_config(thread_config_t const& th_config, int thread_m_blocks,
405405
NUM_THREADS, true)
406406

407407
// Probe the GPTQ kU4B8 kernel table for the single-row (m == 1) case.
// Returns true ("skipped") when no table entry matched, so the caller can
// try the next table; returns false when a kernel was selected.
// NOTE(review): the GPTQ_GET_IF_M1 macros are defined elsewhere in this
// file; from the syntax here they must expand to `else if (...) {...}`
// branches, and presumably assign the selected kernel through `kernel` —
// confirm against the macro definition.
template <typename scalar_t>
bool gptq_marlin_m1_u4b8(
    MarlinFuncPtr& kernel, const vllm::ScalarType q_type, int thread_m_blocks,
    int thread_n_blocks, int thread_k_blocks, bool m_block_size_8,
    bool has_act_order, bool has_zp, int group_blocks, int num_threads,
    bool is_zp_float) {
  bool skipped = false;
  // Empty `if (false)` anchors the `else if` chain emitted by the macros.
  if (false) {
  }
  GPTQ_GET_IF_M1(vllm::kU4B8, 8, 8, 256)
  GPTQ_GET_IF_M1(vllm::kU4B8, 8, 4, 128)
  else {
    // No (thread_n, thread_k, num_threads) configuration matched.
    skipped = true;
  }
  return skipped;
}
420423

424+
// Probe the GPTQ kU4B8 kernel table for the multi-row (m in {2,3,4} blocks)
// case.  Mirrors gptq_marlin_m1_u4b8 but uses the M234 macro table.
// Returns true ("skipped") when nothing matched; false when a kernel was
// selected.  NOTE(review): GPTQ_GET_IF_M234 is defined elsewhere — by the
// surrounding syntax it must emit `else if` branches; presumably it writes
// the match through `kernel`.  Confirm against the macro definition.
template <typename scalar_t>
bool gptq_marlin_m234_u4b8(
    MarlinFuncPtr& kernel, const vllm::ScalarType q_type, int thread_m_blocks,
    int thread_n_blocks, int thread_k_blocks, bool m_block_size_8,
    bool has_act_order, bool has_zp, int group_blocks, int num_threads,
    bool is_zp_float) {
  bool skipped = false;
  // Anchor for the macro-generated `else if` chain.
  if (false) {
  }
  GPTQ_GET_IF_M234(vllm::kU4B8, 16, 4, 256)
  GPTQ_GET_IF_M234(vllm::kU4B8, 8, 4, 128)
  else {
    skipped = true;
  }
  return skipped;
}
423440

441+
// Probe the GPTQ kU8B128 kernel table for the m == 1 case.  Same contract as
// gptq_marlin_m1_u4b8: returns true when skipped (no match), false when a
// kernel was selected via the GPTQ_GET_IF_M1 macro branches.
template <typename scalar_t>
bool gptq_marlin_m1_u8b128(
    MarlinFuncPtr& kernel, const vllm::ScalarType q_type, int thread_m_blocks,
    int thread_n_blocks, int thread_k_blocks, bool m_block_size_8,
    bool has_act_order, bool has_zp, int group_blocks, int num_threads,
    bool is_zp_float) {
  bool skipped = false;
  // Anchor for the macro-generated `else if` chain.
  if (false) {
  }
  GPTQ_GET_IF_M1(vllm::kU8B128, 8, 8, 256)
  GPTQ_GET_IF_M1(vllm::kU8B128, 8, 4, 128)
  else {
    skipped = true;
  }
  return skipped;
}
426457

458+
// Probe the GPTQ kU8B128 kernel table for the multi-row case.  Same contract
// as the other probe helpers: true means "skipped" (no match), false means a
// kernel was selected by one of the GPTQ_GET_IF_M234 macro branches.
template <typename scalar_t>
bool gptq_marlin_m234_u8b128(
    MarlinFuncPtr& kernel, const vllm::ScalarType q_type, int thread_m_blocks,
    int thread_n_blocks, int thread_k_blocks, bool m_block_size_8,
    bool has_act_order, bool has_zp, int group_blocks, int num_threads,
    bool is_zp_float) {
  bool skipped = false;
  // Anchor for the macro-generated `else if` chain.
  if (false) {
  }
  GPTQ_GET_IF_M234(vllm::kU8B128, 16, 4, 256)
  GPTQ_GET_IF_M234(vllm::kU8B128, 8, 4, 128)
  else {
    skipped = true;
  }
  return skipped;
}
429474

475+
// Probe the AWQ kU4 kernel table for the m == 1 case.  Same contract as the
// GPTQ probes: true means "skipped" (no match), false means a kernel was
// selected by an AWQ_GET_IF_M1 macro branch.
template <typename scalar_t>
bool awq_marlin_m1_u4(
    MarlinFuncPtr& kernel, const vllm::ScalarType q_type, int thread_m_blocks,
    int thread_n_blocks, int thread_k_blocks, bool m_block_size_8,
    bool has_act_order, bool has_zp, int group_blocks, int num_threads,
    bool is_zp_float) {
  bool skipped = false;
  // Anchor for the macro-generated `else if` chain.
  if (false) {
  }
  AWQ_GET_IF_M1(vllm::kU4, 8, 8, 256)
  AWQ_GET_IF_M1(vllm::kU4, 8, 4, 128)
  else {
    skipped = true;
  }
  return skipped;
}
432491

492+
// Probe the AWQ kU4 kernel table for the multi-row case.  Same contract as
// the other probe helpers: true means "skipped" (no match), false means a
// kernel was selected by an AWQ_GET_IF_M234 macro branch.
template <typename scalar_t>
bool awq_marlin_m234_u4(
    MarlinFuncPtr& kernel, const vllm::ScalarType q_type, int thread_m_blocks,
    int thread_n_blocks, int thread_k_blocks, bool m_block_size_8,
    bool has_act_order, bool has_zp, int group_blocks, int num_threads,
    bool is_zp_float) {
  bool skipped = false;
  // Anchor for the macro-generated `else if` chain.
  if (false) {
  }
  AWQ_GET_IF_M234(vllm::kU4, 16, 4, 256)
  AWQ_GET_IF_M234(vllm::kU4, 8, 4, 128)
  else {
    skipped = true;
  }
  return skipped;
}
508+
509+
// Select the marlin kernel function pointer for the given quantization type
// and launch configuration.
//
// Each probe helper (gptq_/awq_marlin_*) scans one kernel table: on a miss it
// returns true ("skipped"); on a hit it returns false after — per the GET_IF
// macros, defined elsewhere — writing the chosen kernel through its first
// (reference) argument.  The && chain below therefore relies on short-circuit
// evaluation: the first probe that matches stops the chain.  When every table
// is skipped, MarlinDefault is returned unchanged.
template <typename scalar_t>
MarlinFuncPtr get_marlin_kernel(const vllm::ScalarType q_type,
                                int thread_m_blocks, int thread_n_blocks,
                                int thread_k_blocks, bool m_block_size_8,
                                bool has_act_order, bool has_zp,
                                int group_blocks, int num_threads,
                                bool is_zp_float) {
  auto kernel = MarlinDefault;

  // Dropped the unused `int num_bits = q_type.size_bits();` local left over
  // from the pre-refactor version: the probes consume q_type directly, so it
  // only produced an unused-variable warning.
  bool all_skipped =
      gptq_marlin_m1_u4b8<scalar_t>(
          kernel, q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks,
          m_block_size_8, has_act_order, has_zp, group_blocks, num_threads,
          is_zp_float) &&
      gptq_marlin_m234_u4b8<scalar_t>(
          kernel, q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks,
          m_block_size_8, has_act_order, has_zp, group_blocks, num_threads,
          is_zp_float) &&
      gptq_marlin_m1_u8b128<scalar_t>(
          kernel, q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks,
          m_block_size_8, has_act_order, has_zp, group_blocks, num_threads,
          is_zp_float) &&
      gptq_marlin_m234_u8b128<scalar_t>(
          kernel, q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks,
          m_block_size_8, has_act_order, has_zp, group_blocks, num_threads,
          is_zp_float) &&
      awq_marlin_m1_u4<scalar_t>(
          kernel, q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks,
          m_block_size_8, has_act_order, has_zp, group_blocks, num_threads,
          is_zp_float) &&
      awq_marlin_m234_u4<scalar_t>(
          kernel, q_type, thread_m_blocks, thread_n_blocks, thread_k_blocks,
          m_block_size_8, has_act_order, has_zp, group_blocks, num_threads,
          is_zp_float);
  // Only the probes' writes to `kernel` matter; silence -Wunused-variable.
  (void)all_skipped;

  return kernel;
}

fix_cutlass_msvc.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
import os
import sys


def patch_cutlass_platform_header(source_root):
    """Patch CUTLASS's platform.h in a fetched source tree for MSVC.

    CUTLASS guards C++17 code with ``#if (201703L <=__cplusplus)``; MSVC
    reports an old ``__cplusplus`` value unless ``/Zc:__cplusplus`` is set,
    so the guard is widened to also accept ``_MSC_VER``.

    Args:
        source_root: Root of the fetched dependency; the header is expected
            at ``csrc/cutlass/include/cutlass/platform/platform.h`` below it.

    Returns:
        True if the header was rewritten, False if it was absent or already
        patched (the un-patched pattern is gone after one rewrite, which
        makes this function idempotent).
    """
    platform_h_file = os.path.join(source_root, "csrc", "cutlass", "include",
                                   "cutlass", "platform", "platform.h")
    if not os.path.exists(platform_h_file):
        return False

    with open(platform_h_file, mode="r", encoding="utf-8") as file:
        header_content = file.read()

    if "\n#if (201703L <=__cplusplus)\n" not in header_content:
        return False

    header_content = header_content.replace(
        "#if (201703L <=__cplusplus)",
        "#if defined(_MSC_VER) || (201703L <=__cplusplus)")
    with open(platform_h_file, mode="w", encoding="utf-8") as file:
        file.write(header_content)
    return True


if __name__ == "__main__":
    # Invoked from CMake as: python fix_cutlass_msvc.py <source-tree-root>
    if len(sys.argv) < 2:
        sys.exit("usage: fix_cutlass_msvc.py <source-tree-root>")
    patch_cutlass_platform_header(sys.argv[1])

0 commit comments

Comments
 (0)