csrc/attention/attention_kernels.cu (1 addition, 0 deletions)
@@ -144,6 +144,7 @@ __global__ void single_query_cached_kv_attention_kernel(

const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
const int context_len = context_lens[seq_idx];
+const int kv_head_idx = kv_head_mapping[head_idx];
const int num_blocks = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;

// Iterate over the key blocks.
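Note (not part of the diff): the added kv_head_idx lookup lets several query heads share a single key/value head, as in multi-query or grouped-query attention. Below is a minimal host-side sketch of how such a head-to-KV-head mapping could be constructed, assuming num_heads query heads and num_kv_heads KV heads; the variable names are illustrative, not taken from the repository.

import torch

num_heads = 32     # number of query heads (illustrative)
num_kv_heads = 8   # number of shared key/value heads (illustrative)

# Consecutive groups of query heads reuse the same KV head, so the mapping
# is just a repeated range: [0, 0, 0, 0, 1, 1, 1, 1, ...].
num_queries_per_kv = num_heads // num_kv_heads
kv_head_mapping = torch.arange(num_kv_heads, dtype=torch.int32)
kv_head_mapping = kv_head_mapping.repeat_interleave(num_queries_per_kv)

# The kernel's kv_head_mapping[head_idx] then selects which KV head a given
# query head reads from the paged KV cache.
assert kv_head_mapping[5].item() == 1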
setup.py (63 additions, 63 deletions)
@@ -128,7 +128,7 @@ def get_torch_arch_list() -> Set[str]:

# Cache operations.
cache_extension = CUDAExtension(
-name="vllm.cache_ops",
+name="vllm_cache_ops",
sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
@@ -139,7 +139,7 @@ def get_torch_arch_list() -> Set[str]:

# Attention kernels.
attention_extension = CUDAExtension(
-name="vllm.attention_ops",
+name="vllm_attention_ops",
sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
extra_compile_args={
"cxx": CXX_FLAGS,
@@ -148,63 +148,63 @@ def get_torch_arch_list() -> Set[str]:
)
ext_modules.append(attention_extension)

-# Positional encoding kernels.
-positional_encoding_extension = CUDAExtension(
-name="vllm.pos_encoding_ops",
-sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
-extra_compile_args={
-"cxx": CXX_FLAGS,
-"nvcc": NVCC_FLAGS,
-},
-)
-ext_modules.append(positional_encoding_extension)
-
-# Layer normalization kernels.
-layernorm_extension = CUDAExtension(
-name="vllm.layernorm_ops",
-sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
-extra_compile_args={
-"cxx": CXX_FLAGS,
-"nvcc": NVCC_FLAGS,
-},
-)
-ext_modules.append(layernorm_extension)
-
-# Activation kernels.
-activation_extension = CUDAExtension(
-name="vllm.activation_ops",
-sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
-extra_compile_args={
-"cxx": CXX_FLAGS,
-"nvcc": NVCC_FLAGS,
-},
-)
-ext_modules.append(activation_extension)
-
-# Quantization kernels.
-quantization_extension = CUDAExtension(
-name="vllm.quantization_ops",
-sources=[
-"csrc/quantization.cpp",
-"csrc/quantization/awq/gemm_kernels.cu",
-],
-extra_compile_args={
-"cxx": CXX_FLAGS,
-"nvcc": NVCC_FLAGS,
-},
-)
-ext_modules.append(quantization_extension)
-
-# Misc. CUDA utils.
-cuda_utils_extension = CUDAExtension(
-name="vllm.cuda_utils",
-sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
-extra_compile_args={
-"cxx": CXX_FLAGS,
-"nvcc": NVCC_FLAGS,
-},
-)
-ext_modules.append(cuda_utils_extension)
+# # Positional encoding kernels.
+# positional_encoding_extension = CUDAExtension(
+# name="vllm.pos_encoding_ops",
+# sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
+# extra_compile_args={
+# "cxx": CXX_FLAGS,
+# "nvcc": NVCC_FLAGS,
+# },
+# )
+# ext_modules.append(positional_encoding_extension)
+
+# # Layer normalization kernels.
+# layernorm_extension = CUDAExtension(
+# name="vllm.layernorm_ops",
+# sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
+# extra_compile_args={
+# "cxx": CXX_FLAGS,
+# "nvcc": NVCC_FLAGS,
+# },
+# )
+# ext_modules.append(layernorm_extension)
+
+# # Activation kernels.
+# activation_extension = CUDAExtension(
+# name="vllm.activation_ops",
+# sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
+# extra_compile_args={
+# "cxx": CXX_FLAGS,
+# "nvcc": NVCC_FLAGS,
+# },
+# )
+# ext_modules.append(activation_extension)
+
+# # Quantization kernels.
+# quantization_extension = CUDAExtension(
+# name="vllm.quantization_ops",
+# sources=[
+# "csrc/quantization.cpp",
+# "csrc/quantization/awq/gemm_kernels.cu",
+# ],
+# extra_compile_args={
+# "cxx": CXX_FLAGS,
+# "nvcc": NVCC_FLAGS,
+# },
+# )
+# ext_modules.append(quantization_extension)
+
+# # Misc. CUDA utils.
+# cuda_utils_extension = CUDAExtension(
+# name="vllm.cuda_utils",
+# sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
+# extra_compile_args={
+# "cxx": CXX_FLAGS,
+# "nvcc": NVCC_FLAGS,
+# },
+# )
+# ext_modules.append(cuda_utils_extension)


def get_path(*filepath) -> str:
@@ -238,7 +238,7 @@ def get_requirements() -> List[str]:

setuptools.setup(
name="vllm",
-version=find_version(get_path("vllm", "__init__.py")),
+# version=find_version(get_path("vllm", "__init__.py")),
author="vLLM Team",
license="Apache 2.0",
description=("A high-throughput and memory-efficient inference and "
@@ -258,10 +258,10 @@ def get_requirements() -> List[str]:
"License :: OSI Approved :: Apache Software License",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
-packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
-"examples", "tests")),
+# packages=setuptools.find_packages(
+# exclude=("assets", "benchmarks", "csrc", "docs", "examples", "tests")),
python_requires=">=3.8",
-install_requires=get_requirements(),
+# install_requires=get_requirements(),
ext_modules=ext_modules,
cmdclass={"build_ext": BuildExtension},
)
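Note (not part of the diff): with this setup.py, only the cache and attention extensions are compiled, and they are built as top-level modules (vllm_cache_ops, vllm_attention_ops) rather than as submodules of the vllm package; the commented-out version, packages, and install_requires fields likewise suggest a kernels-only build that does not package the Python sources. A rough sketch of how import sites would change, assuming callers previously imported the extensions through the package (an assumption, not shown in this diff):

# Before (assumed): extensions were installed inside the vllm package.
# from vllm import attention_ops, cache_ops

# After this change: the compiled extensions are standalone top-level modules.
import vllm_attention_ops  # built from csrc/attention.cpp + attention_kernels.cu
import vllm_cache_ops      # built from csrc/cache.cpp + cache_kernels.cu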