diff --git a/csrc/attention/attention_kernels.cu b/csrc/attention/attention_kernels.cu
index 8955b503bdd1..3b308de72c08 100644
--- a/csrc/attention/attention_kernels.cu
+++ b/csrc/attention/attention_kernels.cu
@@ -144,6 +144,7 @@ __global__ void single_query_cached_kv_attention_kernel(
 
   const int* block_table = block_tables + seq_idx * max_num_blocks_per_seq;
   const int context_len = context_lens[seq_idx];
+  const int kv_head_idx = kv_head_mapping[head_idx];
   const int num_blocks = (context_len + BLOCK_SIZE - 1) / BLOCK_SIZE;
 
   // Iterate over the key blocks.
diff --git a/setup.py b/setup.py
index 8b2ad97dd540..2ff9890d951e 100644
--- a/setup.py
+++ b/setup.py
@@ -128,7 +128,7 @@ def get_torch_arch_list() -> Set[str]:
 
 # Cache operations.
 cache_extension = CUDAExtension(
-    name="vllm.cache_ops",
+    name="vllm_cache_ops",
     sources=["csrc/cache.cpp", "csrc/cache_kernels.cu"],
     extra_compile_args={
         "cxx": CXX_FLAGS,
@@ -139,7 +139,7 @@ def get_torch_arch_list() -> Set[str]:
 
 # Attention kernels.
 attention_extension = CUDAExtension(
-    name="vllm.attention_ops",
+    name="vllm_attention_ops",
     sources=["csrc/attention.cpp", "csrc/attention/attention_kernels.cu"],
     extra_compile_args={
         "cxx": CXX_FLAGS,
@@ -148,63 +148,63 @@ def get_torch_arch_list() -> Set[str]:
 )
 ext_modules.append(attention_extension)
 
-# Positional encoding kernels.
-positional_encoding_extension = CUDAExtension(
-    name="vllm.pos_encoding_ops",
-    sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(positional_encoding_extension)
-
-# Layer normalization kernels.
-layernorm_extension = CUDAExtension(
-    name="vllm.layernorm_ops",
-    sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(layernorm_extension)
-
-# Activation kernels.
-activation_extension = CUDAExtension(
-    name="vllm.activation_ops",
-    sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(activation_extension)
-
-# Quantization kernels.
-quantization_extension = CUDAExtension(
-    name="vllm.quantization_ops",
-    sources=[
-        "csrc/quantization.cpp",
-        "csrc/quantization/awq/gemm_kernels.cu",
-    ],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(quantization_extension)
-
-# Misc. CUDA utils.
-cuda_utils_extension = CUDAExtension(
-    name="vllm.cuda_utils",
-    sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
-    extra_compile_args={
-        "cxx": CXX_FLAGS,
-        "nvcc": NVCC_FLAGS,
-    },
-)
-ext_modules.append(cuda_utils_extension)
+# # Positional encoding kernels.
+# positional_encoding_extension = CUDAExtension(
+#     name="vllm.pos_encoding_ops",
+#     sources=["csrc/pos_encoding.cpp", "csrc/pos_encoding_kernels.cu"],
+#     extra_compile_args={
+#         "cxx": CXX_FLAGS,
+#         "nvcc": NVCC_FLAGS,
+#     },
+# )
+# ext_modules.append(positional_encoding_extension)
+
+# # Layer normalization kernels.
+# layernorm_extension = CUDAExtension(
+#     name="vllm.layernorm_ops",
+#     sources=["csrc/layernorm.cpp", "csrc/layernorm_kernels.cu"],
+#     extra_compile_args={
+#         "cxx": CXX_FLAGS,
+#         "nvcc": NVCC_FLAGS,
+#     },
+# )
+# ext_modules.append(layernorm_extension)
+
+# # Activation kernels.
+# activation_extension = CUDAExtension(
+#     name="vllm.activation_ops",
+#     sources=["csrc/activation.cpp", "csrc/activation_kernels.cu"],
+#     extra_compile_args={
+#         "cxx": CXX_FLAGS,
+#         "nvcc": NVCC_FLAGS,
+#     },
+# )
+# ext_modules.append(activation_extension)
+
+# # Quantization kernels.
+# quantization_extension = CUDAExtension(
+#     name="vllm.quantization_ops",
+#     sources=[
+#         "csrc/quantization.cpp",
+#         "csrc/quantization/awq/gemm_kernels.cu",
+#     ],
+#     extra_compile_args={
+#         "cxx": CXX_FLAGS,
+#         "nvcc": NVCC_FLAGS,
+#     },
+# )
+# ext_modules.append(quantization_extension)
+
+# # Misc. CUDA utils.
+# cuda_utils_extension = CUDAExtension(
+#     name="vllm.cuda_utils",
+#     sources=["csrc/cuda_utils.cpp", "csrc/cuda_utils_kernels.cu"],
+#     extra_compile_args={
+#         "cxx": CXX_FLAGS,
+#         "nvcc": NVCC_FLAGS,
+#     },
+# )
+# ext_modules.append(cuda_utils_extension)
 
 
 def get_path(*filepath) -> str:
@@ -238,7 +238,7 @@ def get_requirements() -> List[str]:
 
 setuptools.setup(
     name="vllm",
-    version=find_version(get_path("vllm", "__init__.py")),
+    # version=find_version(get_path("vllm", "__init__.py")),
     author="vLLM Team",
     license="Apache 2.0",
     description=("A high-throughput and memory-efficient inference and "
@@ -258,10 +258,10 @@ def get_requirements() -> List[str]:
         "License :: OSI Approved :: Apache Software License",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
-    packages=setuptools.find_packages(exclude=("benchmarks", "csrc", "docs",
-                                                "examples", "tests")),
+    # packages=setuptools.find_packages(
+    #     exclude=("assets", "benchmarks", "csrc", "docs", "examples", "tests")),
    python_requires=">=3.8",
-    install_requires=get_requirements(),
+    # install_requires=get_requirements(),
     ext_modules=ext_modules,
     cmdclass={"build_ext": BuildExtension},
 )
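
Not part of the patch itself: a minimal usage sketch of what the renamed extensions imply, assuming they are built in place and end up on sys.path. Since name="vllm.cache_ops" and name="vllm.attention_ops" become the top-level names "vllm_cache_ops" and "vllm_attention_ops", the compiled ops would be imported as standalone modules rather than as submodules of the vllm package. Only the module names come from the diff; the rest is illustrative.

# Minimal sketch, assuming the renamed extensions were built in place,
# e.g. via `python setup.py build_ext --inplace`, and are importable.
import vllm_attention_ops  # previously: from vllm import attention_ops
import vllm_cache_ops      # previously: from vllm import cache_ops

# List the bindings each standalone module exposes as a quick sanity check
# that the build produced loadable extension modules.
print([op for op in dir(vllm_attention_ops) if not op.startswith("_")])
print([op for op in dir(vllm_cache_ops) if not op.startswith("_")])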