diff --git a/tests/singlecard/spec_decode/test_spec_decode_worker.py b/tests/singlecard/spec_decode/test_spec_decode_worker.py
index b44a1f3784..442041a3a4 100644
--- a/tests/singlecard/spec_decode/test_spec_decode_worker.py
+++ b/tests/singlecard/spec_decode/test_spec_decode_worker.py
@@ -589,7 +589,6 @@ def test_empty_input_batch(k: int, batch_size: int,
 
 @pytest.mark.parametrize("acceptance_sampler_method",
                          ["rejection_sampler", "typical_acceptance_sampler"])
-@pytest.mark.skip_global_cleanup
 def test_init_device(acceptance_sampler_method: str):
     """Verify SpecDecodeWorker invokes proposer/scorer worker init_device, as
     well as other GPU initialization.
@@ -646,7 +645,6 @@ def test_initialize_cache(acceptance_sampler_method):
 @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
 @pytest.mark.parametrize("acceptance_sampler_method",
                          ["rejection_sampler", "typical_acceptance_sampler"])
-@pytest.mark.skip_global_cleanup
 def test_determine_num_available_blocks(available_gpu_blocks: int,
                                         available_cpu_blocks: int,
                                         target_cache_block_size_bytes: int,
@@ -685,7 +683,6 @@ def test_determine_num_available_blocks(available_gpu_blocks: int,
 @pytest.mark.parametrize('target_cache_block_size_bytes',
                          [2 * 2 * 4096, 2 * 2 * 8192])
 @pytest.mark.parametrize('draft_kv_size_bytes', [0, 2 * 2 * 768, 2 * 2 * 4096])
-@pytest.mark.skip_global_cleanup
 def test_split_num_cache_blocks_evenly(available_gpu_blocks: int,
                                        target_cache_block_size_bytes: int,
                                        draft_kv_size_bytes: int):
diff --git a/vllm_ascend/__init__.py b/vllm_ascend/__init__.py
index c8f33313ba..24469928c4 100644
--- a/vllm_ascend/__init__.py
+++ b/vllm_ascend/__init__.py
@@ -18,7 +18,6 @@
 
 def register():
     """Register the NPU platform."""
-
     return "vllm_ascend.platform.NPUPlatform"
 
 
@@ -26,6 +25,5 @@ def register_model():
     # fix pytorch schema check error, remove this line after pytorch
     # is upgraded to 2.7.0
     import vllm_ascend.patch.worker.patch_common.patch_utils  # noqa: F401
-
-    from .models import register_model
+    from vllm_ascend.models import register_model
     register_model()
diff --git a/vllm_ascend/models/__init__.py b/vllm_ascend/models/__init__.py
index 42960f17eb..0fe0361780 100644
--- a/vllm_ascend/models/__init__.py
+++ b/vllm_ascend/models/__init__.py
@@ -2,12 +2,15 @@
 
 
 def register_model():
-    from .deepseek_mtp import CustomDeepSeekMTP  # noqa: F401
-    from .deepseek_v2 import CustomDeepseekV2ForCausalLM  # noqa: F401
-    from .deepseek_v2 import CustomDeepseekV3ForCausalLM  # noqa: F401
-    from .qwen2_5_vl import \
+    from vllm_ascend.models.deepseek_mtp import CustomDeepSeekMTP  # noqa: F401
+    from vllm_ascend.models.deepseek_v2 import \
+        CustomDeepseekV2ForCausalLM  # noqa: F401
+    from vllm_ascend.models.deepseek_v2 import \
+        CustomDeepseekV3ForCausalLM  # noqa: F401
+    from vllm_ascend.models.qwen2_5_vl import \
         AscendQwen2_5_VLForConditionalGeneration  # noqa: F401
-    from .qwen2_vl import AscendQwen2VLForConditionalGeneration  # noqa: F401
+    from vllm_ascend.models.qwen2_vl import \
+        AscendQwen2VLForConditionalGeneration  # noqa: F401
 
     ModelRegistry.register_model(
         "DeepSeekMTPModel",
diff --git a/vllm_ascend/models/deepseek_mtp.py b/vllm_ascend/models/deepseek_mtp.py
index 979a6099f1..3181274acc 100644
--- a/vllm_ascend/models/deepseek_mtp.py
+++ b/vllm_ascend/models/deepseek_mtp.py
@@ -37,7 +37,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.sequence import IntermediateTensors
 
-from .deepseek_v2 import CustomDeepseekV2DecoderLayer
+from vllm_ascend.models.deepseek_v2 import CustomDeepseekV2DecoderLayer
 
 
 class CustomDeepSeekMultiTokenPredictorLayer(DeepSeekMultiTokenPredictorLayer):
diff --git a/vllm_ascend/patch/__init__.py b/vllm_ascend/patch/__init__.py
index 445a167b08..83973c213f 100644
--- a/vllm_ascend/patch/__init__.py
+++ b/vllm_ascend/patch/__init__.py
@@ -88,16 +88,39 @@
 #
 # * Worker Patch:
 # ===============
+# ** File: worker/patch_common/patch_utils.py **
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.utils.direct_register_custom_op`
+#    Why:
+#       direct_register_custom_op requires pytorch version >= 2.7.0,
+#       but vllm-ascend only support pytorch version 2.5.1
+#    How:
+#       Convert annotation type to typing type for 2.5.1 backward compatibility
+#    Related PR (if no, explain why):
+#       No related PR, it's the change in vllm-ascend.
+#    Future Plan:
+#       Update pytorch and torch-npu to 2.7.0 in the future.
+# ** File: worker/patch_common/patch_cache_engine.py **
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# 1. `vllm.worker.cache_engine.CacheEngine._allocate_kv_cache`
+#    Why:
+#       Add graph_mode optimization for kv cache allocation.
+#    How:
+#       If graph_mode is enabled, add layer_kv_cache_nope and layer_kv_cache_pe to the kv_cache.
+#    Related PR (if no, explain why):
+#       Need a PR to vllm to fix the issue.
+#    Future Plan:
+#       Revert it when the related pr is merged in vllm.
 # ** File: worker/patch_common/patch_metrics.py **
 # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-# 1. `vllm.spec_decode.metrics.AsyncMetricsCollector.maybe_collect_rejsample_metrics`
+# 1. `vllm.spec_decode.metrics.AsyncMetricsCollector._copy_rejsample_metrics_async`
 #    Why:
 #       There are cuda hard code (current_platform.is_cuda_alike()) in
-#       `AsyncMetricsCollector.maybe_collect_rejsample_metrics`
+#       `AsyncMetricsCollector._copy_rejsample_metrics_async`
 #    How:
 #       Change to use `current_platform.Event` to determine whether to return None
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
-#       https://github.com/vllm-project/vllm/pull/14411
+#    Related PR (if no, explain why):
+#       Need a PR to vllm to fix the issue.
 #    Future Plan:
 #       Revert it when the related pr is merged in vllm.
 #
@@ -110,7 +133,7 @@
 #       However float32 is not supported in cann rope op, thus we keep this patch
 #    How:
 #       Removed the dtype convert operations in forward
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
+#    Related PR (if no, explain why):
 #       NO, only for npu due to rope op.
 #    Future Plan:
 #       Keep this patch in vllm-ascend.
@@ -126,7 +149,7 @@
 #       - support attention metadata register to the set supported spec decode
 #       - offer a api in platform to determine whether spec decode is supported,
 #         and deprecate is_cuda_alike in it.
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
+#    Related PR (if no, explain why):
 #       - https://github.com/vllm-project/vllm/pull/15195
 #       - https://github.com/vllm-project/vllm-ascend/pull/395
 #    Future Plan:
@@ -138,7 +161,7 @@
 #       vLLM `Remove Sampler from Model Code` so vllm-ascend needs adapt to this change.
 #    How:
 #       Use vLLM 0.8.4 method to patch it.
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
+#    Related PR (if no, explain why):
 #       - https://github.com/vllm-project/vllm/pull/15195
 #       - https://github.com/vllm-project/vllm-ascend/pull/395
 #    Future Plan:
@@ -153,7 +176,7 @@
 #       `FlashAttentionMetadata`
 #    How:
 #       ditto
-#    Related PR (if no, explain why): 1. refused by vllm. 2. vllm doesn't support 3. prepare to submit....
+#    Related PR (if no, explain why):
 #       - https://github.com/vllm-project/vllm/pull/15195
 #       - https://github.com/vllm-project/vllm-ascend/pull/395
 #    Future Plan:
diff --git a/vllm_ascend/patch/worker/patch_common/__init__.py b/vllm_ascend/patch/worker/patch_common/__init__.py
index 9369596f81..e8708435d4 100644
--- a/vllm_ascend/patch/worker/patch_common/__init__.py
+++ b/vllm_ascend/patch/worker/patch_common/__init__.py
@@ -18,6 +18,7 @@
 # patch_utils should be the first import, because it will be used by other
 # patch files.
 import vllm_ascend.patch.worker.patch_common.patch_utils  # noqa isort:skip
+import vllm_ascend.patch.worker.patch_common.patch_cache_engine  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_distributed  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_metrics  # noqa
 import vllm_ascend.patch.worker.patch_common.patch_minicpm  # noqa
diff --git a/vllm_ascend/worker/cache_engine.py b/vllm_ascend/patch/worker/patch_common/patch_cache_engine.py
similarity index 96%
rename from vllm_ascend/worker/cache_engine.py
rename to vllm_ascend/patch/worker/patch_common/patch_cache_engine.py
index 72de201f1d..8a313352e0 100644
--- a/vllm_ascend/worker/cache_engine.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_cache_engine.py
@@ -1,7 +1,5 @@
 #
 # Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
-# This file is a part of the vllm-ascend project.
-# Adapted from vllm-project/vllm/vllm/worker/model_runner.py
 # Copyright 2023 The vLLM team.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
diff --git a/vllm_ascend/patch/worker/patch_common/patch_metrics.py b/vllm_ascend/patch/worker/patch_common/patch_metrics.py
index 6d1f2dc0a9..b97618f2b6 100644
--- a/vllm_ascend/patch/worker/patch_common/patch_metrics.py
+++ b/vllm_ascend/patch/worker/patch_common/patch_metrics.py
@@ -15,13 +15,9 @@
 # limitations under the License.
 #
 
-from typing import Callable
-
 import torch
 from vllm.spec_decode.metrics import AsyncMetricsCollector
 
-Timer = Callable[[], float]
-
 
 def _copy_rejsample_metrics_async(self) -> torch.npu.Event:
     """
diff --git a/vllm_ascend/quantization/quant_config.py b/vllm_ascend/quantization/quant_config.py
index 1aededd322..84f70c9816 100644
--- a/vllm_ascend/quantization/quant_config.py
+++ b/vllm_ascend/quantization/quant_config.py
@@ -38,8 +38,7 @@
 from vllm.model_executor.utils import set_weight_attrs
 
 from vllm_ascend.ops.fused_moe import AscendUnquantizedFusedMoEMethod
-
-from .quantizer import AscendQuantizer
+from vllm_ascend.quantization.quantizer import AscendQuantizer
 
 
 @register_quantization_config("ascend")
diff --git a/vllm_ascend/quantization/quantizer.py b/vllm_ascend/quantization/quantizer.py
index ea1297bf35..e883644591 100644
--- a/vllm_ascend/quantization/quantizer.py
+++ b/vllm_ascend/quantization/quantizer.py
@@ -22,11 +22,12 @@
 
 from vllm.logger import logger
 
-from .func_wrapper import (wrapper_load_model, wrapper_rmsnorm_forward_oot,
-                           wrapper_rmsnorm_init)
-from .w8a8 import AscendW8A8LinearMethod
-from .w8a8_dynamic import (AscendW8A8DynamicFusedMoEMethod,
-                           AscendW8A8DynamicLinearMethod)
+from vllm_ascend.quantization.func_wrapper import (wrapper_load_model,
+                                                   wrapper_rmsnorm_forward_oot,
+                                                   wrapper_rmsnorm_init)
+from vllm_ascend.quantization.w8a8 import AscendW8A8LinearMethod
+from vllm_ascend.quantization.w8a8_dynamic import (
+    AscendW8A8DynamicFusedMoEMethod, AscendW8A8DynamicLinearMethod)
 
 CUSTOMIZED_QUANTIZER_TYPE: List[str] = []
 
diff --git a/vllm_ascend/utils.py b/vllm_ascend/utils.py
index 6a750da30b..4d5d8b43d6 100644
--- a/vllm_ascend/utils.py
+++ b/vllm_ascend/utils.py
@@ -24,7 +24,7 @@
 from packaging.version import InvalidVersion, Version
 from vllm.logger import logger
 
-import vllm_ascend.envs as envs
+from vllm_ascend import envs
 
 if TYPE_CHECKING:
     from vllm.config import VllmConfig
diff --git a/vllm_ascend/worker/__init__.py b/vllm_ascend/worker/__init__.py
index ee59a056ef..116c73c06c 100644
--- a/vllm_ascend/worker/__init__.py
+++ b/vllm_ascend/worker/__init__.py
@@ -14,4 +14,3 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #
-import vllm_ascend.worker.cache_engine  # noqa
\ No newline at end of file
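
Note on how the patch_common modules above take effect: importing such a module at worker start-up (see the import order in patch_common/__init__.py) applies the patch by rebinding a replacement function onto the upstream vllm class. The sketch below illustrates that pattern only; the function name and the torch.npu.Event return annotation come from the diff above, while the body and the rebinding shown here are assumptions, not the actual patch_metrics implementation.

    # Illustrative sketch of the import-time monkey-patch pattern (assumed;
    # the function body below is hypothetical, not taken from this diff).
    import torch
    from vllm.spec_decode.metrics import AsyncMetricsCollector


    def _copy_rejsample_metrics_async(self) -> torch.npu.Event:
        # Hypothetical body: record an NPU event instead of the CUDA event
        # that upstream vllm assumes via current_platform.is_cuda_alike().
        event = torch.npu.Event()
        event.record()
        return event


    # Rebinding the method on the upstream class is what makes the patch
    # apply to every AsyncMetricsCollector instance created afterwards.
    AsyncMetricsCollector._copy_rejsample_metrics_async = (
        _copy_rejsample_metrics_async)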