@@ -17,13 +17,11 @@
 # Adapted from vllm-project/vllm/vllm/worker/worker.py
 #
 
-import gc
 import os
 from typing import Dict, List, Optional, Set, Tuple, Type, Union
 
 import msgpack  # type: ignore
 import torch
-import torch.distributed
 import zmq
 from torch import nn
 from vllm import envs
@@ -209,9 +207,7 @@ def init_device(self) -> None:
         if self.device_config.device.type == "npu":
             self.device = torch.device(f"npu:{self.local_rank}")
             NPUPlatform.set_device(self.device)
-            gc.collect()
-            NPUPlatform.empty_cache()
-            torch.npu.reset_peak_memory_stats()
+            NPUPlatform.clear_npu_memory()
             self.init_npu_memory = NPUPlatform.mem_get_info()[0]
         else:
             raise RuntimeError(
@@ -278,9 +274,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
278274 """
279275 # Profile the memory usage of the model and get the maximum number of
280276 # cache blocks that can be allocated with the remaining free memory.
281- gc .collect ()
282- NPUPlatform .empty_cache ()
283- torch .npu .reset_peak_memory_stats ()
277+ NPUPlatform .clear_npu_memory ()
284278
285279 # Execute a forward pass with dummy inputs to profile the memory usage
286280 # of the model.
@@ -306,10 +300,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
             cache_block_size)
         num_npu_blocks = max(num_npu_blocks, 0)
         num_cpu_blocks = max(num_cpu_blocks, 0)
-        gc.collect()
-        # TODO: don't need to implement this func after empty_cache in
-        # Worker.determine_num_available_blocks() is unified
-        NPUPlatform.empty_cache()
+
+        NPUPlatform.clear_npu_memory()
         return num_npu_blocks, num_cpu_blocks
 
     def initialize_cache(self, num_gpu_blocks: int,
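
The commit folds the cleanup sequence that was duplicated at every call site (gc.collect(), NPUPlatform.empty_cache(), torch.npu.reset_peak_memory_stats()) into a single NPUPlatform.clear_npu_memory() call, which also resolves the removed TODO about unifying the empty_cache behavior. The helper's body is not part of this diff; below is a minimal sketch of what it could look like, assuming it simply bundles the three removed calls and that the torch_npu extension registers the torch.npu namespace as it normally does on Ascend.

import gc

import torch
import torch_npu  # noqa: F401  # assumption: registers torch.npu on Ascend


def clear_npu_memory() -> None:
    """Hypothetical sketch of the consolidated cleanup; in the real code
    this would be a method on NPUPlatform, not a free function."""
    # Drop dead Python references so their NPU blocks become freeable.
    gc.collect()
    # Return cached allocator blocks to the device, so mem_get_info()
    # reports memory that is genuinely free.
    torch.npu.empty_cache()
    # Zero the peak-memory counters so the profiling pass in
    # determine_num_available_blocks() measures from a clean baseline.
    torch.npu.reset_peak_memory_stats()

Centralizing the sequence keeps init_device() and determine_num_available_blocks() in step: both need the allocator drained and the peak statistics reset before reading mem_get_info(), and a single helper makes it harder for one call site to drift from the other.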