@@ -17,13 +17,11 @@
 # Adapted from vllm-project/vllm/vllm/worker/worker.py
 #
 
-import gc
 import os
 from typing import Dict, List, Optional, Set, Tuple, Type, Union
 
 import msgpack  # type: ignore
 import torch
-import torch.distributed
 import zmq
 from torch import nn
 from vllm import envs
@@ -209,9 +207,7 @@ def init_device(self) -> None:
         if self.device_config.device.type == "npu":
             self.device = torch.device(f"npu:{self.local_rank}")
             NPUPlatform.set_device(self.device)
-            gc.collect()
-            NPUPlatform.empty_cache()
-            torch.npu.reset_peak_memory_stats()
+            NPUPlatform.clear_npu_memory()
             self.init_npu_memory = NPUPlatform.mem_get_info()[0]
         else:
             raise RuntimeError(
@@ -278,9 +274,7 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
278274 """
279275 # Profile the memory usage of the model and get the maximum number of
280276 # cache blocks that can be allocated with the remaining free memory.
281- gc .collect ()
282- NPUPlatform .empty_cache ()
283- torch .npu .reset_peak_memory_stats ()
277+ NPUPlatform .clear_npu_memory ()
284278
285279 # Execute a forward pass with dummy inputs to profile the memory usage
286280 # of the model.
@@ -306,10 +300,8 @@ def determine_num_available_blocks(self) -> Tuple[int, int]:
             cache_block_size)
         num_npu_blocks = max(num_npu_blocks, 0)
         num_cpu_blocks = max(num_cpu_blocks, 0)
-        gc.collect()
-        # TODO: don't need to implement this func after empty_cache in
-        # Worker.determine_num_available_blocks() is unified
-        NPUPlatform.empty_cache()
+
+        NPUPlatform.clear_npu_memory()
         return num_npu_blocks, num_cpu_blocks
 
     def initialize_cache(self, num_gpu_blocks: int,
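
The commit folds the cleanup sequence that was duplicated at every call site (gc.collect(), NPUPlatform.empty_cache(), torch.npu.reset_peak_memory_stats()) into a single NPUPlatform.clear_npu_memory() call, which also resolves the removed TODO about unifying the empty_cache behavior. The helper's body is not part of this diff; below is a minimal sketch of what it could look like, assuming it simply bundles the three removed calls and that the torch_npu extension registers the torch.npu namespace as it normally does on Ascend.

import gc

import torch
import torch_npu  # noqa: F401  # assumption: registers torch.npu on Ascend


def clear_npu_memory() -> None:
    """Hypothetical sketch of the consolidated cleanup; in the real code
    this would be a method on NPUPlatform, not a free function."""
    # Drop dead Python references so their NPU blocks become freeable.
    gc.collect()
    # Return cached allocator blocks to the device, so mem_get_info()
    # reports memory that is genuinely free.
    torch.npu.empty_cache()
    # Zero the peak-memory counters so the profiling pass in
    # determine_num_available_blocks() measures from a clean baseline.
    torch.npu.reset_peak_memory_stats()

Centralizing the sequence keeps init_device() and determine_num_available_blocks() in step: both need the allocator drained and the peak statistics reset before reading mem_get_info(), and a single helper makes it harder for one call site to drift from the other.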