4 changes: 3 additions & 1 deletion vllm/v1/worker/gpu_worker.py
@@ -205,7 +205,9 @@ def determine_available_memory(self) -> int:
         )["allocated_bytes.all.current"]
         total_allocated_bytes = torch.cuda.mem_get_info(
         )[1] - torch.cuda.mem_get_info()[0]
-        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
+        init_allocated = total_gpu_memory - self.init_gpu_memory
+        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes \
+            - init_allocated
         if non_torch_allocations > 0:
             peak_memory += non_torch_allocations
         available_kv_cache_memory = (
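The removed line charged every non-torch byte on the device to vLLM. The added lines subtract memory that was already allocated before the worker initialized (init_allocated = total_gpu_memory - self.init_gpu_memory), so allocations from other processes or an earlier run are no longer counted against the KV-cache budget. A minimal arithmetic sketch of the accounting, using made-up byte counts and assuming the elided budget line follows the pattern gpu_memory_utilization * total_gpu_memory - peak_memory (not fully visible in the hunk):

    # Sketch of the memory accounting after this change; all values are
    # illustrative, not measurements from a real vLLM run.
    GiB = 1024 ** 3

    total_gpu_memory = 80 * GiB       # torch.cuda.mem_get_info()[1]
    free_now = 50 * GiB               # torch.cuda.mem_get_info()[0] after profiling
    init_gpu_memory = 78 * GiB        # free memory recorded when the worker started
    torch_allocated_bytes = 24 * GiB  # torch.cuda.memory_stats()["allocated_bytes.all.current"]
    peak_memory = 26 * GiB            # peak torch usage seen during profiling
    gpu_memory_utilization = 0.9      # assumed cache_config value

    total_allocated_bytes = total_gpu_memory - free_now   # 30 GiB in use overall
    init_allocated = total_gpu_memory - init_gpu_memory   # 2 GiB used before this worker started
    non_torch_allocations = (total_allocated_bytes
                             - torch_allocated_bytes
                             - init_allocated)             # 4 GiB (e.g. NCCL buffers, CUDA context)

    if non_torch_allocations > 0:
        peak_memory += non_torch_allocations               # 30 GiB charged to this worker

    available_kv_cache_memory = (
        total_gpu_memory * gpu_memory_utilization - peak_memory)  # 72 - 30 = 42 GiB

    print(f"KV cache budget: {available_kv_cache_memory / GiB:.1f} GiB")

Without the subtraction, the 2 GiB of pre-existing allocations would inflate non_torch_allocations and shrink the KV-cache budget by the same amount.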