diff --git a/csrc/cpu/dnnl_kernels.cpp b/csrc/cpu/dnnl_kernels.cpp
index 9a3af4ac9d8a..1c42a75bc2d6 100644
--- a/csrc/cpu/dnnl_kernels.cpp
+++ b/csrc/cpu/dnnl_kernels.cpp
@@ -523,7 +523,7 @@ void onednn_mm(torch::Tensor& c,  // [M, OC], row-major
   CPU_KERNEL_GUARD_IN(onednn_mm)
   TORCH_CHECK(a.dim() == 2);
   TORCH_CHECK(a.stride(-1) == 1);
-  TORCH_CHECK(c.is_contiguous());
+  TORCH_CHECK(c.stride(-1) == 1);
 
   MatMulPrimitiveHandler* ptr =
       reinterpret_cast<MatMulPrimitiveHandler*>(handler);
diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py
index c5b6d91a62b6..544e091491bf 100644
--- a/vllm/platforms/cpu.py
+++ b/vllm/platforms/cpu.py
@@ -185,6 +185,11 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
             parallel_config.distributed_executor_backend = "mp"
         if parallel_config.worker_cls == "auto":
             parallel_config.worker_cls = "vllm.v1.worker.cpu_worker.CPUWorker"
+        # Disable DBO
+        if parallel_config.enable_dbo:
+            logger.warning(
+                "Dual-Batch Overlap is not supported on CPU, disabled.")
+            parallel_config.enable_dbo = False
 
         # Note: workaround for v1 gpu_model_runner
         from vllm.config import CompilationLevel
diff --git a/vllm/v1/worker/cpu_model_runner.py b/vllm/v1/worker/cpu_model_runner.py
index ccdbeac64bce..cd0f0af43e7e 100644
--- a/vllm/v1/worker/cpu_model_runner.py
+++ b/vllm/v1/worker/cpu_model_runner.py
@@ -145,12 +145,20 @@ def __init__(self, *args, **kwargs) -> None:
             self.record = lambda: None
             self.synchronize = lambda: None
 
+    class _StreamPlaceholder:
+
+        def __init__(self, *args, **kwargs) -> None:
+            pass
+
     cuda_event = torch.cuda.Event
+    cuda_stream = torch.cuda.Stream
     try:
         torch.cuda.Event = _EventPlaceholder
+        torch.cuda.Stream = _StreamPlaceholder
         yield
     finally:
         torch.cuda.Event = cuda_event
+        torch.cuda.Stream = cuda_stream
 
 
 @contextmanager