ri938 · ri938 · Aug 24, 2023 · Aug 24, 2023
diff --git a/vllm/model_executor/layers/quant.py b/vllm/model_executor/layers/quant.py
@@ -5,11 +5,10 @@
 
 
 try:
-    import awq_inference_engine  # with CUDA kernels
+    import awq_inference_engine  # AWQ CUDA kernels extension; treated as optional at import time
+    KERNELS_INSTALLED = True  # extension imported successfully
 except ImportError as ex:
-    raise ImportError(
-        "Unable to import awq_inference_engine: run setup.py"
-        " to install AWQ CUDA kernels")
+    KERNELS_INSTALLED = False  # do not fail at module import; __init__ raises ImportError when this is False
 
 
 class ScaledActivation(nn.Module):
@@ -34,6 +33,11 @@ def __init__(
         ):
         super().__init__()
 
+        if not KERNELS_INSTALLED:
+            raise ImportError(
+                "Unable to import awq_ext: run setup.py"
+                " to install AWQ CUDA kernels")
+
         if w_bit not in [4]:
             raise NotImplementedError("Only 4-bit are supported for now.")