diff --git a/packaging/env_var_script_linux.sh b/packaging/env_var_script_linux.sh
index 3d3394fbd5..b9abe46d24 100644
--- a/packaging/env_var_script_linux.sh
+++ b/packaging/env_var_script_linux.sh
@@ -17,3 +17,7 @@ TORCH_CUDA_ARCH_LIST="8.0;8.6"
 if [[ ${CU_VERSION:-} == "cu124" ]]; then
   TORCH_CUDA_ARCH_LIST="${TORCH_CUDA_ARCH_LIST};9.0"
 fi
+
+# Ensure pip does not use PEP 517 build isolation so that pre-installed
+# tools from pre_build_script.sh (setuptools, wheel) are visible to the build.
+export PIP_NO_BUILD_ISOLATION=1
diff --git a/packaging/post_build_script.sh b/packaging/post_build_script.sh
index d47aacd339..ab342d4e8c 100644
--- a/packaging/post_build_script.sh
+++ b/packaging/post_build_script.sh
@@ -10,11 +10,14 @@ set -eux
 # Prepare manywheel, only for CUDA.
 # The wheel is a pure python wheel for other platforms.
 if [[ "$CU_VERSION" == cu* ]]; then
-    WHEEL_NAME=$(ls dist/)
     pushd dist
+    # Determine the original wheel produced by the build (there should be exactly one)
+    ORIG_WHEEL=$(ls -1 *.whl | head -n 1)
     manylinux_plat=manylinux_2_28_x86_64
-    auditwheel repair --plat "$manylinux_plat" -w . \
+    # Only run auditwheel if the wheel contains at least one shared object (.so)
+    if unzip -l "$ORIG_WHEEL" | awk '{print $4}' | grep -E '\.so($|\.)' >/dev/null 2>&1; then
+        auditwheel repair --plat "$manylinux_plat" -w . \
         --exclude libtorch.so \
         --exclude libtorch_python.so \
         --exclude libtorch_cuda.so \
@@ -23,15 +26,16 @@ if [[ "$CU_VERSION" == cu* ]]; then
         --exclude libc10_cuda.so \
         --exclude libcuda.so.* \
         --exclude libcudart.so.* \
-        "${WHEEL_NAME}"
+        "${ORIG_WHEEL}"
+    else
+        echo "No shared libraries detected in wheel ${ORIG_WHEEL}; skipping auditwheel."
+    fi
     ls -lah .
-    # Clean up the linux_x86_64 wheel
-    rm "${WHEEL_NAME}"
     popd
 fi

-MANYWHEEL_NAME=$(ls dist/)
-# Try to install the new wheel
-pip install "dist/${MANYWHEEL_NAME}"
+# Try to install the new wheel (pick the most recent wheel file)
+INSTALL_WHEEL=$(ls -1t dist/*.whl | head -n 1)
+pip install "${INSTALL_WHEEL}"

 python -c "import torchao"
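Note on the shared-object check above: `unzip -l` puts file names in the fourth column, and inside single quotes the regex needs a single backslash ('\.so($|\.)') to match both bare and versioned shared objects. For reference only, the same check can be expressed in Python with the standard zipfile module; this is an illustrative sketch with a hypothetical helper name, not part of the patch:

    # Sketch: detect shared objects inside a wheel, mirroring the
    # unzip/awk/grep pipeline in post_build_script.sh. Illustrative only.
    import re
    import sys
    import zipfile

    def wheel_has_shared_objects(wheel_path: str) -> bool:
        # Match "foo.so" as well as versioned names such as "foo.so.1.2".
        pattern = re.compile(r"\.so($|\.)")
        with zipfile.ZipFile(wheel_path) as wheel:
            return any(pattern.search(name) for name in wheel.namelist())

    if __name__ == "__main__":
        print(wheel_has_shared_objects(sys.argv[1]))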
diff --git a/setup.py b/setup.py
index fd4ee9f40f..73217dca65 100644
--- a/setup.py
+++ b/setup.py
@@ -13,6 +13,7 @@ from typing import List, Optional

 from setuptools import Extension, find_packages, setup
+from setuptools.command.build_ext import build_ext as _setuptools_build_ext

 current_date = datetime.now().strftime("%Y%m%d")

@@ -98,16 +99,7 @@ def use_debug_mode():
     return os.getenv("DEBUG", "0") == "1"


-import torch
-from torch.utils.cpp_extension import (
-    CUDA_HOME,
-    IS_WINDOWS,
-    ROCM_HOME,
-    BuildExtension,
-    CppExtension,
-    CUDAExtension,
-    _get_cuda_arch_flags,
-)
+# Heavy imports (torch, torch.utils.cpp_extension) are deferred to build time


 class BuildOptions:
@@ -139,6 +131,8 @@ def __init__(self):
             "TORCHAO_BUILD_EXPERIMENTAL_MPS", default=False
         )
         if self.build_experimental_mps:
+            import torch  # Lazy import
+
             assert is_macos, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires macOS"
             assert is_arm64, "TORCHAO_BUILD_EXPERIMENTAL_MPS requires arm64"
             assert torch.mps.is_available(), (
@@ -260,10 +254,23 @@ def get_cuda_version_from_nvcc():
     return None


+def is_nvcc_available():
+    """Check if nvcc is available on the system."""
+    try:
+        subprocess.check_output(["nvcc", "--version"], stderr=subprocess.STDOUT)
+        return True
+    except (OSError, subprocess.CalledProcessError):
+        return False
+
+
 def get_cutlass_build_flags():
     """Determine which CUTLASS kernels to build based on CUDA version.

     SM90a: CUDA 12.6+, SM100a: CUDA 12.8+
     """
+    # Lazy import torch and its helper; only needed when building CUDA extensions
+    import torch
+    from torch.utils.cpp_extension import _get_cuda_arch_flags
+
     # Try nvcc then torch version
     cuda_version = get_cuda_version_from_nvcc() or torch.version.cuda
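With the fallback introduced here, the effective rule is: build CUDA extensions when torch reports a CUDA build and either CUDA_HOME is set or nvcc is on PATH. A standalone sketch of that rule, assuming torch is installed; `shutil.which` stands in for spawning `nvcc --version`, and reading CUDA_HOME from the environment is a simplification of torch's own lookup:

    # Sketch of the CUDA-toolkit detection rule; illustrative only.
    import os
    import shutil

    def should_build_cuda_extensions() -> bool:
        import torch  # deferred, matching the patched setup.py

        if not torch.version.cuda:  # CPU-only or ROCm build of torch
            return False
        # CUDA_HOME may be unset in CI even though nvcc is on PATH.
        return os.getenv("CUDA_HOME") is not None or shutil.which("nvcc") is not None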
@@ -290,64 +297,77 @@ def get_cutlass_build_flags():
     )


-# BuildExtension is a subclass of from setuptools.command.build_ext.build_ext
-class TorchAOBuildExt(BuildExtension):
-    def __init__(self, *args, **kwargs) -> None:
-        super().__init__(*args, **kwargs)
-
-    def build_extensions(self):
-        cmake_extensions = [
-            ext for ext in self.extensions if isinstance(ext, CMakeExtension)
-        ]
-        other_extensions = [
-            ext for ext in self.extensions if not isinstance(ext, CMakeExtension)
-        ]
-        for ext in cmake_extensions:
-            self.build_cmake(ext)
-
-        # Use BuildExtension to build other extensions
-        self.extensions = other_extensions
-        super().build_extensions()
-
-        self.extensions = other_extensions + cmake_extensions
-
-    def build_cmake(self, ext):
-        extdir = os.path.abspath(os.path.dirname(self.get_ext_fullpath(ext.name)))
-
-        if not os.path.exists(self.build_temp):
-            os.makedirs(self.build_temp)
-
-        # Get the expected extension file name that Python will look for
-        # We force CMake to use this library name
-        ext_filename = os.path.basename(self.get_ext_filename(ext.name))
-        ext_basename = os.path.splitext(ext_filename)[0]
-
-        print(
-            "CMAKE COMMANG",
-            [
-                "cmake",
-                ext.cmake_lists_dir,
-            ]
-            + ext.cmake_args
-            + [
-                "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
-                "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
-            ],
-        )
-
+class LazyTorchAOBuildExt(_setuptools_build_ext):
+    def run(self):
+        # Import heavy torch build only when actually running build_ext
+        from torch.utils.cpp_extension import BuildExtension as _BuildExtension
+
+        class _TorchAOBuildExt(_BuildExtension):
+            def run(self_inner):
+                if os.getenv("USE_CPP", "1") != "0":
+                    check_submodules()
+                if not self_inner.distribution.ext_modules:
+                    self_inner.distribution.ext_modules = get_extensions()
+                super(_TorchAOBuildExt, self_inner).run()
+
+            def build_extensions(self_inner):
+                cmake_extensions = [
+                    ext
+                    for ext in self_inner.extensions
+                    if isinstance(ext, CMakeExtension)
+                ]
+                other_extensions = [
+                    ext
+                    for ext in self_inner.extensions
+                    if not isinstance(ext, CMakeExtension)
+                ]
+                for ext in cmake_extensions:
+                    self_inner.build_cmake(ext)
+
+                self_inner.extensions = other_extensions
+                super(_TorchAOBuildExt, self_inner).build_extensions()
+                self_inner.extensions = other_extensions + cmake_extensions
+
+            def build_cmake(self_inner, ext):
+                extdir = os.path.abspath(
+                    os.path.dirname(self_inner.get_ext_fullpath(ext.name))
+                )
+                if not os.path.exists(self_inner.build_temp):
+                    os.makedirs(self_inner.build_temp)
+                ext_filename = os.path.basename(self_inner.get_ext_filename(ext.name))
+                ext_basename = os.path.splitext(ext_filename)[0]
+                if os.getenv("VERBOSE_BUILD", "0") == "1" or use_debug_mode():
+                    print(
+                        "CMAKE COMMAND",
+                        [
+                            "cmake",
+                            ext.cmake_lists_dir,
+                        ]
+                        + ext.cmake_args
+                        + [
+                            "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
+                            "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
+                        ],
+                    )
+                subprocess.check_call(
+                    [
+                        "cmake",
+                        ext.cmake_lists_dir,
+                    ]
+                    + ext.cmake_args
+                    + [
+                        "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
+                        "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
+                    ],
+                    cwd=self_inner.build_temp,
+                )
+                subprocess.check_call(
+                    ["cmake", "--build", "."], cwd=self_inner.build_temp
+                )

-        subprocess.check_call(
-            [
-                "cmake",
-                ext.cmake_lists_dir,
-            ]
-            + ext.cmake_args
-            + [
-                "-DCMAKE_LIBRARY_OUTPUT_DIRECTORY=" + extdir,
-                "-DTORCHAO_CMAKE_EXT_SO_NAME=" + ext_basename,
-            ],
-            cwd=self.build_temp,
-        )
-        subprocess.check_call(["cmake", "--build", "."], cwd=self.build_temp)
+        # Morph this instance into the real BuildExtension subclass and run
+        self.__class__ = _TorchAOBuildExt
+        return _TorchAOBuildExt.run(self)


 class CMakeExtension(Extension):
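The `self.__class__` reassignment is the key move in LazyTorchAOBuildExt: setuptools resolves the cmdclass when setup.py is imported, so the class visible at import time must not pull in torch; the torch-aware subclass is created and swapped in only once the command actually runs. A minimal, self-contained sketch of the pattern (class names here are illustrative, not the patch's code):

    # Minimal sketch of a lazy build_ext cmdclass: the heavy import
    # happens only when the command executes, not at setup.py import time.
    from setuptools.command.build_ext import build_ext

    class LazyBuildExt(build_ext):
        def run(self):
            # Heavy dependency imported only now.
            from torch.utils.cpp_extension import BuildExtension

            class _RealBuildExt(BuildExtension):
                pass  # custom steps (e.g. CMake handling) would go here

            # Rebind this instance to the torch-aware class and dispatch to it.
            self.__class__ = _RealBuildExt
            return _RealBuildExt.run(self)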
@@ -371,8 +391,22 @@ def get_extensions():
     if debug_mode:
         print("Compiling in debug mode")

-    if CUDA_HOME is None and torch.version.cuda:
-        print("CUDA toolkit is not available. Skipping compilation of CUDA extensions")
+    # Heavy imports moved here to minimize setup.py import overhead
+    import torch
+    from torch.utils.cpp_extension import (
+        CUDA_HOME,
+        IS_WINDOWS,
+        ROCM_HOME,
+        CppExtension,
+        CUDAExtension,
+    )
+
+    # Only skip CUDA extensions if neither CUDA_HOME nor nvcc is available.
+    # In many CI environments CUDA_HOME may be unset even though nvcc is on PATH.
+    if torch.version.cuda and CUDA_HOME is None and not is_nvcc_available():
+        print(
+            "CUDA toolkit is not available (CUDA_HOME unset and nvcc not found). Skipping compilation of CUDA extensions"
+        )
         print(
             "If you'd like to compile CUDA extensions locally please install the cudatoolkit from https://anaconda.org/nvidia/cuda-toolkit"
         )
@@ -380,7 +414,10 @@ def get_extensions():
         print("ROCm is not available. Skipping compilation of ROCm extensions")
         print("If you'd like to compile ROCm extensions locally please install ROCm")

-    use_cuda = torch.version.cuda and CUDA_HOME is not None
+    # Build CUDA extensions if CUDA is available and either CUDA_HOME is set or nvcc is present
+    use_cuda = bool(torch.version.cuda) and (
+        CUDA_HOME is not None or is_nvcc_available()
+    )
     use_rocm = torch.version.hip and ROCM_HOME is not None
     extension = CUDAExtension if (use_cuda or use_rocm) else CppExtension

@@ -452,11 +489,13 @@ def get_extensions():
         found_col16 = False
         found_vec_ext = False
         found_outer_vec = False
-        print("ROCM_HOME", ROCM_HOME)
+        if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+            print("ROCM_HOME", ROCM_HOME)
         hipblaslt_headers = list(
             glob.glob(os.path.join(ROCM_HOME, "include", "hipblaslt", "hipblaslt.h"))
         )
-        print("hipblaslt_headers", hipblaslt_headers)
+        if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+            print("hipblaslt_headers", hipblaslt_headers)
         for header in hipblaslt_headers:
             with open(header) as f:
                 text = f.read()
@@ -468,17 +507,22 @@ def get_extensions():
                     found_outer_vec = True
         if found_col16:
             extra_compile_args["cxx"].append("-DHIPBLASLT_HAS_ORDER_COL16")
-            print("hipblaslt found extended col order enums")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt found extended col order enums")
         else:
-            print("hipblaslt does not have extended col order enums")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt does not have extended col order enums")
         if found_outer_vec:
             extra_compile_args["cxx"].append("-DHIPBLASLT_OUTER_VEC")
-            print("hipblaslt found outer vec")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt found outer vec")
         elif found_vec_ext:
             extra_compile_args["cxx"].append("-DHIPBLASLT_VEC_EXT")
-            print("hipblaslt found vec ext")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt found vec ext")
         else:
-            print("hipblaslt does not have vec ext")
+            if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+                print("hipblaslt does not have vec ext")

     # Get base directory and source paths
     curdir = os.path.dirname(os.path.curdir)
@@ -641,7 +685,8 @@ def get_extensions():
     ext_modules = []
     if len(sources) > 0:
-        print("SOURCES", sources)
+        if os.getenv("VERBOSE_BUILD", "0") == "1" or debug_mode:
+            print("SOURCES", sources)
         # Double-check to ensure mx_fp_cutlass_kernels.cu is not in sources
         sources = [
             s for s in sources if os.path.basename(s) != "mx_fp_cutlass_kernels.cu"
@@ -735,9 +780,13 @@ def get_extensions():
     def bool_to_on_off(value):
         return "ON" if value else "OFF"

-    from distutils.sysconfig import get_python_lib
+    import importlib.util

-    torch_dir = get_python_lib() + "/torch/share/cmake/Torch"
+    spec = importlib.util.find_spec("torch")
+    if spec is None or spec.origin is None:
+        raise RuntimeError("Unable to locate 'torch' package for CMake config")
+    torch_pkg_dir = os.path.dirname(spec.origin)
+    torch_dir = os.path.join(torch_pkg_dir, "share", "cmake", "Torch")

     ext_modules.append(
         CMakeExtension(
@@ -762,24 +811,23 @@ def bool_to_on_off(value):
     return ext_modules


-# Only check submodules if we're going to build C++ extensions
-if use_cpp != "0":
-    check_submodules()
+# Defer submodule checks to build time via build_ext

 setup(
     name="torchao",
     version=version + version_suffix,
-    packages=find_packages(exclude=["benchmarks", "benchmarks.*"]),
+    packages=find_packages(include=["torchao*"]),
     include_package_data=True,
     package_data={
         "torchao.kernel.configs": ["*.pkl"],
     },
-    ext_modules=get_extensions(),
+    # Defer extension discovery to build time for performance
+    ext_modules=[],
     extras_require={"dev": read_requirements("dev-requirements.txt")},
     description="Package for applying ao techniques to GPU models",
     long_description=open("README.md", encoding="utf-8").read(),
     long_description_content_type="text/markdown",
     url="https://github.com/pytorch/ao",
-    cmdclass={"build_ext": TorchAOBuildExt},
+    cmdclass={"build_ext": LazyTorchAOBuildExt},
     options={"bdist_wheel": {"py_limited_api": "cp39"}},
 )
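Even though `ext_modules=[]` at declaration time, installs still build the extensions: the lazy build_ext populates `distribution.ext_modules` via `get_extensions()` inside `run()`. The new importlib-based lookup of torch's bundled CMake config can also be sanity-checked outside a build; a small snippet assuming torch is importable in the current environment:

    # Sketch: resolve torch's bundled CMake config the same way the
    # patched setup.py does, to verify the path outside a build.
    import importlib.util
    import os

    spec = importlib.util.find_spec("torch")
    if spec is None or spec.origin is None:
        raise RuntimeError("torch is not importable in this environment")
    torch_dir = os.path.join(os.path.dirname(spec.origin), "share", "cmake", "Torch")
    print(torch_dir, os.path.isdir(torch_dir))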