[Bug]: vLLM crashes with SIGABRT on Intel Arc B-series (Battlemage) GPUs during model inspection

### Your current environment

vLLM Version: 0.11.0
PyTorch Version: 2.5.1+cxx11.abi
Intel Extension for PyTorch: 2.5.10+xpu
OS: Fedora 42 (kernel 6.16.12-200.fc42.x86_64)
GPU: 2x Intel Arc Pro B60 (24GB each, device ID 0xe211)
Container Runtime: Podman 5.x (rootless)
Base Image: intel/intel-extension-for-pytorch:2.8.10-xpu
Python: 3.11

Driver versions:
- Level-Zero: 1.6.32224.5
- Intel OpenCL: 24.52.32224.5
- Intel Compute Runtime: NEO 24.52.32224

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

# ruff: noqa
# code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

import datetime
import locale
import os
import subprocess
import sys

# NOTE: the upstream PyTorch file was required to stay python2 compliant; this copy already uses Python 3-only syntax (f-strings).
# This script outputs relevant system environment info
# Run it with `python collect_env.py` or `python -m torch.utils.collect_env`
from collections import namedtuple

import regex as re

from vllm.envs import environment_variables

# torch is optional for this script: remember whether the import worked so the
# collectors below can report "N/A" instead of failing.
try:
    import torch

    TORCH_AVAILABLE = True
except (ImportError, NameError, AttributeError, OSError):
    # Any of these import-time failures is treated as "torch not usable".
    TORCH_AVAILABLE = False

# System Environment Information
# One field per value collected by get_env_info(); the same names are used as
# placeholders in the env_info_fmt report template.
SystemEnv = namedtuple(
    "SystemEnv",
    [
        "torch_version",
        "is_debug_build",
        "cuda_compiled_version",
        "gcc_version",
        "clang_version",
        "cmake_version",
        "os",
        "libc_version",
        "python_version",
        "python_platform",
        "is_cuda_available",
        "cuda_runtime_version",
        "cuda_module_loading",
        "nvidia_driver_version",
        "nvidia_gpu_models",
        "cudnn_version",
        "pip_version",  # 'pip' or 'pip3'
        "pip_packages",
        "conda_packages",
        "hip_compiled_version",
        "hip_runtime_version",
        "miopen_runtime_version",
        "caching_allocator_config",
        "is_xnnpack_available",
        "cpu_info",
        "rocm_version",  # vllm specific field
        "vllm_version",  # vllm specific field
        "vllm_build_flags",  # vllm specific field
        "gpu_topo",  # vllm specific field
        "env_vars",
    ],
)

# Substrings used by get_conda_packages() when no patterns are passed: a
# `conda list` line is kept if it contains any of these.
DEFAULT_CONDA_PATTERNS = {
    "torch",
    "numpy",
    "cudatoolkit",
    "soumith",
    "mkl",
    "magma",
    "triton",
    "optree",
    "nccl",
    "transformers",
    "zmq",
    "nvidia",
    "pynvml",
    "flashinfer-python",
}

# Substrings used by get_pip_packages() when no patterns are passed: a
# `pip list` line is kept if it contains any of these.
DEFAULT_PIP_PATTERNS = {
    "torch",
    "numpy",
    "mypy",
    "flake8",
    "triton",
    "optree",
    "onnx",
    "nccl",
    "transformers",
    "zmq",
    "nvidia",
    "pynvml",
    "flashinfer-python",
}


def run(command):
    """Run ``command`` in a subprocess and return ``(return-code, stdout, stderr)``.

    A ``str`` command is executed through the shell; anything else (e.g. a
    list of arguments) is executed directly. Both streams are decoded with
    the platform encoding and stripped of surrounding whitespace, except that
    the output of ``nvidia-smi topo -m`` keeps its leading whitespace.

    If the executable cannot be found, ``(127, "", "Command not found: ...")``
    is returned instead of raising.
    """
    shell = isinstance(command, str)
    try:
        p = subprocess.Popen(
            command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=shell
        )
        raw_output, raw_err = p.communicate()
    except FileNotFoundError:
        cmd_str = command if isinstance(command, str) else command[0]
        return 127, "", f"Command not found: {cmd_str}"

    rc = p.returncode
    if get_platform() == "win32":
        enc = "oem"
    else:
        enc = locale.getpreferredencoding()
    # Tools may emit bytes that are invalid in the preferred encoding; replace
    # them rather than letting a UnicodeDecodeError abort the whole report.
    output = raw_output.decode(enc, errors="replace")
    if command == "nvidia-smi topo -m":
        # don't remove the leading whitespace of `nvidia-smi topo -m`
        #   because they are meaningful
        output = output.rstrip()
    else:
        output = output.strip()
    err = raw_err.decode(enc, errors="replace")
    return rc, output, err.strip()


def run_and_read_all(run_lambda, command):
    """Run ``command`` via ``run_lambda``; return its full stdout, or None on a non-zero return code."""
    returncode, stdout, _stderr = run_lambda(command)
    return stdout if returncode == 0 else None


def run_and_parse_first_match(run_lambda, command, regex):
    """Run ``command`` via ``run_lambda`` and return group 1 of the first ``regex`` match.

    Returns None when the return code is non-zero or nothing matches.
    """
    returncode, stdout, _stderr = run_lambda(command)
    found = re.search(regex, stdout) if returncode == 0 else None
    return None if found is None else found.group(1)


def run_and_return_first_line(run_lambda, command):
    """Run ``command`` via ``run_lambda`` and return the first line of its stdout.

    Returns None on a non-zero return code; empty output yields ``""``.
    """
    returncode, stdout, _stderr = run_lambda(command)
    if returncode == 0:
        return stdout.partition("\n")[0]
    return None


def get_conda_packages(run_lambda, patterns=None):
    """Return the `conda list` lines that mention any of ``patterns``.

    The conda executable is taken from ``CONDA_EXE`` (default ``conda``).
    Comment lines (starting with ``#``) are dropped. Returns None when the
    command does not succeed.
    """
    wanted = DEFAULT_CONDA_PATTERNS if patterns is None else patterns
    conda_exe = os.environ.get("CONDA_EXE", "conda")
    listing = run_and_read_all(run_lambda, [conda_exe, "list"])
    if listing is None:
        return None

    kept = []
    for row in listing.splitlines():
        if row.startswith("#"):
            continue
        if any(name in row for name in wanted):
            kept.append(row)
    return "\n".join(kept)


def get_gcc_version(run_lambda):
    """Return the text following ``gcc `` in `gcc --version`, or None if unavailable."""
    command = "gcc --version"
    pattern = r"gcc (.*)"
    return run_and_parse_first_match(run_lambda, command, pattern)


def get_clang_version(run_lambda):
    """Return the text following ``clang version `` in `clang --version`, or None if unavailable."""
    command = "clang --version"
    pattern = r"clang version (.*)"
    return run_and_parse_first_match(run_lambda, command, pattern)


def get_cmake_version(run_lambda):
    """Return the text following ``cmake `` in `cmake --version`, or None if unavailable."""
    command = "cmake --version"
    pattern = r"cmake (.*)"
    return run_and_parse_first_match(run_lambda, command, pattern)


def get_nvidia_driver_version(run_lambda):
    """Return the NVIDIA driver version string, or None if it cannot be determined.

    On macOS the version is parsed from `kextstat`; elsewhere it is parsed
    from the ``Driver Version:`` field of nvidia-smi's output.
    """
    if get_platform() != "darwin":
        return run_and_parse_first_match(
            run_lambda, get_nvidia_smi(), r"Driver Version: (.*?) "
        )
    kext_cmd = "kextstat | grep -i cuda"
    kext_pattern = r"com[.]nvidia[.]CUDA [(](.*?)[)]"
    return run_and_parse_first_match(run_lambda, kext_cmd, kext_pattern)


def get_gpu_info(run_lambda):
    """Return a description of the installed GPU(s), or None if unavailable.

    On macOS or a HIP build of torch, the name of the current torch device is
    returned (with the GCN arch name appended on HIP builds). Otherwise the
    output of ``nvidia-smi -L`` is returned with the ``(UUID: ...)`` part of
    each line removed.
    """
    if get_platform() == "darwin" or (
        TORCH_AVAILABLE
        and hasattr(torch.version, "hip")
        and torch.version.hip is not None
    ):
        if TORCH_AVAILABLE and torch.cuda.is_available():
            if torch.version.hip is not None:
                prop = torch.cuda.get_device_properties(0)
                if hasattr(prop, "gcnArchName"):
                    gcnArch = " ({})".format(prop.gcnArchName)
                else:
                    gcnArch = "NoGCNArchNameOnOldPyTorch"
            else:
                gcnArch = ""
            return torch.cuda.get_device_name(None) + gcnArch
        return None
    smi = get_nvidia_smi()
    # The parentheses must be escaped so the pattern matches the literal
    # " (UUID: GPU-...)" suffix; the previous pattern used `$` anchors in
    # their place and could never match, so UUIDs leaked into the report.
    uuid_regex = re.compile(r" \(UUID: .+?\)")
    rc, out, _ = run_lambda(smi + " -L")
    if rc != 0:
        return None
    # Anonymize GPUs by removing their UUID
    return re.sub(uuid_regex, "", out)


def get_running_cuda_version(run_lambda):
    """Return the ``V<version>`` value from `nvcc --version`, or None if unavailable."""
    command = "nvcc --version"
    pattern = r"release .+ V(.*)"
    return run_and_parse_first_match(run_lambda, command, pattern)


def get_cudnn_version(run_lambda):
    """Return the cuDNN library path(s) found; it's hard to tell which one is being used.

    Returns a single path, a multi-line "Probably one of the following" list,
    or None when nothing is found. Falls back to the ``CUDNN_LIBRARY``
    environment variable when the lookup command yields nothing usable.
    """
    platform_name = get_platform()
    if platform_name == "win32":
        system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
        cuda_path = os.environ.get("CUDA_PATH", "%CUDA_PATH%")
        where_cmd = os.path.join(system_root, "System32", "where")
        cudnn_cmd = '{} /R "{}\\bin" cudnn*.dll'.format(where_cmd, cuda_path)
    elif platform_name == "darwin":
        # CUDA libraries and drivers can be found in /usr/local/cuda/. See
        # https://docs.nvidia.com/cuda/cuda-installation-guide-mac-os-x/index.html#install
        # https://docs.nvidia.com/deeplearning/sdk/cudnn-install/index.html#installmac
        # Use CUDNN_LIBRARY when cudnn library is installed elsewhere.
        cudnn_cmd = "ls /usr/local/cuda/lib/libcudnn*"
    else:
        cudnn_cmd = 'ldconfig -p | grep libcudnn | rev | cut -d" " -f1 | rev'

    rc, out, _ = run_lambda(cudnn_cmd)
    # Return codes 0 and 1 are both accepted as long as there is output
    # (1 can mean permission errors or "not found").
    if len(out) == 0 or rc not in (0, 1):
        env_library = os.environ.get("CUDNN_LIBRARY")
        if env_library is not None and os.path.isfile(env_library):
            return os.path.realpath(env_library)
        return None

    # Resolve symbolic links first so duplicates collapse, then keep real files.
    resolved = {os.path.realpath(entry) for entry in out.split("\n")}
    # Alphabetize the result because the order is non-deterministic otherwise
    files = sorted(path for path in resolved if os.path.isfile(path))
    if not files:
        return None
    if len(files) == 1:
        return files[0]
    return "Probably one of the following:\n{}".format("\n".join(files))


def get_nvidia_smi():
    """Return the command used to invoke nvidia-smi.

    On Windows the first existing candidate path is returned wrapped in
    double quotes; in every other case the bare name ``nvidia-smi``.
    """
    # Note: nvidia-smi is currently available only on Windows and Linux
    default_smi = "nvidia-smi"
    if get_platform() != "win32":
        return default_smi

    system_root = os.environ.get("SYSTEMROOT", "C:\\Windows")
    program_files_root = os.environ.get("PROGRAMFILES", "C:\\Program Files")
    # The System32 location is checked before the legacy NVSMI directory.
    candidates = (
        os.path.join(system_root, "System32", default_smi),
        os.path.join(program_files_root, "NVIDIA Corporation", "NVSMI", default_smi),
    )
    for candidate_smi in candidates:
        if os.path.exists(candidate_smi):
            return '"{}"'.format(candidate_smi)
    return default_smi


def get_rocm_version(run_lambda):
    """Return the ``HIP version:`` value from `hipcc --version`, or None if unavailable."""
    command = "hipcc --version"
    pattern = r"HIP version: (\S+)"
    return run_and_parse_first_match(run_lambda, command, pattern)


def get_vllm_version():
    """Return the vLLM version, annotated with git sha (and date) for dev builds."""
    from vllm import __version__, __version_tuple__

    if __version__ == "dev":
        return "N/A (dev)"

    local_part = __version_tuple__[-1]
    is_dev_build = isinstance(local_part, str) and local_part.startswith("g")
    if not is_dev_build:
        return __version__

    if "." not in local_part:
        # it's a dev build without local changes
        return f"{__version__} (git sha: {local_part[1:]})"

    # it's a dev build containing local changes; the first and last
    # dot-separated pieces each carry a one-character prefix that is dropped
    pieces = local_part.split(".")
    git_sha = pieces[0][1:]
    date = pieces[-1][1:]
    return f"{__version__} (git sha: {git_sha}, date: {date})"


def summarize_vllm_build_flags():
    """Summarize build-related environment variables in one line.

    Reports ``TORCH_CUDA_ARCH_LIST`` (or "Not Set") and whether ``ROCM_HOME``
    is set to a non-empty value.
    """
    cuda_archs = os.environ.get("TORCH_CUDA_ARCH_LIST", "Not Set")
    rocm_state = "Enabled" if os.environ.get("ROCM_HOME") else "Disabled"
    return "CUDA Archs: {}; ROCm: {}".format(cuda_archs, rocm_state)


def get_gpu_topo(run_lambda):
    """Return GPU topology text on Linux, or None.

    Tries `nvidia-smi topo -m` first and falls back to `rocm-smi --showtopo`.
    """
    if get_platform() != "linux":
        return None
    topo = run_and_read_all(run_lambda, "nvidia-smi topo -m")
    if topo is not None:
        return topo
    return run_and_read_all(run_lambda, "rocm-smi --showtopo")


# example outputs of CPU infos
#  * linux
#    Architecture:            x86_64
#      CPU op-mode(s):        32-bit, 64-bit
#      Address sizes:         46 bits physical, 48 bits virtual
#      Byte Order:            Little Endian
#    CPU(s):                  128
#      On-line CPU(s) list:   0-127
#    Vendor ID:               GenuineIntel
#      Model name:            Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
#        CPU family:          6
#        Model:               106
#        Thread(s) per core:  2
#        Core(s) per socket:  32
#        Socket(s):           2
#        Stepping:            6
#        BogoMIPS:            5799.78
#        Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr
#                             sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl
#                             xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16
#                             pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand
#                             hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced
#                             fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap
#                             avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1
#                             xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq
#                             avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities
#    Virtualization features:
#      Hypervisor vendor:     KVM
#      Virtualization type:   full
#    Caches (sum of all):
#      L1d:                   3 MiB (64 instances)
#      L1i:                   2 MiB (64 instances)
#      L2:                    80 MiB (64 instances)
#      L3:                    108 MiB (2 instances)
#    NUMA:
#      NUMA node(s):          2
#      NUMA node0 CPU(s):     0-31,64-95
#      NUMA node1 CPU(s):     32-63,96-127
#    Vulnerabilities:
#      Itlb multihit:         Not affected
#      L1tf:                  Not affected
#      Mds:                   Not affected
#      Meltdown:              Not affected
#      Mmio stale data:       Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown
#      Retbleed:              Not affected
#      Spec store bypass:     Mitigation; Speculative Store Bypass disabled via prctl and seccomp
#      Spectre v1:            Mitigation; usercopy/swapgs barriers and __user pointer sanitization
#      Spectre v2:            Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence
#      Srbds:                 Not affected
#      Tsx async abort:       Not affected
#  * win32
#    Architecture=9
#    CurrentClockSpeed=2900
#    DeviceID=CPU0
#    Family=179
#    L2CacheSize=40960
#    L2CacheSpeed=
#    Manufacturer=GenuineIntel
#    MaxClockSpeed=2900
#    Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
#    ProcessorType=3
#    Revision=27142
#
#    Architecture=9
#    CurrentClockSpeed=2900
#    DeviceID=CPU1
#    Family=179
#    L2CacheSize=40960
#    L2CacheSpeed=
#    Manufacturer=GenuineIntel
#    MaxClockSpeed=2900
#    Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz
#    ProcessorType=3
#    Revision=27142


def get_cpu_info(run_lambda):
    """Return CPU details from the platform's tool, or its stderr on failure.

    Uses `lscpu` on Linux, `wmic cpu get ...` on Windows and `sysctl` on
    macOS; on any other platform an empty string is returned.
    """
    platform_name = get_platform()
    if platform_name == "linux":
        rc, out, err = run_lambda("lscpu")
    elif platform_name == "win32":
        rc, out, err = run_lambda(
            "wmic cpu get Name,Manufacturer,Family,Architecture,ProcessorType,DeviceID, \
        CurrentClockSpeed,MaxClockSpeed,L2CacheSize,L2CacheSpeed,Revision /VALUE"
        )
    elif platform_name == "darwin":
        rc, out, err = run_lambda("sysctl -n machdep.cpu.brand_string")
    else:
        rc, out, err = 0, "", ""
    return out if rc == 0 else err


def get_platform():
    """Return a normalized platform name.

    ``sys.platform`` is collapsed to ``linux``, ``win32``, ``cygwin`` or
    ``darwin`` when it starts with one of those; otherwise it is returned
    unchanged.
    """
    for known in ("linux", "win32", "cygwin", "darwin"):
        if sys.platform.startswith(known):
            return known
    return sys.platform


def get_mac_version(run_lambda):
    """Return the output of `sw_vers -productVersion`, or None if the command fails."""
    command = "sw_vers -productVersion"
    return run_and_parse_first_match(run_lambda, command, r"(.*)")


def get_windows_version(run_lambda):
    """Return the Windows OS caption reported by wmic, or None if the command fails."""
    system32 = os.path.join(os.environ.get("SYSTEMROOT", "C:\\Windows"), "System32")
    wmic_cmd = os.path.join(system32, "Wbem", "wmic")
    findstr_cmd = os.path.join(system32, "findstr")
    # findstr /v drops the "Caption" header line from wmic's output.
    command = "{} os get Caption | {} /v Caption".format(wmic_cmd, findstr_cmd)
    return run_and_read_all(run_lambda, command)


def get_lsb_version(run_lambda):
    """Return the ``Description:`` value from `lsb_release -a`, or None if unavailable."""
    command = "lsb_release -a"
    pattern = r"Description:\t(.*)"
    return run_and_parse_first_match(run_lambda, command, pattern)


def check_release_file(run_lambda):
    """Return the ``PRETTY_NAME`` value found in /etc/*-release, or None if unavailable."""
    command = "cat /etc/*-release"
    pattern = r'PRETTY_NAME="(.*)"'
    return run_and_parse_first_match(run_lambda, command, pattern)


def get_os(run_lambda):
    """Return a human-readable OS description for the current platform.

    May return None on Windows/macOS when the version command fails; unknown
    platforms return the raw platform name.
    """
    from platform import machine

    platform = get_platform()

    if platform in ("win32", "cygwin"):
        return get_windows_version(run_lambda)

    if platform == "darwin":
        version = get_mac_version(run_lambda)
        if version is not None:
            return "macOS {} ({})".format(version, machine())
        return None

    if platform != "linux":
        # Unknown platform
        return platform

    # Try lsb_release first (Ubuntu/Debian based), then /etc/*-release.
    for probe in (get_lsb_version, check_release_file):
        desc = probe(run_lambda)
        if desc is not None:
            return "{} ({})".format(desc, machine())

    return "{} ({})".format(platform, machine())


def get_python_platform():
    """Return the string produced by ``platform.platform()``."""
    from platform import platform as describe_platform

    return describe_platform()


def get_libc_version():
    """Return ``<lib>-<version>`` from ``platform.libc_ver()`` on Linux, else "N/A"."""
    import platform

    if get_platform() == "linux":
        return "-".join(platform.libc_ver())
    return "N/A"


def is_uv_venv():
    """Return True if the ``UV`` env var is set or ``pyvenv.cfg`` has a ``uv = `` line."""
    if os.environ.get("UV"):
        return True

    cfg_path = os.path.join(sys.prefix, "pyvenv.cfg")
    if not os.path.exists(cfg_path):
        return False

    with open(cfg_path, "r") as handle:
        for line in handle:
            if line.startswith("uv = "):
                return True
    return False


def get_pip_packages(run_lambda, patterns=None):
    """Return ``(pip_version, packages)`` for installed packages matching ``patterns``.

    ``packages`` is the newline-joined subset of `pip list --format=freeze`
    lines containing any pattern, or None if the listing command failed.
    Note: will also find conda-installed pytorch and numpy packages.

    Raises:
        RuntimeError: if neither the pip module nor a uv environment is
            available to produce the listing.
    """
    if patterns is None:
        patterns = DEFAULT_PIP_PATTERNS

    def run_with_pip():
        try:
            import importlib.util

            pip_spec = importlib.util.find_spec("pip")
            pip_available = pip_spec is not None
        except ImportError:
            pip_available = False

        if pip_available:
            cmd = [sys.executable, "-mpip", "list", "--format=freeze"]
        elif is_uv_venv():
            print("uv is set")
            cmd = ["uv", "pip", "list", "--format=freeze"]
        else:
            raise RuntimeError(
                "Could not collect pip list output (pip or uv module not available)"
            )

        out = run_and_read_all(run_lambda, cmd)
        if out is None:
            # The listing command failed; report "unknown" rather than crashing
            # on None.splitlines(). pretty_str renders None as "Could not collect".
            return None
        return "\n".join(
            line for line in out.splitlines() if any(name in line for name in patterns)
        )

    pip_version = "pip3" if sys.version[0] == "3" else "pip"
    out = run_with_pip()
    return pip_version, out


def get_cachingallocator_config():
    """Return the value of ``PYTORCH_CUDA_ALLOC_CONF``, or "" when it is unset."""
    return os.environ.get("PYTORCH_CUDA_ALLOC_CONF", "")


def get_cuda_module_loading_config():
    """Return ``CUDA_MODULE_LOADING`` as seen after ``torch.cuda.init()``, or "N/A" without CUDA."""
    if not (TORCH_AVAILABLE and torch.cuda.is_available()):
        return "N/A"
    # The variable is read only after CUDA has been initialized.
    torch.cuda.init()
    return os.environ.get("CUDA_MODULE_LOADING", "")


def is_xnnpack_available():
    """Return ``str(torch.backends.xnnpack.enabled)``, or "N/A" when torch is unavailable."""
    if not TORCH_AVAILABLE:
        return "N/A"

    import torch.backends.xnnpack

    return str(torch.backends.xnnpack.enabled)  # type: ignore[attr-defined]


def get_env_vars():
    """Return relevant environment variables as ``KEY=VALUE`` lines.

    A variable is reported when its name is listed in ``environment_variables``
    or starts with one of the report prefixes. Variables whose name contains a
    secret-looking term are always skipped. Each variable is emitted once,
    and every line ends with a newline.
    """
    secret_terms = ("secret", "token", "api", "access", "password")
    report_prefix = (
        "TORCH",
        "NCCL",
        "PYTORCH",
        "CUDA",
        "CUBLAS",
        "CUDNN",
        "OMP_",
        "MKL_",
        "NVIDIA",
    )
    lines = []
    for k, v in os.environ.items():
        if any(term in k.lower() for term in secret_terms):
            continue
        # A single combined test: a name satisfying both conditions used to be
        # written twice when they were checked by two independent `if`s.
        if k in environment_variables or k.startswith(report_prefix):
            lines.append("{}={}\n".format(k, v))

    return "".join(lines)


def get_env_info():
    """Collect all environment details and return them as a ``SystemEnv``.

    Values that cannot be determined are left as None or "N/A" by the
    individual collectors; formatting is done separately by ``pretty_str``.
    """
    run_lambda = run
    pip_version, pip_list_output = get_pip_packages(run_lambda)

    if TORCH_AVAILABLE:
        version_str = torch.__version__
        debug_mode_str = str(torch.version.debug)
        cuda_available_str = str(torch.cuda.is_available())
        cuda_version_str = torch.version.cuda
        if (
            not hasattr(torch.version, "hip") or torch.version.hip is None
        ):  # cuda version
            hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"
        else:  # HIP version

            def get_version_or_na(cfg, prefix):
                """Return the last whitespace-separated token of the first line containing ``prefix``, else "N/A"."""
                _lst = [s.rsplit(None, 1)[-1] for s in cfg if prefix in s]
                return _lst[0] if _lst else "N/A"

            # HIP/MIOpen runtime versions are parsed from torch's build config text.
            cfg = torch._C._show_config().split("\n")
            hip_runtime_version = get_version_or_na(cfg, "HIP Runtime")
            miopen_runtime_version = get_version_or_na(cfg, "MIOpen")
            cuda_version_str = "N/A"
            hip_compiled_version = torch.version.hip
    else:
        # Without torch, every torch-derived field is reported as "N/A".
        version_str = debug_mode_str = cuda_available_str = cuda_version_str = "N/A"
        hip_compiled_version = hip_runtime_version = miopen_runtime_version = "N/A"

    sys_version = sys.version.replace("\n", " ")

    conda_packages = get_conda_packages(run_lambda)

    rocm_version = get_rocm_version(run_lambda)
    vllm_version = get_vllm_version()
    vllm_build_flags = summarize_vllm_build_flags()
    gpu_topo = get_gpu_topo(run_lambda)

    return SystemEnv(
        torch_version=version_str,
        is_debug_build=debug_mode_str,
        # sys.maxsize.bit_length() + 1 gives the pointer width (e.g. 63 + 1 = 64).
        python_version="{} ({}-bit runtime)".format(
            sys_version, sys.maxsize.bit_length() + 1
        ),
        python_platform=get_python_platform(),
        is_cuda_available=cuda_available_str,
        cuda_compiled_version=cuda_version_str,
        cuda_runtime_version=get_running_cuda_version(run_lambda),
        cuda_module_loading=get_cuda_module_loading_config(),
        nvidia_gpu_models=get_gpu_info(run_lambda),
        nvidia_driver_version=get_nvidia_driver_version(run_lambda),
        cudnn_version=get_cudnn_version(run_lambda),
        hip_compiled_version=hip_compiled_version,
        hip_runtime_version=hip_runtime_version,
        miopen_runtime_version=miopen_runtime_version,
        pip_version=pip_version,
        pip_packages=pip_list_output,
        conda_packages=conda_packages,
        os=get_os(run_lambda),
        libc_version=get_libc_version(),
        gcc_version=get_gcc_version(run_lambda),
        clang_version=get_clang_version(run_lambda),
        cmake_version=get_cmake_version(run_lambda),
        caching_allocator_config=get_cachingallocator_config(),
        is_xnnpack_available=is_xnnpack_available(),
        cpu_info=get_cpu_info(run_lambda),
        rocm_version=rocm_version,
        vllm_version=vllm_version,
        vllm_build_flags=vllm_build_flags,
        gpu_topo=gpu_topo,
        env_vars=get_env_vars(),
    )


# Report template filled in by pretty_str() via str.format(); every
# {placeholder} below is the name of a SystemEnv field.
env_info_fmt = """
==============================
        System Info
==============================
OS                           : {os}
GCC version                  : {gcc_version}
Clang version                : {clang_version}
CMake version                : {cmake_version}
Libc version                 : {libc_version}

==============================
       PyTorch Info
==============================
PyTorch version              : {torch_version}
Is debug build               : {is_debug_build}
CUDA used to build PyTorch   : {cuda_compiled_version}
ROCM used to build PyTorch   : {hip_compiled_version}

==============================
      Python Environment
==============================
Python version               : {python_version}
Python platform              : {python_platform}

==============================
       CUDA / GPU Info
==============================
Is CUDA available            : {is_cuda_available}
CUDA runtime version         : {cuda_runtime_version}
CUDA_MODULE_LOADING set to   : {cuda_module_loading}
GPU models and configuration : {nvidia_gpu_models}
Nvidia driver version        : {nvidia_driver_version}
cuDNN version                : {cudnn_version}
HIP runtime version          : {hip_runtime_version}
MIOpen runtime version       : {miopen_runtime_version}
Is XNNPACK available         : {is_xnnpack_available}

==============================
          CPU Info
==============================
{cpu_info}

==============================
Versions of relevant libraries
==============================
{pip_packages}
{conda_packages}
""".strip()

# both the above code and the following code use `strip()` to
# remove leading/trailing whitespaces, so we need to add a newline
# in between to separate the two sections
env_info_fmt += "\n\n"

# vLLM-specific sections appended after the sections above.
env_info_fmt += """
==============================
         vLLM Info
==============================
ROCM Version                 : {rocm_version}
vLLM Version                 : {vllm_version}
vLLM Build Flags:
  {vllm_build_flags}
GPU Topology:
  {gpu_topo}

==============================
     Environment Variables
==============================
{env_vars}
""".strip()


def pretty_str(envinfo):
    """Format a ``SystemEnv`` tuple into the final report text.

    None values become "Could not collect", booleans become Yes/No, CUDA
    fields are marked "No CUDA" when torch reports no CUDA and nothing was
    collected, and package listings get a ``[pip…]`` / ``[conda]`` line prefix.
    """

    def replace_nones(dct, replacement="Could not collect"):
        # Only existing keys are reassigned, so iterating items() is safe.
        for key, value in dct.items():
            if value is None:
                dct[key] = replacement
        return dct

    def replace_bools(dct, true="Yes", false="No"):
        # Identity checks keep non-bool values such as 0/1 untouched.
        for key, value in dct.items():
            if value is True:
                dct[key] = true
            elif value is False:
                dct[key] = false
        return dct

    def prepend(text, tag="[prepend]"):
        return "\n".join(tag + line for line in text.split("\n"))

    def replace_if_empty(text, replacement="No relevant packages"):
        if text is None or len(text) != 0:
            return text
        return replacement

    def maybe_start_on_next_line(string):
        # If `string` is multiline, prepend a \n to it.
        if string is None or len(string.split("\n")) <= 1:
            return string
        return "\n{}\n".format(string)

    fields = envinfo._asdict()

    # If nvidia_gpu_models is multiline, start on the next line
    fields["nvidia_gpu_models"] = maybe_start_on_next_line(envinfo.nvidia_gpu_models)

    # If the machine doesn't have CUDA, report some fields as 'No CUDA'
    dynamic_cuda_fields = (
        "cuda_runtime_version",
        "nvidia_gpu_models",
        "nvidia_driver_version",
    )
    nothing_collected = all(fields[name] is None for name in dynamic_cuda_fields)
    if TORCH_AVAILABLE and not torch.cuda.is_available() and nothing_collected:
        for name in dynamic_cuda_fields + ("cudnn_version",):
            fields[name] = "No CUDA"
        if envinfo.cuda_compiled_version is None:
            fields["cuda_compiled_version"] = "None"

    # Replace True with Yes, False with No, then None with 'Could not collect'
    fields = replace_nones(replace_bools(fields))

    # If either package listing is '', replace it with 'No relevant packages'
    for key in ("pip_packages", "conda_packages"):
        fields[key] = replace_if_empty(fields[key])

    # Tag conda and pip packages with a prefix
    # If they were previously None, they'll show up as ie '[conda] Could not collect'
    if fields["pip_packages"]:
        pip_tag = "[{}] ".format(envinfo.pip_version)
        fields["pip_packages"] = prepend(fields["pip_packages"], pip_tag)
    if fields["conda_packages"]:
        fields["conda_packages"] = prepend(fields["conda_packages"], "[conda] ")

    fields["cpu_info"] = envinfo.cpu_info
    return env_info_fmt.format(**fields)


def get_pretty_env_info():
    """Collect the environment info and return it as formatted report text."""
    collected = get_env_info()
    return pretty_str(collected)


def main():
    """Print the environment report, then mention the newest crash minidump.

    The report comes from ``get_pretty_env_info()`` and is written to stdout.
    When torch is importable and exposes ``torch.utils._crash_handler``, and
    the platform is Linux, the default minidump directory is scanned; if it
    contains any entries, a hint naming the most recently changed one (by
    ``os.path.getctime``) is written to stderr.
    """
    print("Collecting environment information...")
    output = get_pretty_env_info()
    print(output)

    if not (
        TORCH_AVAILABLE
        and hasattr(torch, "utils")
        and hasattr(torch.utils, "_crash_handler")
    ):
        return

    minidump_dir = torch.utils._crash_handler.DEFAULT_MINIDUMP_DIR
    # isdir (rather than exists) keeps os.listdir from raising when the path
    # is present but is not a directory.
    if sys.platform != "linux" or not os.path.isdir(minidump_dir):
        return

    dumps = [os.path.join(minidump_dir, dump) for dump in os.listdir(minidump_dir)]
    if not dumps:
        # An empty directory means there is nothing to report; calling max()
        # on an empty list would raise ValueError.
        return

    latest = max(dumps, key=os.path.getctime)
    ctime = os.path.getctime(latest)
    creation_time = datetime.datetime.fromtimestamp(ctime).strftime(
        "%Y-%m-%d %H:%M:%S"
    )
    msg = (
        "\n*** Detected a minidump at {} created on {}, ".format(
            latest, creation_time
        )
        + "if this is related to your bug please include it when you file a report ***"
    )
    print(msg, file=sys.stderr)


# Run the report only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()



### 🐛 Describe the bug

vLLM crashes with SIGABRT during model architecture inspection when attempting to start the OpenAI API server with Intel XPU backend on Intel Arc Pro B60 GPUs. The crash occurs in the Intel Compute Runtime (drm_neo.cpp) during the model inspection subprocess.
Your current environment
vLLM Version: 0.11.0
PyTorch Version: 2.5.1+cxx11.abi
Intel Extension for PyTorch: 2.5.10+xpu
OS: Fedora 42 (kernel 6.16.12-200.fc42.x86_64)
GPU: 2x Intel Arc Pro B60 (24GB each, device ID 0xe211)
Container Runtime: Podman 5.x (rootless)
Base Image: intel/intel-extension-for-pytorch:2.8.10-xpu
Python: 3.11

Driver versions:
- Level-Zero: 1.6.32224.5
- Intel OpenCL: 24.52.32224.5
- Intel Compute Runtime: NEO 24.52.32224
GPU Detection (Working)
GPUs are correctly detected by PyTorch + IPEX:
```bash
$ python3 -c "import torch; import intel_extension_for_pytorch as ipex; \
  print('Device count:', torch.xpu.device_count()); \
  [print(f'Device {i}: {torch.xpu.get_device_name(i)}') for i in range(torch.xpu.device_count())]"

Device count: 2
Device 0: Intel(R) Graphics [0xe211]
Device 1: Intel(R) Graphics [0xe211]
```
To Reproduce

Use container with Intel IPEX and vLLM:

```bash
podman run -d \
  --device=/dev/dri \
  --group-add=video \
  --group-add=render \
  --name vllm-test \
  --network host \
  --shm-size=10g \
  intel/intel-extension-for-pytorch:2.8.10-xpu
```

Install vLLM:

```bash
podman exec vllm-test pip install vllm
```

Start vLLM server:

```bash
python3 -m vllm.entrypoints.openai.api_server \
  --model Qwen/Qwen2.5-0.5B-Instruct \
  --served-model-name qwen2.5 \
  --port 8000 \
  --dtype float16 \
  --max-model-len 2048 \
  --max-num-seqs 128 \
  --max-num-batched-tokens 4096 \
  --trust-remote-code
```

**Error Log**
```
INFO 10-23 09:17:18 [__init__.py:216] Automatically detected platform xpu.
INFO 10-23 09:17:19 [api_server.py:1839] vLLM API server version 0.11.0
ERROR 10-23 09:17:27 [registry.py:548] Error in inspecting model architecture 'Qwen2ForCausalLM'
ERROR 10-23 09:17:27 [registry.py:548] Traceback (most recent call last):
ERROR 10-23 09:17:27 [registry.py:548]   File "/usr/local/lib/python3.11/dist-packages/vllm/model_executor/models/registry.py", line 966, in _run_in_subprocess
ERROR 10-23 09:17:27 [registry.py:548]     returned.check_returncode()
ERROR 10-23 09:17:27 [registry.py:548] subprocess.CalledProcessError: Command '['/usr/local/bin/python3', '-m', 'vllm.model_executor.models.registry']' died with <Signals.SIGABRT: 6>.

RuntimeError: Error raised in subprocess:
Abort was called at 433 line in file:
./shared/source/os_interface/linux/drm_neo.cpp
```
Expected behavior
vLLM should successfully inspect the model architecture and start the API server using Intel XPU devices, similar to how it works on NVIDIA GPUs.
Additional context

Works: PyTorch inference directly with IPEX (manual model loading works)
Works: Ollama with Intel GPU support successfully runs models on this hardware
Fails: vLLM's model registry inspection subprocess crashes before model loading

The crash happens specifically during the subprocess spawned by vLLM's model registry (vllm.model_executor.models.registry). The error originates from Intel's Compute Runtime (drm_neo.cpp line 433).
Questions

Does vLLM's XPU backend officially support Intel Arc B-series (Battlemage) GPUs?
Are there known issues with vLLM's subprocess-based model inspection on Intel XPU?
Is there a workaround to skip or modify the model inspection process for Intel XPU?
Should this be reported to Intel's Compute Runtime team instead?

System Details
```bash
# Host has 3 GPUs visible
$ ls -la /dev/dri/by-path/
pci-0000:04:00.0-card -> ../card0  # Arc Pro B60 #1
pci-0000:00:02.0-card -> ../card1  # iGPU (Arrow Lake)
pci-0000:09:00.0-card -> ../card2  # Arc Pro B60 #2

# Container sees only the dGPUs (desired behavior)
$ python3 -c "import torch; print(torch.xpu.device_count())"
2
```

### Before submitting a new issue...

- [x] Make sure you already searched for relevant issues, and asked the chatbot living at the bottom right corner of the [documentation page](https://docs.vllm.ai/en/latest/), which can answer lots of frequently asked questions.

Uh oh!

[Bug]: vLLM crashes with SIGABRT on Intel Arc B-series (Battlemage) GPUs during model inspection #27408

Description

Your current environment

SPDX-License-Identifier: Apache-2.0

SPDX-FileCopyrightText: Copyright contributors to the vLLM project

ruff: noqa

code borrowed from https://github.com/pytorch/pytorch/blob/main/torch/utils/collect_env.py

Unlike the rest of the PyTorch this file must be python2 compliant.

This script outputs relevant system environment info

Run it with python collect_env.py or python -m torch.utils.collect_env

System Environment Information

example outputs of CPU infos

* linux

Architecture: x86_64

CPU op-mode(s): 32-bit, 64-bit

Address sizes: 46 bits physical, 48 bits virtual

Byte Order: Little Endian

CPU(s): 128

On-line CPU(s) list: 0-127

Vendor ID: GenuineIntel

Model name: Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz

CPU family: 6

Model: 106

Thread(s) per core: 2

Core(s) per socket: 32

Socket(s): 2

Stepping: 6

BogoMIPS: 5799.78

Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr

sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc arch_perfmon rep_good nopl

xtopology nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq monitor ssse3 fma cx16

pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand

hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp ibrs_enhanced

fsgsbase tsc_adjust bmi1 avx2 smep bmi2 erms invpcid avx512f avx512dq rdseed adx smap

avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1

xsaves wbnoinvd ida arat avx512vbmi pku ospke avx512_vbmi2 gfni vaes vpclmulqdq

avx512_vnni avx512_bitalg tme avx512_vpopcntdq rdpid md_clear flush_l1d arch_capabilities

Virtualization features:

Hypervisor vendor: KVM

Virtualization type: full

Caches (sum of all):

L1d: 3 MiB (64 instances)

L1i: 2 MiB (64 instances)

L2: 80 MiB (64 instances)

L3: 108 MiB (2 instances)

NUMA:

NUMA node(s): 2

NUMA node0 CPU(s): 0-31,64-95

NUMA node1 CPU(s): 32-63,96-127

Vulnerabilities:

Itlb multihit: Not affected

L1tf: Not affected

Mds: Not affected

Meltdown: Not affected

Mmio stale data: Vulnerable: Clear CPU buffers attempted, no microcode; SMT Host state unknown

Retbleed: Not affected

Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp

Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization

Spectre v2: Mitigation; Enhanced IBRS, IBPB conditional, RSB filling, PBRSB-eIBRS SW sequence

Srbds: Not affected

Tsx async abort: Not affected

* win32

Architecture=9

CurrentClockSpeed=2900

DeviceID=CPU0

Family=179

L2CacheSize=40960

L2CacheSpeed=

Manufacturer=GenuineIntel

MaxClockSpeed=2900

Name=Intel(R) Xeon(R) Platinum 8375C CPU @ 2.90GHz

ProcessorType=3

Revision=27142

Architecture=9

CurrentClockSpeed=2900

DeviceID=CPU1

Family=179

L2CacheSize=40960

L2CacheSpeed=

Run it with `python collect_env.py` or `python -m torch.utils.collect_env`

==============================
PyTorch Info

==============================
Python Environment

==============================
CUDA / GPU Info

==============================
CPU Info

==============================
Versions of relevant libraries

both the above code and the following code use `strip()` to

==============================
Environment Variables