
[Bug]: RuntimeError: CUDA error: invalid device ordinal with multi node multi gpus  #3722

@kn1011


Your current environment

vLLM 0.3.3 on a Ray 2.10.0 cluster, deployed via Docker on 2 nodes with 2 GPUs (Tesla T4) each.
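A quick way to double-check that the Ray cluster really exposes all four GPUs (a minimal sketch; it assumes the cluster shown below is already running and is executed from the head-node container):

```python
# Minimal sketch: confirm the 2-node Ray cluster sees 4 GPUs in total.
import ray

ray.init(address="auto")                      # attach to the running cluster
print(ray.cluster_resources().get("GPU"))     # expected: 4.0 (2 nodes x 2 Tesla T4s)
print(ray.available_resources().get("GPU"))   # GPUs not currently held by actors
```

If this does not report 4 GPUs, the problem is in the cluster setup rather than in vLLM.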

linux environment

root@ai151:/vllm-workspace# env
NV_LIBCUBLAS_VERSION=12.1.0.26-1
NVIDIA_VISIBLE_DEVICES=all
NV_NVML_DEV_VERSION=12.1.55-1
NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.17.1-1+cuda12.1
NV_LIBNCCL_DEV_PACKAGE_VERSION=2.17.1-1
HOSTNAME=ai151
NVIDIA_REQUIRE_CUDA=cuda>=12.1 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
NV_LIBCUBLAS_DEV_PACKAGE=libcublas-dev-12-1=12.1.0.26-1
NV_NVTX_VERSION=12.1.66-1
NV_CUDA_CUDART_DEV_VERSION=12.1.55-1
NV_LIBCUSPARSE_VERSION=12.0.2.55-1
NV_LIBNPP_VERSION=12.0.2.50-1
NCCL_VERSION=2.17.1-1
PWD=/vllm-workspace
NVIDIA_DRIVER_CAPABILITIES=compute,utility
NV_NVPROF_DEV_PACKAGE=cuda-nvprof-12-1=12.1.55-1
NV_LIBNPP_PACKAGE=libnpp-12-1=12.0.2.50-1
NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev
NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1
NVIDIA_PRODUCT_NAME=CUDA
NV_LIBCUBLAS_DEV_PACKAGE_NAME=libcublas-dev-12-1
NV_CUDA_CUDART_VERSION=12.1.55-1
HOME=/root
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
NVIDIA_CUDA_END_OF_LIFE=1
CUDA_VERSION=12.1.0
NV_LIBCUBLAS_PACKAGE=libcublas-12-1=12.1.0.26-1
NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE=cuda-nsight-compute-12-1=12.1.0-1
NV_LIBNPP_DEV_PACKAGE=libnpp-dev-12-1=12.0.2.50-1
NV_LIBCUBLAS_PACKAGE_NAME=libcublas-12-1
NV_LIBNPP_DEV_VERSION=12.0.2.50-1
LESSCLOSE=/usr/bin/lesspipe %s %s
TERM=xterm
NV_LIBCUSPARSE_DEV_VERSION=12.0.2.55-1
LESSOPEN=| /usr/bin/lesspipe %s
LIBRARY_PATH=/usr/local/cuda/lib64/stubs
SHLVL=1
NV_CUDA_LIB_VERSION=12.1.0-1
NVARCH=x86_64
NV_CUDA_COMPAT_PACKAGE=cuda-compat-12-1
NV_LIBNCCL_PACKAGE=libnccl2=2.17.1-1+cuda12.1
LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
NV_CUDA_NSIGHT_COMPUTE_VERSION=12.1.0-1
NV_NVPROF_VERSION=12.1.55-1
PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
NV_LIBNCCL_PACKAGE_NAME=libnccl2
NV_LIBNCCL_PACKAGE_VERSION=2.17.1-1
_=/usr/bin/env
pip list

root@ai151:/vllm-workspace# pip list
Package Version
------------------------- ---------------
accelerate 0.28.0
aiofiles 23.2.1
aiohttp 3.9.3
aiohttp-cors 0.7.0
aiosignal 1.3.1
altair 5.2.0
annotated-types 0.6.0
anyio 4.3.0
async-timeout 4.0.3
attrs 23.2.0
awscli 1.32.70
botocore 1.34.70
cachetools 5.3.3
certifi 2024.2.2
charset-normalizer 3.3.2
click 8.1.7
cloudpickle 3.0.0
cmake 3.28.4
codespell 2.2.6
colorama 0.4.4
colorful 0.5.6
contourpy 1.2.0
cycler 0.12.1
deepspeed 0.14.0
diskcache 5.6.3
distlib 0.3.8
distro 1.9.0
docutils 0.16
einops 0.7.0
exceptiongroup 1.2.0
fastapi 0.110.0
ffmpy 0.3.2
filelock 3.13.3
flash-attn 2.5.6
fonttools 4.50.0
frozenlist 1.4.1
fsspec 2024.3.1
google-api-core 2.18.0
google-auth 2.29.0
googleapis-common-protos 1.63.0
gradio 4.24.0
gradio_client 0.14.0
grpcio 1.62.1
h11 0.14.0
hjson 3.1.0
httpcore 1.0.4
httptools 0.6.1
httpx 0.27.0
huggingface-hub 0.22.1
idna 3.6
importlib_resources 6.4.0
iniconfig 2.0.0
interegular 0.3.3
isort 5.13.2
Jinja2 3.1.3
jmespath 1.0.1
joblib 1.3.2
jsonschema 4.21.1
jsonschema-specifications 2023.12.1
kiwisolver 1.4.5
lark 1.1.9
llvmlite 0.42.0
markdown-it-py 3.0.0
MarkupSafe 2.1.5
matplotlib 3.8.3
mdurl 0.1.2
mpmath 1.3.0
msgpack 1.0.8
multidict 6.0.5
mypy 0.991
mypy-extensions 1.0.0
nest-asyncio 1.6.0
networkx 3.2.1
ninja 1.11.1.1
numba 0.59.1
numpy 1.26.4
nvidia-cublas-cu12 12.1.3.1
nvidia-cuda-cupti-cu12 12.1.105
nvidia-cuda-nvrtc-cu12 12.1.105
nvidia-cuda-runtime-cu12 12.1.105
nvidia-cudnn-cu12 8.9.2.26
nvidia-cufft-cu12 11.0.2.54
nvidia-curand-cu12 10.3.2.106
nvidia-cusolver-cu12 11.4.5.107
nvidia-cusparse-cu12 12.1.0.106
nvidia-nccl-cu12 2.18.1
nvidia-nvjitlink-cu12 12.4.99
nvidia-nvtx-cu12 12.1.105
openai 1.14.3
opencensus 0.11.4
opencensus-context 0.1.3
orjson 3.10.0
outlines 0.0.34
packaging 24.0
pandas 2.2.1
peft 0.10.0
pillow 10.2.0
pip 22.0.2
platformdirs 4.2.0
pluggy 1.4.0
prometheus_client 0.20.0
proto-plus 1.23.0
protobuf 4.25.3
psutil 5.9.8
py 1.11.0
py-cpuinfo 9.0.0
py-spy 0.3.14
pyasn1 0.5.1
pyasn1_modules 0.4.0
pydantic 2.6.4
pydantic_core 2.16.3
pydub 0.25.1
Pygments 2.17.2
pynvml 11.5.0
pyparsing 3.1.2
pytest 8.1.1
pytest-asyncio 0.23.6
pytest-forked 1.6.0
pytest-rerunfailures 14.0
pytest-shard 0.1.2
python-dateutil 2.9.0.post0
python-dotenv 1.0.1
python-multipart 0.0.9
pytz 2024.1
PyYAML 6.0.1
ray 2.10.0
referencing 0.34.0
regex 2023.12.25
requests 2.31.0
rich 13.7.1
rpds-py 0.18.0
rsa 4.7.2
ruff 0.3.4
s3transfer 0.10.1
safetensors 0.4.2
scipy 1.12.0
semantic-version 2.10.0
sentencepiece 0.2.0
setuptools 59.6.0
shellingham 1.5.4
six 1.16.0
smart-open 7.0.4
sniffio 1.3.1
starlette 0.36.3
sympy 1.12
tokenizers 0.15.2
toml 0.10.2
tomli 2.0.1
tomlkit 0.12.0
toolz 0.12.1
torch 2.1.2
tqdm 4.66.2
transformers 4.39.1
triton 2.1.0
typer 0.11.0
types-PyYAML 6.0.12.20240311
types-requests 2.31.0.20240311
types-setuptools 69.2.0.20240317
typing_extensions 4.10.0
tzdata 2024.1
urllib3 2.2.1
uvicorn 0.29.0
uvloop 0.19.0
virtualenv 20.25.1
vllm 0.3.3
watchfiles 0.21.0
websockets 11.0.3
wheel 0.37.1
wrapt 1.16.0
xformers 0.0.23.post1
yapf 0.32.0
yarl 1.9.4

🐛 Describe the bug

vLLM works fine with --tensor-parallel-size 2, but fails with --tensor-parallel-size 4 and the CUDA "invalid device ordinal" error below.
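What seems to be going on (my assumption, based on the traceback below): each node only exposes CUDA device ordinals 0 and 1, but with --tensor-parallel-size 4 the workers get global ranks 0-3, and the worker apparently passes its global rank straight to torch.cuda.set_device. A minimal sketch of that mismatch:

```python
# Illustration only (not vLLM code): run on either of the two 2-GPU nodes.
import torch

print(torch.cuda.device_count())  # -> 2, so the only valid ordinals are 0 and 1
torch.cuda.set_device(1)          # fine
torch.cuda.set_device(2)          # RuntimeError: CUDA error: invalid device ordinal,
                                  # which is what global ranks 2 and 3 would hit here
```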

RuntimeError: CUDA error: invalid device ordinal

root@ai151:/vllm-workspace# python3 -m vllm.entrypoints.api_server --model /models/openchat-3.5-0106/ --tensor-parallel-size 4 --dtype float16 --enforce-eager
WARNING 03-29 13:57:06 config.py:732] Casting torch.bfloat16 to torch.float16.
2024-03-29 13:57:06,969 INFO worker.py:1567 -- Connecting to existing Ray cluster at address: 10.4.80.151:6379...
2024-03-29 13:57:06,980 INFO worker.py:1743 -- Connected to Ray cluster. View the dashboard at 10.4.80.151:8265
INFO 03-29 13:57:09 llm_engine.py:70] Initializing an LLM engine (v0.3.3) with config: model=/models/openchat-3.5-0106/, tokenizer=/models/openchat-3.5-0106/, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, seed=0)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO 03-29 13:57:22 pynccl.py:49] Loading nccl from library libnccl.so
INFO 03-29 13:57:22 pynccl_utils.py:13] vLLM is using nccl==2.17.1
INFO 03-29 13:57:23 selector.py:33] Cannot use FlashAttention backend for Volta and Turing GPUs.
INFO 03-29 13:57:23 selector.py:20] Using XFormers backend.
(RayWorkerVllm pid=392, ip=10.4.80.152) INFO 03-29 13:57:16 pynccl.py:49] Loading nccl from library libnccl.so
(RayWorkerVllm pid=392, ip=10.4.80.152) INFO 03-29 13:57:16 pynccl_utils.py:13] vLLM is using nccl==2.17.1
(RayWorkerVllm pid=11442) INFO 03-29 13:57:25 selector.py:33] Cannot use FlashAttention backend for Volta and Turing GPUs.
(RayWorkerVllm pid=11442) INFO 03-29 13:57:25 selector.py:20] Using XFormers backend.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] Error executing method init_device. This might cause deadlock in distributed execution.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] Traceback (most recent call last):
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/ray_utils.py", line 37, in execute_method
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] return executor(*args, **kwargs)
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 100, in init_device
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] init_distributed_environment(self.parallel_config, self.rank,
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 286, in init_distributed_environment
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] pynccl_utils.init_process_group(
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/pynccl_utils.py", line 42, in init_process_group
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] comm = NCCLCommunicator(init_method=init_method,
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/pynccl.py", line 226, in __init__
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] torch.cuda.set_device(self.rank)
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 404, in set_device
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] torch._C._cuda_setDevice(device)
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] RuntimeError: CUDA error: invalid device ordinal
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44]
(RayWorkerVllm pid=309, ip=10.4.80.152) Exception ignored in:
(RayWorkerVllm pid=309, ip=10.4.80.152) Traceback (most recent call last):
(RayWorkerVllm pid=309, ip=10.4.80.152) File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/pynccl.py", line 260, in __del__
(RayWorkerVllm pid=309, ip=10.4.80.152) _c_ncclCommDestroy(self.comm)
(RayWorkerVllm pid=309, ip=10.4.80.152) AttributeError: 'NCCLCommunicator' object has no attribute 'comm'
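A possible workaround sketch (hypothetical, not an official fix): derive a node-local device index instead of passing the global rank to torch.cuda.set_device in pynccl.py. This assumes every node has the same GPU count and that ranks are spread evenly across nodes; upgrading vLLM may also help, since this code path has changed in newer releases.

```python
# Hypothetical patch sketch around the failing call in
# vllm/model_executor/parallel_utils/pynccl.py (NCCLCommunicator.__init__):
# use a node-local ordinal rather than the global rank.
import torch

def local_device_index(global_rank: int) -> int:
    # Assumes each node has the same number of GPUs (2 here), so the modulo
    # always yields a valid ordinal on the local node.
    return global_rank % torch.cuda.device_count()

# e.g. global rank 3 on the second 2-GPU node -> ordinal 1
# torch.cuda.set_device(local_device_index(self.rank))  # instead of torch.cuda.set_device(self.rank)
```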
