Your current environment
vLLM 0.3.3 on a Ray 2.10.0 cluster, deployed via Docker on 2 nodes with 2 GPUs (Tesla T4) each, running Linux.
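As a sanity check that both nodes' GPUs are visible to Ray, the following minimal sketch can be run from the head node (it assumes the existing cluster at 10.4.80.151:6379 shown in the log below; `ray.init(address="auto")` attaches to it):

```python
import ray

# Attach to the already-running Ray cluster (head node 10.4.80.151:6379).
ray.init(address="auto")

# With 2 nodes x 2 Tesla T4 each, this should report {"GPU": 4.0, ...}.
print(ray.cluster_resources())
```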
root@ai151:/vllm-workspace# env
NV_LIBCUBLAS_VERSION=12.1.0.26-1
NVIDIA_VISIBLE_DEVICES=all
NV_NVML_DEV_VERSION=12.1.55-1
NV_LIBNCCL_DEV_PACKAGE=libnccl-dev=2.17.1-1+cuda12.1
NV_LIBNCCL_DEV_PACKAGE_VERSION=2.17.1-1
HOSTNAME=ai151
NVIDIA_REQUIRE_CUDA=cuda>=12.1 brand=tesla,driver>=470,driver<471 brand=unknown,driver>=470,driver<471 brand=nvidia,driver>=470,driver<471 brand=nvidiartx,driver>=470,driver<471 brand=geforce,driver>=470,driver<471 brand=geforcertx,driver>=470,driver<471 brand=quadro,driver>=470,driver<471 brand=quadrortx,driver>=470,driver<471 brand=titan,driver>=470,driver<471 brand=titanrtx,driver>=470,driver<471 brand=tesla,driver>=525,driver<526 brand=unknown,driver>=525,driver<526 brand=nvidia,driver>=525,driver<526 brand=nvidiartx,driver>=525,driver<526 brand=geforce,driver>=525,driver<526 brand=geforcertx,driver>=525,driver<526 brand=quadro,driver>=525,driver<526 brand=quadrortx,driver>=525,driver<526 brand=titan,driver>=525,driver<526 brand=titanrtx,driver>=525,driver<526
NV_LIBCUBLAS_DEV_PACKAGE=libcublas-dev-12-1=12.1.0.26-1
NV_NVTX_VERSION=12.1.66-1
NV_CUDA_CUDART_DEV_VERSION=12.1.55-1
NV_LIBCUSPARSE_VERSION=12.0.2.55-1
NV_LIBNPP_VERSION=12.0.2.50-1
NCCL_VERSION=2.17.1-1
PWD=/vllm-workspace
NVIDIA_DRIVER_CAPABILITIES=compute,utility
NV_NVPROF_DEV_PACKAGE=cuda-nvprof-12-1=12.1.55-1
NV_LIBNPP_PACKAGE=libnpp-12-1=12.0.2.50-1
NV_LIBNCCL_DEV_PACKAGE_NAME=libnccl-dev
NV_LIBCUBLAS_DEV_VERSION=12.1.0.26-1
NVIDIA_PRODUCT_NAME=CUDA
NV_LIBCUBLAS_DEV_PACKAGE_NAME=libcublas-dev-12-1
NV_CUDA_CUDART_VERSION=12.1.55-1
HOME=/root
LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=40;33;01:cd=40;33;01:or=40;31;01:mi=00:su=37;41:sg=30;43:ca=30;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arc=01;31:*.arj=01;31:*.taz=01;31:*.lha=01;31:*.lz4=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.tzo=01;31:*.t7z=01;31:*.zip=01;31:*.z=01;31:*.dz=01;31:*.gz=01;31:*.lrz=01;31:*.lz=01;31:*.lzo=01;31:*.xz=01;31:*.zst=01;31:*.tzst=01;31:*.bz2=01;31:*.bz=01;31:*.tbz=01;31:*.tbz2=01;31:*.tz=01;31:*.deb=01;31:*.rpm=01;31:*.jar=01;31:*.war=01;31:*.ear=01;31:*.sar=01;31:*.rar=01;31:*.alz=01;31:*.ace=01;31:*.zoo=01;31:*.cpio=01;31:*.7z=01;31:*.rz=01;31:*.cab=01;31:*.wim=01;31:*.swm=01;31:*.dwm=01;31:*.esd=01;31:*.jpg=01;35:*.jpeg=01;35:*.mjpg=01;35:*.mjpeg=01;35:*.gif=01;35:*.bmp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*.xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=01;35:*.mkv=01;35:*.webm=01;35:*.webp=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.vob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.rmvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*.dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=00;36:*.au=00;36:*.flac=00;36:*.m4a=00;36:*.mid=00;36:*.midi=00;36:*.mka=00;36:*.mp3=00;36:*.mpc=00;36:*.ogg=00;36:*.ra=00;36:*.wav=00;36:*.oga=00;36:*.opus=00;36:*.spx=00;36:*.xspf=00;36:
NVIDIA_CUDA_END_OF_LIFE=1
CUDA_VERSION=12.1.0
NV_LIBCUBLAS_PACKAGE=libcublas-12-1=12.1.0.26-1
NV_CUDA_NSIGHT_COMPUTE_DEV_PACKAGE=cuda-nsight-compute-12-1=12.1.0-1
NV_LIBNPP_DEV_PACKAGE=libnpp-dev-12-1=12.0.2.50-1
NV_LIBCUBLAS_PACKAGE_NAME=libcublas-12-1
NV_LIBNPP_DEV_VERSION=12.0.2.50-1
LESSCLOSE=/usr/bin/lesspipe %s %s
TERM=xterm
NV_LIBCUSPARSE_DEV_VERSION=12.0.2.55-1
LESSOPEN=| /usr/bin/lesspipe %s
LIBRARY_PATH=/usr/local/cuda/lib64/stubs
SHLVL=1
NV_CUDA_LIB_VERSION=12.1.0-1
NVARCH=x86_64
NV_CUDA_COMPAT_PACKAGE=cuda-compat-12-1
NV_LIBNCCL_PACKAGE=libnccl2=2.17.1-1+cuda12.1
LD_LIBRARY_PATH=/usr/local/nvidia/lib:/usr/local/nvidia/lib64
NV_CUDA_NSIGHT_COMPUTE_VERSION=12.1.0-1
NV_NVPROF_VERSION=12.1.55-1
PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin
NV_LIBNCCL_PACKAGE_NAME=libnccl2
NV_LIBNCCL_PACKAGE_VERSION=2.17.1-1
_=/usr/bin/env
root@ai151:/vllm-workspace# pip list
Package                   Version
------------------------- ---------------
accelerate 0.28.0
aiofiles 23.2.1
aiohttp 3.9.3
aiohttp-cors 0.7.0
aiosignal 1.3.1
altair 5.2.0
annotated-types 0.6.0
anyio 4.3.0
async-timeout 4.0.3
attrs 23.2.0
awscli 1.32.70
botocore 1.34.70
cachetools 5.3.3
certifi 2024.2.2
charset-normalizer 3.3.2
click 8.1.7
cloudpickle 3.0.0
cmake 3.28.4
codespell 2.2.6
colorama 0.4.4
colorful 0.5.6
contourpy 1.2.0
cycler 0.12.1
deepspeed 0.14.0
diskcache 5.6.3
distlib 0.3.8
distro 1.9.0
docutils 0.16
einops 0.7.0
exceptiongroup 1.2.0
fastapi 0.110.0
ffmpy 0.3.2
filelock 3.13.3
flash-attn 2.5.6
fonttools 4.50.0
frozenlist 1.4.1
fsspec 2024.3.1
google-api-core 2.18.0
google-auth 2.29.0
googleapis-common-protos 1.63.0
gradio 4.24.0
gradio_client 0.14.0
grpcio 1.62.1
h11 0.14.0
hjson 3.1.0
httpcore 1.0.4
httptools 0.6.1
httpx 0.27.0
huggingface-hub 0.22.1
idna 3.6
importlib_resources 6.4.0
iniconfig 2.0.0
interegular 0.3.3
isort 5.13.2
Jinja2 3.1.3
jmespath 1.0.1
joblib 1.3.2
jsonschema 4.21.1
jsonschema-specifications 2023.12.1
kiwisolver 1.4.5
lark 1.1.9
llvmlite 0.42.0
markdown-it-py 3.0.0
MarkupSafe 2.1.5
matplotlib 3.8.3
mdurl 0.1.2
mpmath 1.3.0
msgpack 1.0.8
multidict 6.0.5
mypy 0.991
mypy-extensions 1.0.0
nest-asyncio 1.6.0
networkx 3.2.1
ninja 1.11.1.1
numba 0.59.1
numpy 1.26.4
nvidia-cublas-cu12 12.1.3.1
nvidia-cuda-cupti-cu12 12.1.105
nvidia-cuda-nvrtc-cu12 12.1.105
nvidia-cuda-runtime-cu12 12.1.105
nvidia-cudnn-cu12 8.9.2.26
nvidia-cufft-cu12 11.0.2.54
nvidia-curand-cu12 10.3.2.106
nvidia-cusolver-cu12 11.4.5.107
nvidia-cusparse-cu12 12.1.0.106
nvidia-nccl-cu12 2.18.1
nvidia-nvjitlink-cu12 12.4.99
nvidia-nvtx-cu12 12.1.105
openai 1.14.3
opencensus 0.11.4
opencensus-context 0.1.3
orjson 3.10.0
outlines 0.0.34
packaging 24.0
pandas 2.2.1
peft 0.10.0
pillow 10.2.0
pip 22.0.2
platformdirs 4.2.0
pluggy 1.4.0
prometheus_client 0.20.0
proto-plus 1.23.0
protobuf 4.25.3
psutil 5.9.8
py 1.11.0
py-cpuinfo 9.0.0
py-spy 0.3.14
pyasn1 0.5.1
pyasn1_modules 0.4.0
pydantic 2.6.4
pydantic_core 2.16.3
pydub 0.25.1
Pygments 2.17.2
pynvml 11.5.0
pyparsing 3.1.2
pytest 8.1.1
pytest-asyncio 0.23.6
pytest-forked 1.6.0
pytest-rerunfailures 14.0
pytest-shard 0.1.2
python-dateutil 2.9.0.post0
python-dotenv 1.0.1
python-multipart 0.0.9
pytz 2024.1
PyYAML 6.0.1
ray 2.10.0
referencing 0.34.0
regex 2023.12.25
requests 2.31.0
rich 13.7.1
rpds-py 0.18.0
rsa 4.7.2
ruff 0.3.4
s3transfer 0.10.1
safetensors 0.4.2
scipy 1.12.0
semantic-version 2.10.0
sentencepiece 0.2.0
setuptools 59.6.0
shellingham 1.5.4
six 1.16.0
smart-open 7.0.4
sniffio 1.3.1
starlette 0.36.3
sympy 1.12
tokenizers 0.15.2
toml 0.10.2
tomli 2.0.1
tomlkit 0.12.0
toolz 0.12.1
torch 2.1.2
tqdm 4.66.2
transformers 4.39.1
triton 2.1.0
typer 0.11.0
types-PyYAML 6.0.12.20240311
types-requests 2.31.0.20240311
types-setuptools 69.2.0.20240317
typing_extensions 4.10.0
tzdata 2024.1
urllib3 2.2.1
uvicorn 0.29.0
uvloop 0.19.0
virtualenv 20.25.1
vllm 0.3.3
watchfiles 0.21.0
websockets 11.0.3
wheel 0.37.1
wrapt 1.16.0
xformers 0.0.23.post1
yapf 0.32.0
yarl 1.9.4
🐛 Describe the bug
vLLM works fine with --tensor-parallel-size 2, but fails with --tensor-parallel-size 4:
RuntimeError: CUDA error: invalid device ordinal
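The same configuration can presumably be reproduced with the offline LLM API as well (a minimal sketch; the model path and flags mirror the api_server command in the log below):

```python
from vllm import LLM

# Sketch of the failing configuration (same model path and flags as the
# api_server command below); tensor_parallel_size=4 spans both 2-GPU nodes
# via the existing Ray cluster.
llm = LLM(
    model="/models/openchat-3.5-0106/",
    tensor_parallel_size=4,
    dtype="float16",
    enforce_eager=True,
)
print(llm.generate("Hello"))
```

Full log of the failing api_server launch: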
root@ai151:/vllm-workspace# python3 -m vllm.entrypoints.api_server --model /models/openchat-3.5-0106/ --tensor-parallel-size 4 --dtype float16 --enforce-eager
WARNING 03-29 13:57:06 config.py:732] Casting torch.bfloat16 to torch.float16.
2024-03-29 13:57:06,969 INFO worker.py:1567 -- Connecting to existing Ray cluster at address: 10.4.80.151:6379...
2024-03-29 13:57:06,980 INFO worker.py:1743 -- Connected to Ray cluster. View the dashboard at 10.4.80.151:8265
INFO 03-29 13:57:09 llm_engine.py:70] Initializing an LLM engine (v0.3.3) with config: model=/models/openchat-3.5-0106/, tokenizer=/models/openchat-3.5-0106/, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=8192, download_dir=None, load_format=auto, tensor_parallel_size=4, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, seed=0)
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
INFO 03-29 13:57:22 pynccl.py:49] Loading nccl from library libnccl.so
INFO 03-29 13:57:22 pynccl_utils.py:13] vLLM is using nccl==2.17.1
INFO 03-29 13:57:23 selector.py:33] Cannot use FlashAttention backend for Volta and Turing GPUs.
INFO 03-29 13:57:23 selector.py:20] Using XFormers backend.
(RayWorkerVllm pid=392, ip=10.4.80.152) INFO 03-29 13:57:16 pynccl.py:49] Loading nccl from library libnccl.so
(RayWorkerVllm pid=392, ip=10.4.80.152) INFO 03-29 13:57:16 pynccl_utils.py:13] vLLM is using nccl==2.17.1
(RayWorkerVllm pid=11442) INFO 03-29 13:57:25 selector.py:33] Cannot use FlashAttention backend for Volta and Turing GPUs.
(RayWorkerVllm pid=11442) INFO 03-29 13:57:25 selector.py:20] Using XFormers backend.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] Error executing method init_device. This might cause deadlock in distributed execution.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] Traceback (most recent call last):
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/engine/ray_utils.py", line 37, in execute_method
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] return executor(*args, **kwargs)
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 100, in init_device
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] init_distributed_environment(self.parallel_config, self.rank,
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/worker/worker.py", line 286, in init_distributed_environment
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] pynccl_utils.init_process_group(
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/pynccl_utils.py", line 42, in init_process_group
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] comm = NCCLCommunicator(init_method=init_method,
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/pynccl.py", line 226, in __init__
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] torch.cuda.set_device(self.rank)
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] File "/usr/local/lib/python3.10/dist-packages/torch/cuda/__init__.py", line 404, in set_device
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] torch._C._cuda_setDevice(device)
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] RuntimeError: CUDA error: invalid device ordinal
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44] Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
(RayWorkerVllm pid=309, ip=10.4.80.152) ERROR 03-29 13:57:18 ray_utils.py:44]
(RayWorkerVllm pid=309, ip=10.4.80.152) Exception ignored in:
(RayWorkerVllm pid=309, ip=10.4.80.152) Traceback (most recent call last):
(RayWorkerVllm pid=309, ip=10.4.80.152) File "/usr/local/lib/python3.10/dist-packages/vllm/model_executor/parallel_utils/pynccl.py", line 260, in __del__
(RayWorkerVllm pid=309, ip=10.4.80.152) _c_ncclCommDestroy(self.comm)
(RayWorkerVllm pid=309, ip=10.4.80.152) AttributeError: 'NCCLCommunicator' object has no attribute 'comm'
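The traceback points at torch.cuda.set_device(self.rank) in pynccl.py, and the failing worker lives on the second node (ip=10.4.80.152), which only exposes CUDA device ordinals 0 and 1. Passing a global rank of 2 or 3 there would explain the "invalid device ordinal". A minimal sketch of the global-rank vs. local-ordinal mismatch (illustrative only; the local_rank mapping below is not vLLM code):

```python
import torch

# Illustrative only: 2 nodes x 2 Tesla T4 => valid ordinals per node are 0 and 1.
world_size = 4      # --tensor-parallel-size 4 => 4 workers across 2 nodes
gpus_per_node = 2   # Tesla T4 x 2 on each node

for global_rank in range(world_size):
    local_rank = global_rank % gpus_per_node  # hypothetical per-node mapping
    print(f"global rank {global_rank} -> local device ordinal {local_rank}")

# torch.cuda.set_device(3) on a 2-GPU node raises
# "RuntimeError: CUDA error: invalid device ordinal", matching the log above,
# whereas torch.cuda.set_device(local_rank) would stay within 0..1 on every node.
```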