
Commit 0b60da2

yuantailing and kaiyux authored
feat: large-scale EP(part 7: DeepEP integration) (#4792)
Signed-off-by: Tailing Yuan <[email protected]>
Co-authored-by: Kaiyu Xie <[email protected]>
1 parent 443b2eb commit 0b60da2

18 files changed: +610 additions, -88 deletions

.devcontainer/docker-compose.yml

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@
 version: "3.9"
 services:
   tensorrt_llm-dev:
-    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506021004-9420
+    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792
     network_mode: host
     ipc: host

docker/Dockerfile.multi

Lines changed: 4 additions & 0 deletions

@@ -72,6 +72,10 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
 RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
 RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir

+# Install DeepEP
+COPY docker/common/install_deep_ep.sh install_deep_ep.sh
+RUN bash ./install_deep_ep.sh && rm install_deep_ep.sh
+
 # WARs against security issues inherited from pytorch:25.04
 # * https://github.com/advisories/GHSA-vqfr-h8mv-ghfj
 # * https://github.com/advisories/GHSA-7cx3-6m66-7c5m

docker/common/install_deep_ep.sh

Lines changed: 47 additions & 0 deletions

@@ -0,0 +1,47 @@
+#!/bin/bash
+
+set -euxo pipefail
+
+GITHUB_URL=${GITHUB_MIRROR:-https://github.com}
+DEEP_EP_COMMIT=2b266cf6452134f993ab0fcb3ef2d5de7683c561
+
+if [ "$(. /etc/os-release && echo $ID)" == "rocky" ]; then
+    echo "Skipping DeepEP installation in the Rocky distribution."
+    exit 0
+fi
+libmlx5_dir=$(dirname $(ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}'))
+
+export NVCC_APPEND_FLAGS="--threads 4"
+
+# Custom NVSHMEM
+curl -fsSL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz | tar xz
+pushd nvshmem_src
+curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/raw/$DEEP_EP_COMMIT/third-party/nvshmem.patch | patch -p1
+sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt
+ln -s libmlx5.so.1 "$libmlx5_dir/libmlx5.so"
+cmake -S . -B build \
+    -DCMAKE_INSTALL_PREFIX=/opt/custom_nvshmem \
+    -DGDRCOPY_HOME=/usr/include \
+    -DNVSHMEM_SHMEM_SUPPORT=0 \
+    -DNVSHMEM_UCX_SUPPORT=0 \
+    -DNVSHMEM_USE_NCCL=0 \
+    -DNVSHMEM_MPI_SUPPORT=0 \
+    -DNVSHMEM_IBGDA_SUPPORT=1 \
+    -DNVSHMEM_PMIX_SUPPORT=0 \
+    -DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
+    -DNVSHMEM_USE_GDRCOPY=1 \
+    -DCMAKE_CUDA_ARCHITECTURES="90-real;100-real;120-real" \
+    -DNVSHMEM_BUILD_TESTS=0 \
+    -DNVSHMEM_BUILD_EXAMPLES=0
+cmake --build build -j`nproc`
+make -C build install
+popd
+
+# DeepEP
+curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/archive/$DEEP_EP_COMMIT.tar.gz | tar xz
+TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0" NVSHMEM_DIR=/opt/custom_nvshmem pip install -v --no-cache-dir ./DeepEP-$DEEP_EP_COMMIT
+
+# Clean up
+rm -r nvshmem_src
+rm "$libmlx5_dir/libmlx5.so"
+rm -r DeepEP-$DEEP_EP_COMMIT
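DeepEP ships as a Python extension, so a quick way to confirm the image built correctly is to import it from Python. The snippet below is a hypothetical post-install smoke check, not part of this commit; it assumes only the upstream package name deep_ep.

# Hypothetical smoke check after running install_deep_ep.sh (not part of this commit).
import torch

try:
    import deep_ep  # extension built from the DeepEP source tree above
    print("DeepEP import OK; visible CUDA devices:", torch.cuda.device_count())
except ImportError as err:
    print("DeepEP is not installed:", err)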

jenkins/L0_MergeRequest.groovy

Lines changed: 4 additions & 4 deletions

@@ -28,10 +28,10 @@ UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifac
 // Container configuration
 // available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/
 // [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id]
-LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506021004-9420"
-LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506021004-9420"
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202506021004-9420"
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202506021004-9420"
+LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
+LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202506111045-4792"
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202506111045-4792"

 // TODO: Move common variables to an unified location
 BUILD_CORES_REQUEST = "8"

jenkins/controlCCache.groovy

Lines changed: 1 addition & 1 deletion

@@ -1,7 +1,7 @@

 import java.lang.InterruptedException

-DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506021004-9420"
+DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"

 def createKubernetesPodConfig(image, arch = "amd64")
 {

tensorrt_llm/_torch/model_config.py

Lines changed: 3 additions & 0 deletions

@@ -84,6 +84,9 @@ class ModelConfig(Generic[TConfig]):
     # If true, enable min-latency mode. Currently only used for Llama4.
     enable_min_latency: bool = False

+    # Allow models to select op according to whether CUDA Graphs are used.
+    use_cuda_graph: bool = False
+
     extra_attrs: Dict = field(default_factory=dict, repr=False, init=False)

     _frozen: bool = field(default=False, init=False, repr=False)
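The new field lets model code pick an op variant depending on whether it will run under CUDA Graph capture. A minimal illustrative sketch follows; only the use_cuda_graph field comes from this diff, everything else is hypothetical.

# Hypothetical sketch: only ModelConfig.use_cuda_graph is from this commit.
from dataclasses import dataclass

@dataclass
class _ModelConfigStub:
    use_cuda_graph: bool = False

def _select_moe_op(cfg: _ModelConfigStub) -> str:
    # Ops with host-side synchronization are unsafe inside a captured CUDA Graph,
    # so a graph-safe variant would be chosen when the flag is set.
    return "graph_safe_alltoall" if cfg.use_cuda_graph else "default_alltoall"

print(_select_moe_op(_ModelConfigStub(use_cuda_graph=True)))  # graph_safe_alltoall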

tensorrt_llm/_torch/models/modeling_deepseekv3.py

Lines changed: 1 addition & 26 deletions

@@ -38,7 +38,6 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig

-from tensorrt_llm._mnnvl_utils import MnnvlMemory
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
 from tensorrt_llm.mapping import Mapping
@@ -413,10 +412,6 @@ def __init__(self,
         config = model_config.pretrained_config
         self.top_k = top_k
         self.use_dp = model_config.mapping.enable_attention_dp
-        self.enable_alltoall = Deepseekv3MoE.should_enable_alltoall(
-            model_config, top_k)
-        if self.enable_alltoall:
-            MnnvlMemory.initialize()
         self.gate = DeepseekV3Gate(
             hidden_size,
             num_experts,
@@ -439,7 +434,6 @@ def __init__(self,
             model_config=model_config,
             override_quant_config=override_quant_config,
             aux_stream=aux_stream_dict[AuxStreamType.MoeChunkingOverlap],
-            enable_alltoall=self.enable_alltoall,
             layer_idx=layer_idx)

         self.mapping = model_config.mapping
@@ -505,33 +499,14 @@ def _compute_shared_expert_tp_size(self, intermediate_size: int,

         return shared_tp_size, shared_output_scale

-    @staticmethod
-    def should_enable_alltoall(model_config: ModelConfig, top_k: int) -> bool:
-        if not model_config.mapping.enable_attention_dp:
-            return False
-
-        if model_config.mapping.tp_size == 1:
-            return False
-
-        if not MnnvlMemory.supports_mnnvl():
-            return False
-
-        if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
-            return False
-
-        if model_config.mapping.moe_ep_size <= top_k:
-            return False
-
-        return True
-
     def compute_routed_output(self, hidden_states, hidden_states_fp4,
                               all_rank_num_tokens, do_finalize):
         # max-throughput
         use_dp_padding = False
         if self.use_dp and self.mapping.tp_size > 1:
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
-            if disable_fp4_allgather() and not self.enable_alltoall:
+            if disable_fp4_allgather() and not self.experts.enable_alltoall:
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
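The per-model should_enable_alltoall gating is removed; this call site, and the matching one in modeling_qwen3_moe.py below, now read the decision from the experts module as self.experts.enable_alltoall. A self-contained sketch of that ownership, mirroring the conditions deleted above (the MnnvlMemory.supports_mnnvl() check is omitted, and all class names here are illustrative):

import os
from dataclasses import dataclass

@dataclass
class _MappingStub:
    enable_attention_dp: bool = True
    tp_size: int = 8
    moe_ep_size: int = 8

class _FusedMoEStub:
    # Illustrative MoE module that decides about all-to-all itself.
    def __init__(self, mapping: _MappingStub, top_k: int):
        self.enable_alltoall = self._should_enable_alltoall(mapping, top_k)

    @staticmethod
    def _should_enable_alltoall(mapping: _MappingStub, top_k: int) -> bool:
        if not mapping.enable_attention_dp:
            return False
        if mapping.tp_size == 1:
            return False
        if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
            return False
        return mapping.moe_ep_size > top_k

experts = _FusedMoEStub(_MappingStub(), top_k=4)
print(experts.enable_alltoall)  # True with the defaults above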

tensorrt_llm/_torch/models/modeling_qwen3_moe.py

Lines changed: 1 addition & 26 deletions

@@ -6,8 +6,6 @@
 from tqdm import tqdm
 from transformers import Qwen3MoeConfig

-from tensorrt_llm._mnnvl_utils import MnnvlMemory
-
 from ..attention_backend import AttentionMetadata
 from ..distributed import (AllReduce, AllReduceFusionOp, AllReduceParams,
                            allgather)
@@ -91,10 +89,6 @@ def __init__(
         self.mapping = model_config.mapping
         self.allreduce = AllReduce(mapping=model_config.mapping,
                                    strategy=model_config.allreduce_strategy)
-        self.enable_alltoall = Qwen3MoE.should_enable_alltoall(
-            model_config, self.top_k)
-        if self.enable_alltoall:
-            MnnvlMemory.initialize()

         self.gate = Qwen3Gate(
             hidden_size=self.hidden_dim,
@@ -117,25 +111,6 @@ def __init__(
             model_config=model_config,
         )

-    @staticmethod
-    def should_enable_alltoall(model_config: ModelConfig, top_k: int) -> bool:
-        if not model_config.mapping.enable_attention_dp:
-            return False
-
-        if model_config.mapping.tp_size == 1:
-            return False
-
-        if not MnnvlMemory.supports_mnnvl():
-            return False
-
-        if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
-            return False
-
-        if model_config.mapping.moe_ep_size <= top_k:
-            return False
-
-        return True
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -151,7 +126,7 @@ def forward(
         if self.enable_attention_dp and self.mapping.tp_size > 1:
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
-            if disable_fp4_allgather() and not self.enable_alltoall:
+            if disable_fp4_allgather() and not self.experts.enable_alltoall:
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,

tensorrt_llm/_torch/modules/fused_moe/create_moe.py

Lines changed: 0 additions & 4 deletions

@@ -52,7 +52,6 @@ def create_moe(
     aux_stream: Optional[torch.cuda.Stream] = None,
     weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.VANILLA,
     apply_router_weight_on_input: bool = False,
-    enable_alltoall: bool = False,
     layer_idx: Optional[int] = None,
 ) -> MoE:
     moe_cls = get_moe_cls(model_config, override_quant_config)
@@ -63,7 +62,6 @@

     if moe_cls == TRTLLMGenFusedMoE:
         assert not apply_router_weight_on_input, "apply_router_weight_on_input is not supported in TRTLLMGenFusedMoE."
-        assert not enable_alltoall, "enable_alltoall is not supported in TRTLLMGenFusedMoE."

         return moe_cls(
             routing_method=routing_method,
@@ -88,12 +86,10 @@
             aux_stream=aux_stream,
             weight_loading_mode=weight_loading_mode,
             apply_router_weight_on_input=apply_router_weight_on_input,
-            enable_alltoall=enable_alltoall,
             layer_idx=layer_idx,
         )
     elif moe_cls == VanillaMoE:
         assert not apply_router_weight_on_input, "apply_router_weight_on_input is not supported in VanillaMoE."
-        assert not enable_alltoall, "enable_alltoall is not supported in VanillaMoE."

         return moe_cls(
             routing_method=routing_method,
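With enable_alltoall dropped from the factory signature, callers no longer thread the flag through create_moe; each MoE backend determines it internally. A self-contained sketch of that design move (all names are illustrative stand-ins for the real classes):

class _BaseMoE:
    # backends without an all-to-all path keep the default
    enable_alltoall = False

class _WideEPMoE(_BaseMoE):
    def __init__(self, ep_size: int, top_k: int):
        # decision made inside the module rather than passed in by the caller
        self.enable_alltoall = ep_size > top_k

def _create_moe_sketch(use_wide_ep: bool, ep_size: int = 16, top_k: int = 8) -> _BaseMoE:
    return _WideEPMoE(ep_size, top_k) if use_wide_ep else _BaseMoE()

print(_create_moe_sketch(True).enable_alltoall)   # True
print(_create_moe_sketch(False).enable_alltoall)  # False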
