Merged

Commits (29):
e911c07  Use DeepEP allToAll (yuantailing, Jun 3, 2025)
db7b8d9  Switch between three EP communication methods (yuantailing, Jun 3, 2025)
2df317a  Polish Dockerfile (yuantailing, Jun 3, 2025)
115c56e  Fix coding style (yuantailing, Jun 3, 2025)
b1d42a7  DeepEP buffer management (yuantailing, Jun 3, 2025)
b003dd1  Merge use_cuda_graph options (yuantailing, Jun 4, 2025)
a58497e  Change back num_sms (yuantailing, Jun 4, 2025)
1417ce4  Update staging image (yuantailing, Jun 4, 2025)
29d4f53  Refactor enable_alltoall (yuantailing, Jun 5, 2025)
67ee9fb  Fix: no pytorch_backend_config due to #4814 (yuantailing, Jun 6, 2025)
fe86036  Update alltoall_method_type (yuantailing, Jun 6, 2025)
c13ab34  Refactor deep_ep_utils.py (yuantailing, Jun 6, 2025)
e4f2bdc  Add Logger.log_once (yuantailing, Jun 6, 2025)
a9c57e8  Add test_fused_moe_alltoall (yuantailing, Jun 6, 2025)
e87d9f5  Fix flashinfer-python version (issue #5002) (yuantailing, Jun 7, 2025)
7526680  Merge remote-tracking branch 'origin/main' into deepep (yuantailing, Jun 9, 2025)
5f43838  Update staging image (yuantailing, Jun 9, 2025)
04e3991  Move log (yuantailing, Jun 9, 2025)
28c0432  Lint: rename deepep => deep_ep (yuantailing, Jun 10, 2025)
2b10408  Update test_fused_moe_alltoall (yuantailing, Jun 10, 2025)
89fa4f3  Merge commit '137fe35539ea182f1495f5021bfda97c729e50c3' into deepep (yuantailing, Jun 10, 2025)
7cfa520  Update comments (yuantailing, Jun 10, 2025)
af56adc  Update docker images (yuantailing, Jun 11, 2025)
7bd78a1  Merge branch 'main' into deepep (yuantailing, Jun 11, 2025)
bf4e2b6  Merge branch 'main' into deepep (yuantailing, Jun 11, 2025)
00a3c47  Merge branch 'main' into deepep (yuantailing, Jun 12, 2025)
8a32674  Merge branch 'main' into deepep (yuantailing, Jun 12, 2025)
c8919e6  Merge branch 'main' into deepep (kaiyux, Jun 13, 2025)
4181a6e  Merge branch 'main' into deepep (yuantailing, Jun 14, 2025)
2 changes: 1 addition & 1 deletion .devcontainer/docker-compose.yml
@@ -1,7 +1,7 @@
 version: "3.9"
 services:
   tensorrt_llm-dev:
-    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506021004-9420
+    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792
     network_mode: host
     ipc: host

4 changes: 4 additions & 0 deletions docker/Dockerfile.multi
@@ -72,6 +72,10 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
 RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
 RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
 
+# Install DeepEP
+COPY docker/common/install_deep_ep.sh install_deep_ep.sh
+RUN bash ./install_deep_ep.sh && rm install_deep_ep.sh
+
 # WARs against security issues inherited from pytorch:25.04
 # * https://github.com/advisories/GHSA-vqfr-h8mv-ghfj
 # * https://github.com/advisories/GHSA-7cx3-6m66-7c5m
47 changes: 47 additions & 0 deletions docker/common/install_deep_ep.sh
@@ -0,0 +1,47 @@
#!/bin/bash

set -euxo pipefail

GITHUB_URL=${GITHUB_MIRROR:-https://github.com}
DEEP_EP_COMMIT=2b266cf6452134f993ab0fcb3ef2d5de7683c561

if [ "$(. /etc/os-release && echo $ID)" == "rocky" ]; then
echo "Skipping DeepEP installation in the Rocky distribution."
exit 0
fi
libmlx5_dir=$(dirname $(ldconfig -p | grep libmlx5.so.1 | head -n1 | awk '{print $NF}'))

export NVCC_APPEND_FLAGS="--threads 4"

# Custom NVSHMEM
curl -fsSL https://developer.download.nvidia.com/compute/redist/nvshmem/3.2.5/source/nvshmem_src_3.2.5-1.txz | tar xz
pushd nvshmem_src
curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/raw/$DEEP_EP_COMMIT/third-party/nvshmem.patch | patch -p1
sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i src/CMakeLists.txt
ln -s libmlx5.so.1 "$libmlx5_dir/libmlx5.so"
cmake -S . -B build \
-DCMAKE_INSTALL_PREFIX=/opt/custom_nvshmem \
-DGDRCOPY_HOME=/usr/include \
-DNVSHMEM_SHMEM_SUPPORT=0 \
-DNVSHMEM_UCX_SUPPORT=0 \
-DNVSHMEM_USE_NCCL=0 \
-DNVSHMEM_MPI_SUPPORT=0 \
-DNVSHMEM_IBGDA_SUPPORT=1 \
-DNVSHMEM_PMIX_SUPPORT=0 \
-DNVSHMEM_TIMEOUT_DEVICE_POLLING=0 \
-DNVSHMEM_USE_GDRCOPY=1 \
-DCMAKE_CUDA_ARCHITECTURES="90-real;100-real;120-real" \
-DNVSHMEM_BUILD_TESTS=0 \
-DNVSHMEM_BUILD_EXAMPLES=0
cmake --build build -j`nproc`
make -C build install
popd

# DeepEP
curl -fsSL $GITHUB_URL/deepseek-ai/DeepEP/archive/$DEEP_EP_COMMIT.tar.gz | tar xz
TORCH_CUDA_ARCH_LIST="9.0;10.0;12.0" NVSHMEM_DIR=/opt/custom_nvshmem pip install -v --no-cache-dir ./DeepEP-$DEEP_EP_COMMIT

# Clean up
rm -r nvshmem_src
rm "$libmlx5_dir/libmlx5.so"
rm -r DeepEP-$DEEP_EP_COMMIT
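
Note: the script above builds a patched NVSHMEM into /opt/custom_nvshmem and then pip-installs DeepEP against it. A minimal post-install sanity check that the resulting image can actually load the extension might look like the following; it assumes DeepEP installs under its usual package name deep_ep, which is not dictated by this diff.

# Hypothetical sanity check; `deep_ep` as the package name is an assumption.
import importlib.util


def deep_ep_available() -> bool:
    """Return True if the DeepEP extension built by install_deep_ep.sh imports cleanly."""
    if importlib.util.find_spec("deep_ep") is None:
        return False
    try:
        import deep_ep  # noqa: F401  # the CUDA/NVSHMEM extension loads at import time
    except ImportError:
        return False
    return True


if __name__ == "__main__":
    print("DeepEP importable:", deep_ep_available())
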
8 changes: 4 additions & 4 deletions jenkins/L0_MergeRequest.groovy
@@ -28,10 +28,10 @@ UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifac
 // Container configuration
 // available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/
 // [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id]
-LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506021004-9420"
-LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506021004-9420"
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202506021004-9420"
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202506021004-9420"
+LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
+LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202506111045-4792"
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202506111045-4792"
 
 // TODO: Move common variables to an unified location
 BUILD_CORES_REQUEST = "8"
2 changes: 1 addition & 1 deletion jenkins/controlCCache.groovy
@@ -1,7 +1,7 @@
 
 import java.lang.InterruptedException
 
-DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506021004-9420"
+DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
 
 def createKubernetesPodConfig(image, arch = "amd64")
 {
3 changes: 3 additions & 0 deletions tensorrt_llm/_torch/model_config.py
@@ -84,6 +84,9 @@ class ModelConfig(Generic[TConfig]):
     # If true, enable min-latency mode. Currently only used for Llama4.
     enable_min_latency: bool = False
 
+    # Allow models to select op according to whether CUDA Graphs are used.
+    use_cuda_graph: bool = False
+
     extra_attrs: Dict = field(default_factory=dict, repr=False, init=False)
 
     _frozen: bool = field(default=False, init=False, repr=False)
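
Note: with use_cuda_graph carried on ModelConfig, a model can branch on it when picking kernels. A minimal sketch of that pattern, assuming a module receives the config at construction time; the two path names below are placeholders, not ops added by this PR.

import torch


class GraphAwareOp(torch.nn.Module):
    """Illustrative sketch: select an op variant based on ModelConfig.use_cuda_graph."""

    def __init__(self, model_config):
        super().__init__()
        # CUDA Graph capture requires launch parameters to stay static,
        # so a capture-friendly path may be preferred when the flag is set.
        self.use_cuda_graph = getattr(model_config, "use_cuda_graph", False)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        if self.use_cuda_graph:
            return self._graph_safe_path(x)  # placeholder for a fixed-shape kernel
        return self._dynamic_path(x)  # placeholder for a shape-dynamic kernel

    def _graph_safe_path(self, x: torch.Tensor) -> torch.Tensor:
        return x

    def _dynamic_path(self, x: torch.Tensor) -> torch.Tensor:
        return x
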
27 changes: 1 addition & 26 deletions tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -38,7 +38,6 @@
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
-from tensorrt_llm._mnnvl_utils import MnnvlMemory
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
 from tensorrt_llm.models.modeling_utils import QuantConfig
@@ -351,10 +350,6 @@ def __init__(self,
         config = model_config.pretrained_config
         self.top_k = top_k
         self.use_dp = model_config.mapping.enable_attention_dp
-        self.enable_alltoall = Deepseekv3MoE.should_enable_alltoall(
-            model_config, top_k)
-        if self.enable_alltoall:
-            MnnvlMemory.initialize()
         self.gate = DeepseekV3Gate(
             hidden_size,
             num_experts,
@@ -377,7 +372,6 @@ def __init__(self,
             model_config=model_config,
             override_quant_config=override_quant_config,
             aux_stream=aux_stream_dict[AuxStreamType.MoeChunkingOverlap],
-            enable_alltoall=self.enable_alltoall,
             layer_idx=layer_idx)
 
         self.mapping = model_config.mapping
@@ -443,33 +437,14 @@ def _compute_shared_expert_tp_size(self, intermediate_size: int,
 
         return shared_tp_size, shared_output_scale
 
-    @staticmethod
-    def should_enable_alltoall(model_config: ModelConfig, top_k: int) -> bool:
-        if not model_config.mapping.enable_attention_dp:
-            return False
-
-        if model_config.mapping.tp_size == 1:
-            return False
-
-        if not MnnvlMemory.supports_mnnvl():
-            return False
-
-        if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
-            return False
-
-        if model_config.mapping.moe_ep_size <= top_k:
-            return False
-
-        return True
-
     def compute_routed_output(self, hidden_states, hidden_states_fp4,
                               all_rank_num_tokens, do_finalize):
         # max-throughput
         use_dp_padding = False
         if self.use_dp and self.mapping.tp_size > 1:
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
-            if disable_fp4_allgather() and not self.enable_alltoall:
+            if disable_fp4_allgather() and not self.experts.enable_alltoall:
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
27 changes: 1 addition & 26 deletions tensorrt_llm/_torch/models/modeling_qwen3_moe.py
@@ -6,8 +6,6 @@
 from tqdm import tqdm
 from transformers import Qwen3MoeConfig
 
-from tensorrt_llm._mnnvl_utils import MnnvlMemory
-
 from ..attention_backend import AttentionMetadata
 from ..distributed import (AllReduce, AllReduceFusionOp, AllReduceParams,
                            allgather)
@@ -91,10 +89,6 @@ def __init__(
         self.mapping = model_config.mapping
         self.allreduce = AllReduce(mapping=model_config.mapping,
                                    strategy=model_config.allreduce_strategy)
-        self.enable_alltoall = Qwen3MoE.should_enable_alltoall(
-            model_config, self.top_k)
-        if self.enable_alltoall:
-            MnnvlMemory.initialize()
 
         self.gate = Qwen3Gate(
             hidden_size=self.hidden_dim,
@@ -117,25 +111,6 @@
             model_config=model_config,
         )
 
-    @staticmethod
-    def should_enable_alltoall(model_config: ModelConfig, top_k: int) -> bool:
-        if not model_config.mapping.enable_attention_dp:
-            return False
-
-        if model_config.mapping.tp_size == 1:
-            return False
-
-        if not MnnvlMemory.supports_mnnvl():
-            return False
-
-        if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
-            return False
-
-        if model_config.mapping.moe_ep_size <= top_k:
-            return False
-
-        return True
-
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -151,7 +126,7 @@ def forward(
         if self.enable_attention_dp and self.mapping.tp_size > 1:
             # FP4 all_gather moves this bf16 allgather in to after topk and fp4 quantization
             # to reduce allreduce BW
-            if disable_fp4_allgather() and not self.enable_alltoall:
+            if disable_fp4_allgather() and not self.experts.enable_alltoall:
                 hidden_states = allgather(hidden_states,
                                           self.mapping,
                                           dim=0,
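
Note: both model files now read the decision from self.experts.enable_alltoall instead of computing should_enable_alltoall themselves, so the check presumably moves into the fused-MoE module that create_moe builds (next file). A sketch of that decision, reusing the exact conditions the removed static methods applied; where the MoE class stores the result is an assumption of this sketch, and per the commit list the real implementation also switches between several alltoall backends (e.g. DeepEP).

import os


def moe_should_enable_alltoall(mapping, top_k: int, supports_mnnvl: bool) -> bool:
    """Mirror of the checks deleted from Deepseekv3MoE/Qwen3MoE; placement inside
    the fused-MoE module is assumed, not shown by this diff."""
    if not mapping.enable_attention_dp:
        return False
    if mapping.tp_size == 1:
        return False
    if not supports_mnnvl:
        return False
    if os.environ.get("TRTLLM_MOE_DISABLE_ALLTOALLV", "0") == "1":
        return False
    if mapping.moe_ep_size <= top_k:
        return False
    return True
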
4 changes: 0 additions & 4 deletions tensorrt_llm/_torch/modules/fused_moe/create_moe.py
@@ -52,7 +52,6 @@ def create_moe(
     aux_stream: Optional[torch.cuda.Stream] = None,
     weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.VANILLA,
     apply_router_weight_on_input: bool = False,
-    enable_alltoall: bool = False,
     layer_idx: Optional[int] = None,
 ) -> MoE:
     moe_cls = get_moe_cls(model_config, override_quant_config)
@@ -63,7 +62,6 @@
 
     if moe_cls == TRTLLMGenFusedMoE:
         assert not apply_router_weight_on_input, "apply_router_weight_on_input is not supported in TRTLLMGenFusedMoE."
-        assert not enable_alltoall, "enable_alltoall is not supported in TRTLLMGenFusedMoE."
 
         return moe_cls(
             routing_method=routing_method,
@@ -88,12 +86,10 @@
             aux_stream=aux_stream,
             weight_loading_mode=weight_loading_mode,
             apply_router_weight_on_input=apply_router_weight_on_input,
-            enable_alltoall=enable_alltoall,
             layer_idx=layer_idx,
         )
     elif moe_cls == VanillaMoE:
         assert not apply_router_weight_on_input, "apply_router_weight_on_input is not supported in VanillaMoE."
-        assert not enable_alltoall, "enable_alltoall is not supported in VanillaMoE."
 
         return moe_cls(
             routing_method=routing_method,