3 changes: 3 additions & 0 deletions examples/ep_load_balancer/README.md
@@ -27,6 +27,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
 cat > ./extra_llm_api_options.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
+moe_backend: WideEP
 moe_max_num_tokens: 8192
 EOF

@@ -116,6 +117,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
+moe_backend: WideEP
 moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF
@@ -181,6 +183,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
+moe_backend: WideEP
 moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF
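All three README hunks add the same `moe_backend: WideEP` key to the extra LLM API options file produced by the heredoc. A quick way to sanity-check such a generated file is to load it back with PyYAML; this is a minimal sketch, and the "expected" keys below simply mirror the first hunk rather than any official schema:

```python
import yaml  # PyYAML

# Load the file written by the README's heredoc and confirm the keys from the
# diff above are present. The key/value set checked here is illustrative only.
with open("./extra_llm_api_options.yaml") as f:
    opts = yaml.safe_load(f)

assert opts["moe_backend"] == "WideEP"
assert opts["enable_attention_dp"] is True
assert opts["use_cuda_graph"] is True
assert opts["moe_max_num_tokens"] == 8192
print("extra_llm_api_options.yaml matches the WideEP example")
```

The same check applies to the EPLB variants, with `moe_max_num_tokens: 9216` and the additional `moe_load_balancer` path.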
8 changes: 4 additions & 4 deletions tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
@@ -196,7 +196,7 @@ def __init__(
                 hidden_size, self.num_slots)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         # If True, the router weight will be multiplied on the input rather than at the end of FC2
@@ -433,7 +433,7 @@ def forward_chunk(
                 token_selected_slots, dtype=token_final_scales.dtype)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         x_sf = None
@@ -552,7 +552,7 @@ def forward_chunk(
             )
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         if self.enable_alltoall:
@@ -631,7 +631,7 @@ def forward_chunk(
                 deep_ep_topk_weights, deep_ep_handle)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         if self.layer_load_balancer and not self.layer_load_balancer.is_static_routing(
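The four replacements in `fused_moe_wide_ep.py` are the same fix: the selected all-to-all method is stored on the instance as `self.alltoall_method_type`, and a bare `alltoall_method_type` is evidently not in scope at these sites, so formatting it into the message would raise a `NameError` instead of the intended `NotImplementedError`. A minimal sketch of the corrected pattern follows; the enum values and class are illustrative stand-ins, not the real TensorRT-LLM definitions:

```python
from enum import Enum, auto


class AlltoallMethodType(Enum):
    # Illustrative stand-in; the real enum lives in TensorRT-LLM.
    NotEnabled = auto()
    MNNVL = auto()


class WideEPSketch:
    def __init__(self, alltoall_method_type: AlltoallMethodType):
        # Mirrors the pattern in the diff: the selected method lives on the instance.
        self.alltoall_method_type = alltoall_method_type

    def forward_chunk(self) -> str:
        if self.alltoall_method_type == AlltoallMethodType.NotEnabled:
            return "fallback path without all-to-all"
        # A bare `alltoall_method_type` is not a local name here, so using it in
        # the f-string would raise NameError; `self.` keeps the message accurate.
        raise NotImplementedError(
            f"Not available alltoall method type: {self.alltoall_method_type!r}")


try:
    WideEPSketch(AlltoallMethodType.MNNVL).forward_chunk()
except NotImplementedError as err:
    print(err)  # Not available alltoall method type: <AlltoallMethodType.MNNVL: 2>
```

With the fix, the unsupported branch reports the configured method type cleanly instead of failing while building the error message.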
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -441,4 +441,3 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5354946)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb SKIP (https://nvbugs/5354925)