3 changes: 3 additions & 0 deletions examples/ep_load_balancer/README.md
@@ -27,6 +27,7 @@ Run 32-way expert parallelism inference on the prepared dataset. Please refer to
 cat > ./extra_llm_api_options.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
+moe_backend: WideEP
 moe_max_num_tokens: 8192
 EOF

@@ -116,6 +117,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
+moe_backend: WideEP
 moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF
@@ -181,6 +183,7 @@ Run 36-way expert parallelism inference with the EPLB configuration incorporated
 cat > ./extra_llm_api_options_eplb.yaml <<EOF
 enable_attention_dp: true
 use_cuda_graph: true
+moe_backend: WideEP
 moe_max_num_tokens: 9216
 moe_load_balancer: ./moe_load_balancer.yaml
 EOF
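All three README hunks add the same `moe_backend: WideEP` key to the extra LLM API options file produced by the heredoc. A quick way to sanity-check such a generated file is to load it back with PyYAML; this is a minimal sketch, and the "expected" keys below simply mirror the first hunk rather than any official schema:

```python
import yaml  # PyYAML

# Load the file written by the README's heredoc and confirm the keys from the
# diff above are present. The key/value set checked here is illustrative only.
with open("./extra_llm_api_options.yaml") as f:
    opts = yaml.safe_load(f)

assert opts["moe_backend"] == "WideEP"
assert opts["enable_attention_dp"] is True
assert opts["use_cuda_graph"] is True
assert opts["moe_max_num_tokens"] == 8192
print("extra_llm_api_options.yaml matches the WideEP example")
```

The same check applies to the EPLB variants, with `moe_max_num_tokens: 9216` and the additional `moe_load_balancer` path.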
8 changes: 4 additions & 4 deletions tensorrt_llm/_torch/modules/fused_moe/fused_moe_wide_ep.py
@@ -196,7 +196,7 @@ def __init__(
                 hidden_size, self.num_slots)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         # If True, the router weight will be multiplied on the input rather than at the end of FC2
@@ -433,7 +433,7 @@ def forward_chunk(
                 token_selected_slots, dtype=token_final_scales.dtype)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         x_sf = None
@@ -552,7 +552,7 @@ def forward_chunk(
             )
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         if self.enable_alltoall:
@@ -631,7 +631,7 @@ def forward_chunk(
                 deep_ep_topk_weights, deep_ep_handle)
         else:
             raise NotImplementedError(
-                f"Not available alltoall method type: {alltoall_method_type!r}"
+                f"Not available alltoall method type: {self.alltoall_method_type!r}"
             )

         if self.layer_load_balancer and not self.layer_load_balancer.is_static_routing(
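The four replacements in `fused_moe_wide_ep.py` are the same fix: the selected all-to-all method is stored on the instance as `self.alltoall_method_type`, and a bare `alltoall_method_type` is evidently not in scope at these sites, so formatting it into the message would raise a `NameError` instead of the intended `NotImplementedError`. A minimal sketch of the corrected pattern follows; the enum values and class are illustrative stand-ins, not the real TensorRT-LLM definitions:

```python
from enum import Enum, auto


class AlltoallMethodType(Enum):
    # Illustrative stand-in; the real enum lives in TensorRT-LLM.
    NotEnabled = auto()
    MNNVL = auto()


class WideEPSketch:
    def __init__(self, alltoall_method_type: AlltoallMethodType):
        # Mirrors the pattern in the diff: the selected method lives on the instance.
        self.alltoall_method_type = alltoall_method_type

    def forward_chunk(self) -> str:
        if self.alltoall_method_type == AlltoallMethodType.NotEnabled:
            return "fallback path without all-to-all"
        # A bare `alltoall_method_type` is not a local name here, so using it in
        # the f-string would raise NameError; `self.` keeps the message accurate.
        raise NotImplementedError(
            f"Not available alltoall method type: {self.alltoall_method_type!r}")


try:
    WideEPSketch(AlltoallMethodType.MNNVL).forward_chunk()
except NotImplementedError as err:
    print(err)  # Not available alltoall method type: <AlltoallMethodType.MNNVL: 2>
```

With the fix, the unsupported branch reports the configured method type cleanly instead of failing while building the error message.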
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -441,4 +441,3 @@ accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_auto_dtype[mtp_
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=vanilla-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5354946)
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5354936)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus_static_eplb SKIP (https://nvbugs/5354925)