
Commit 9cc4e5d

yizhang-nvdc3671 authored and committed
[nvbugs/5336321][fix] Enable attention dp = False test case, Fix TRTLLM Gen Moe workspace allocation (#5463)
Signed-off-by: Yi Zhang <[email protected]>
Signed-off-by: yizhan <[email protected]>
1 parent e5e87ec commit 9cc4e5d

File tree

2 files changed: +2 -2 lines changed

cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp

Lines changed: 1 addition & 0 deletions

@@ -86,6 +86,7 @@ std::vector<torch::Tensor> run_fp4_block_scale_moe_runner(torch::Tensor const& r
     TORCH_CHECK(num_experts % 4 == 0, "Routing kernel expects that num_experts must be divisible by 4");
     TORCH_CHECK(num_experts > top_k, "num_experts must be greater than top_k");
+    TORCH_CHECK(num_experts <= 256, "num_experts must be less than or equal to 256");
 
     tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::MoERunnerArgs args;
     tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::MoE::MoEWorkspace workspace;
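The added check is the workspace-allocation side of the fix: together with the existing divisibility and top_k checks, it rejects expert counts the routing kernel and its workspace buffers are not prepared for. Below is a minimal standalone sketch of that validation, assuming a hypothetical helper name (validate_moe_routing_config is not a function in the repository) and assuming that 256 is the upper bound the workspace is sized for; it is not the actual TRT-LLM code path.

// Minimal sketch of the validation around the new bound; assumes PyTorch's
// C++ extension headers for TORCH_CHECK and a hypothetical helper name.
#include <torch/extension.h>

void validate_moe_routing_config(int64_t num_experts, int64_t top_k)
{
    // Existing check: the routing kernel processes experts in groups of 4.
    TORCH_CHECK(num_experts % 4 == 0,
                "Routing kernel expects that num_experts must be divisible by 4");
    // Existing check: top_k experts are selected per token, so num_experts must exceed top_k.
    TORCH_CHECK(num_experts > top_k, "num_experts must be greater than top_k");
    // Check added by this commit (assumption: workspace buffers are sized for at most 256 experts).
    TORCH_CHECK(num_experts <= 256, "num_experts must be less than or equal to 256");
}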

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 1 addition & 2 deletions

@@ -1088,8 +1088,7 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
             pytest.skip("https://nvbugs/5252313")
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        if not attention_dp and (tp_size > 1 or ep_size > 1):
-            pytest.skip("https://nvbugs/5336321")
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
         torch_compile_config = TorchCompileConfig(
