diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
index 94e082a6670..2a8e1c30ea8 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
@@ -2,6 +2,8 @@
 
 import torch
 
+from tensorrt_llm._utils import get_sm_version
+
 from ...distributed.ops import reducescatter
 from ...model_config import ModelConfig
 from ...utils import Fp4QuantizedTensor
@@ -68,6 +70,11 @@ def __init__(
             weight_loading_mode=weight_loading_mode,
         )
 
+        sm_version = get_sm_version()
+        if sm_version >= 120:
+            raise NotImplementedError(
+                "TRTLLMGenFusedMoE does not support SM120 and above.")
+
         assert not self.smart_router, "Smart router is not supported in TRTLLMGenFusedMoE."
 
         self.num_slots = self.num_experts
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 5112907c209..42e6fe0f7db 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -1334,6 +1334,11 @@ def test_nvfp4_4gpus_online_eplb(self, fp8kv):
     @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM"])
     def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
                    torch_compile, mtp_nextn, moe_backend):
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
@@ -1381,8 +1386,10 @@ def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
                          torch_compile, mtp_nextn, moe_backend):
         if torch_compile and pp_size > 1:
             pytest.skip("PP with torch.compile is not supported yet.")
-        if moe_backend == "TRTLLM" and get_sm_version() == 120:
-            pytest.skip("MOE TRTLLM backend does not support SM version 120")
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
         # Picewise Cuda Graph cannot be enabled for nvfp4 attention dp.
         torch_compile_config = TorchCompileConfig(
@@ -1597,6 +1604,11 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
     def test_nvfp4_multi_gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
                               fp8kv, attention_dp, cuda_graph,
                               overlap_scheduler, max_batch_size, moe_backend):
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.70)
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
@@ -2153,6 +2165,11 @@ def test_nvfp4(
         torch_compile,
     ):
 
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         torch_compile_config = TorchCompileConfig(
             enable_fullgraph=True,
             enable_piecewise_cuda_graph=cuda_graph and not attention_dp,
@@ -2273,6 +2290,11 @@ def test_fp8(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
 
     def test_nvfp4(self, tp_size, pp_size, ep_size, attention_dp, cuda_graph,
                    overlap_scheduler, moe_backend, eagle3):
+        if moe_backend == "TRTLLM" and (get_sm_version() == 120
+                                        or get_sm_version() == 121):
+            pytest.skip(
+                "MOE TRTLLM backend does not support SM version 120 or 121")
+
         pytorch_config = dict(
             disable_overlap_scheduler=not overlap_scheduler,
             cuda_graph_config=CudaGraphConfig() if cuda_graph else None,
diff --git a/tests/integration/test_lists/qa/llm_function_rtx6kd.txt b/tests/integration/test_lists/qa/llm_function_rtx6kd.txt
index fbabac6b84f..d30cd857dea 100644
--- a/tests/integration/test_lists/qa/llm_function_rtx6kd.txt
+++ b/tests/integration/test_lists/qa/llm_function_rtx6kd.txt
@@ -24,8 +24,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=False]
-accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm-torch_compile=True]
 test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]