 
 from tensorrt_llm._torch import LLM
 from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
-from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
-                                 MTPDecodingConfig, NGramDecodingConfig,
-                                 SamplingParams, TorchCompileConfig)
+from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
+                                 KvCacheConfig, MTPDecodingConfig,
+                                 NGramDecodingConfig, SamplingParams,
+                                 TorchCompileConfig)
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
@@ -389,11 +390,15 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
                                (8, 1, 8)],
                               ids=["tp8", "tp8ep4", "tp8ep8"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
-        with LLM(self.MODEL_PATH,
-                 tensor_parallel_size=tp_size,
-                 pipeline_parallel_size=pp_size,
-                 moe_expert_parallel_size=ep_size,
-                 use_cuda_graph=cuda_graph) as llm:
+        with LLM(
+                self.MODEL_PATH,
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
@@ -411,11 +416,15 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
                                (8, 1, 8)],
                               ids=["tp8", "tp8ep4", "tp8ep8"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
-        with LLM(self.MODEL_PATH,
-                 tensor_parallel_size=tp_size,
-                 pipeline_parallel_size=pp_size,
-                 moe_expert_parallel_size=ep_size,
-                 use_cuda_graph=cuda_graph) as llm:
+        with LLM(
+                self.MODEL_PATH,
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
             task = GSM8K(self.MODEL_NAME)
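
For reference, a minimal standalone sketch of the API migration these hunks apply: the boolean use_cuda_graph argument is replaced by cuda_graph_config, which takes a CudaGraphConfig instance (to enable CUDA graphs) or None (to disable them), and max_seq_len is capped to bound warmup memory. The checkpoint path and prompt below are placeholders, not taken from this diff.

from tensorrt_llm._torch import LLM
from tensorrt_llm.llmapi import CudaGraphConfig

# Placeholder checkpoint path, for illustration only.
MODEL_PATH = "/models/Llama-4-Scout-17B-16E-Instruct"
enable_cuda_graph = True

# Before this change: LLM(MODEL_PATH, use_cuda_graph=enable_cuda_graph)
# After: pass a CudaGraphConfig (or None), as the updated tests do.
with LLM(
        MODEL_PATH,
        tensor_parallel_size=8,
        # Bound warmup memory, mirroring the max_seq_len the tests add.
        max_seq_len=8192,
        cuda_graph_config=CudaGraphConfig()
        if enable_cuda_graph else None) as llm:
    outputs = llm.generate(["The capital of France is"])
    print(outputs[0].outputs[0].text)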