Skip to content

Commit 955f151

Browse files
committed
[nvbug/5341178][fix] Fix OOM in Llama 4 accuracy test
Signed-off-by: Balaram Buddharaju <[email protected]>
1 parent 8a8d2e9 commit 955f151

File tree

1 file changed

+22
-13
lines changed

1 file changed

+22
-13
lines changed

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 22 additions & 13 deletions
Original file line number · Diff line number · Diff line change
@@ -16,9 +16,10 @@
1616

1717
from tensorrt_llm._torch import LLM
1818
from tensorrt_llm._torch.pyexecutor.config import MoeLoadBalancerConfig
19-
from tensorrt_llm.llmapi import (EagleDecodingConfig, KvCacheConfig,
20-
MTPDecodingConfig, NGramDecodingConfig,
21-
SamplingParams, TorchCompileConfig)
19+
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
20+
KvCacheConfig, MTPDecodingConfig,
21+
NGramDecodingConfig, SamplingParams,
22+
TorchCompileConfig)
2223
from tensorrt_llm.models.modeling_utils import QuantConfig
2324
from tensorrt_llm.quantization import QuantAlgo
2425

@@ -389,11 +390,15 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
389390
(8, 1, 8)],
390391
ids=["tp8", "tp8ep4", "tp8ep8"])
391392
def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
392-
with LLM(self.MODEL_PATH,
393-
tensor_parallel_size=tp_size,
394-
pipeline_parallel_size=pp_size,
395-
moe_expert_parallel_size=ep_size,
396-
use_cuda_graph=cuda_graph) as llm:
393+
with LLM(
394+
self.MODEL_PATH,
395+
tensor_parallel_size=tp_size,
396+
# Keep this low to avoid warmup OOM in CI
397+
max_seq_len=8192,
398+
pipeline_parallel_size=pp_size,
399+
moe_expert_parallel_size=ep_size,
400+
cuda_graph_config=CudaGraphConfig()
401+
if cuda_graph else None) as llm:
397402
task = MMLU(self.MODEL_NAME)
398403
task.evaluate(llm)
399404
task = GSM8K(self.MODEL_NAME)
@@ -411,11 +416,15 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
411416
(8, 1, 8)],
412417
ids=["tp8", "tp8ep4", "tp8ep8"])
413418
def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
414-
with LLM(self.MODEL_PATH,
415-
tensor_parallel_size=tp_size,
416-
pipeline_parallel_size=pp_size,
417-
moe_expert_parallel_size=ep_size,
418-
use_cuda_graph=cuda_graph) as llm:
419+
with LLM(
420+
self.MODEL_PATH,
421+
tensor_parallel_size=tp_size,
422+
# Keep this low to avoid warmup OOM in CI
423+
max_seq_len=8192,
424+
pipeline_parallel_size=pp_size,
425+
moe_expert_parallel_size=ep_size,
426+
cuda_graph_config=CudaGraphConfig()
427+
if cuda_graph else None) as llm:
419428
task = MMLU(self.MODEL_NAME)
420429
task.evaluate(llm)
421430
task = GSM8K(self.MODEL_NAME)

0 commit comments

Comments (0)