5 changes: 4 additions & 1 deletion tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -759,13 +759,16 @@ def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(
             enable_block_reuse=False,
             enable_partial_reuse=False,
+            free_gpu_memory_fraction=0.5,
         )
         # We use FlashInfer as the attention backend for Gemma3 VLM to support custom mask for images.
         # So, testing with it here.
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
                  attn_backend="FLASHINFER",
-                 cuda_graph_config=None) as llm:
+                 cuda_graph_config=None,
+                 max_batch_size=128,
+                 max_seq_len=4096) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
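For readers unfamiliar with the PyTorch-backend LLM API used in this test, the added arguments fit together roughly as below. This is a minimal sketch, not the test code: the import paths, the checkpoint path, and the generate() call are assumptions added for illustration.

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Cap the KV cache at half of the free GPU memory and disable block reuse,
# mirroring the settings added in the hunk above.
kv_cache_config = KvCacheConfig(
    enable_block_reuse=False,
    enable_partial_reuse=False,
    free_gpu_memory_fraction=0.5,
)

# FlashInfer is used so the Gemma3 VLM custom image mask is supported;
# max_batch_size and max_seq_len bound the engine's memory footprint.
with LLM("/path/to/gemma-3-27b-it",      # hypothetical local checkpoint path
         kv_cache_config=kv_cache_config,
         attn_backend="FLASHINFER",
         cuda_graph_config=None,
         max_batch_size=128,
         max_seq_len=4096) as llm:
    outputs = llm.generate(["Summarize the article in one sentence."])
    print(outputs[0].outputs[0].text)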
15 changes: 12 additions & 3 deletions tests/integration/defs/test_e2e.py
@@ -2232,7 +2232,8 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
                  marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param("gemma-3-27b-it",
                  "gemma/gemma-3-27b-it",
-                 marks=pytest.mark.skip_less_device_memory(80000)),
+                 marks=(skip_post_blackwell,
+                        pytest.mark.skip_less_device_memory(80000))),
 ])
 def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
                                    modality, use_cuda_graph):
@@ -2368,6 +2369,8 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         cmd.append("--image_format=pil")
         cmd.append("--attention_backend=FLASHINFER")
         cmd.append("--disable_kv_cache_reuse")
+        cmd.append("--kv_cache_fraction=0.5")
+        cmd.append("--max_seq_len=1024")

     output = llm_venv.run_cmd(cmd, caller=check_output)

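Collected in one place, the Gemma 3 branch of this test now assembles its flag list roughly as follows. The script name and model directory are placeholders, not values taken from the diff; the real test composes them from llm_root and the parametrized model_path.

# Sketch of the final flag list for the gemma-3-27b-it case.
cmd = [
    "quickstart_multimodal.py",                 # placeholder for the example script
    "--model_dir", "/path/to/gemma-3-27b-it",   # placeholder checkpoint location
    "--modality", "image",
    "--image_format=pil",
    "--attention_backend=FLASHINFER",
    "--disable_kv_cache_reuse",
    # New in this change: limit KV-cache memory and the maximum sequence length.
    "--kv_cache_fraction=0.5",
    "--max_seq_len=1024",
]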
@@ -2528,9 +2531,10 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
 @pytest.mark.skip_less_device(2)
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("model_name,model_path", [
-    ("gemma-3-27b-it", "gemma/gemma-3-27b-it"),
     ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"),
     ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"),
+    pytest.param(
+        "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell),
 ])
 def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
                                         model_path):
@@ -2602,6 +2606,8 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
         cmd.append("--image_format=pil")
         cmd.append("--attention_backend=FLASHINFER")
         cmd.append("--disable_kv_cache_reuse")
+        cmd.append("--kv_cache_fraction=0.5")
+        cmd.append("--max_seq_len=1024")
     elif model_name == "Phi-4-multimodal-instruct":
         # Set max_seq_len to 4096 to use short rope factor.
         cmd.append("--max_seq_len=4096")
@@ -2630,9 +2636,10 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,

 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("model_name,model_path", [
-    ("gemma-3-27b-it", "gemma/gemma-3-27b-it"),
     ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"),
     ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"),
+    pytest.param(
+        "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell),
 ])
 def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
                                              model_path):
@@ -2698,6 +2705,8 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
         cmd.append("--image_format=pil")
         cmd.append("--attention_backend=FLASHINFER")
         cmd.append("--disable_kv_cache_reuse")
+        cmd.append("--kv_cache_fraction=0.5")
+        cmd.append("--max_seq_len=1024")
     elif model_name == "Phi-4-multimodal-instruct":
         # Set max_seq_len to 4096 to use short rope factor.
         cmd.append("--max_seq_len=4096")
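The parametrization changes above rely on pytest's ability to attach marks to a single parameter set via pytest.param(..., marks=...), where marks accepts either one mark or a collection of marks. A minimal, self-contained sketch of the pattern; skip_post_blackwell here is a stand-in skipif mark, not the project's real definition, and skip_less_device_memory is a project-specific mark that plain pytest would only warn about.

import pytest

# Stand-in for the mark the real test module imports; the actual condition
# checks the GPU architecture rather than a constant.
skip_post_blackwell = pytest.mark.skipif(
    False, reason="not supported on post-Blackwell GPUs")

@pytest.mark.parametrize("model_name,model_path", [
    ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"),
    # Passing a tuple of marks applies every mark in it to this one parameter set.
    pytest.param("gemma-3-27b-it",
                 "gemma/gemma-3-27b-it",
                 marks=(skip_post_blackwell,
                        pytest.mark.skip_less_device_memory(80000))),
])
def test_marks_example(model_name, model_path):
    assert model_path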
1 change: 0 additions & 1 deletion tests/integration/test_lists/waives.txt
@@ -281,7 +281,6 @@ full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5359696)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
-accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype[/scratch.trt_llm_data/llm-models/Mistral-Small-3.1-24B-Instruct-2503-fp8-FP8] SKIP (https://nvbugs/5462007)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5410391)
 accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197)
 accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl SKIP (https://nvbugs/5413362)