diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 22d04b26145..4098e2565b3 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -759,13 +759,16 @@ def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(
             enable_block_reuse=False,
             enable_partial_reuse=False,
+            free_gpu_memory_fraction=0.5,
         )
         # We use FlashInfer as the attention backend for Gemma3 VLM to support custom mask for images.
         # So, testing with it here.
         with LLM(self.MODEL_PATH,
                  kv_cache_config=kv_cache_config,
                  attn_backend="FLASHINFER",
-                 cuda_graph_config=None) as llm:
+                 cuda_graph_config=None,
+                 max_batch_size=128,
+                 max_seq_len=4096) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index bda61ce177b..0d47cc7c6f7 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -2232,7 +2232,8 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
                  marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param("gemma-3-27b-it",
                  "gemma/gemma-3-27b-it",
-                 marks=pytest.mark.skip_less_device_memory(80000)),
+                 marks=(skip_post_blackwell,
+                        pytest.mark.skip_less_device_memory(80000))),
 ])
 def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
                                    modality, use_cuda_graph):
@@ -2368,6 +2369,8 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         cmd.append("--image_format=pil")
         cmd.append("--attention_backend=FLASHINFER")
         cmd.append("--disable_kv_cache_reuse")
+        cmd.append("--kv_cache_fraction=0.5")
+        cmd.append("--max_seq_len=1024")
 
     output = llm_venv.run_cmd(cmd, caller=check_output)
 
@@ -2528,9 +2531,10 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
 @pytest.mark.skip_less_device(2)
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("model_name,model_path", [
-    ("gemma-3-27b-it", "gemma/gemma-3-27b-it"),
     ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"),
     ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"),
+    pytest.param(
+        "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell),
 ])
 def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
                                         model_path):
@@ -2602,6 +2606,8 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
         cmd.append("--image_format=pil")
         cmd.append("--attention_backend=FLASHINFER")
         cmd.append("--disable_kv_cache_reuse")
+        cmd.append("--kv_cache_fraction=0.5")
+        cmd.append("--max_seq_len=1024")
     elif model_name == "Phi-4-multimodal-instruct":
         # Set max_seq_len to 4096 to use short rope factor.
         cmd.append("--max_seq_len=4096")
@@ -2630,9 +2636,10 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
 
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("model_name,model_path", [
-    ("gemma-3-27b-it", "gemma/gemma-3-27b-it"),
     ("mistral-small-3.1-24b-instruct", "Mistral-Small-3.1-24B-Instruct-2503"),
     ("Phi-4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"),
+    pytest.param(
+        "gemma-3-27b-it", "gemma/gemma-3-27b-it", marks=skip_post_blackwell),
 ])
 def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
                                              model_path):
@@ -2698,6 +2705,8 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
         cmd.append("--image_format=pil")
         cmd.append("--attention_backend=FLASHINFER")
         cmd.append("--disable_kv_cache_reuse")
+        cmd.append("--kv_cache_fraction=0.5")
+        cmd.append("--max_seq_len=1024")
     elif model_name == "Phi-4-multimodal-instruct":
         # Set max_seq_len to 4096 to use short rope factor.
         cmd.append("--max_seq_len=4096")
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index ad1efc24b40..8c87ead0af0 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -281,7 +281,6 @@ full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5359696)
 full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
-accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype[/scratch.trt_llm_data/llm-models/Mistral-Small-3.1-24B-Instruct-2503-fp8-FP8] SKIP (https://nvbugs/5462007)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5410391)
 accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8 SKIP (https://nvbugs/5413197)
 accuracy/test_cli_flow.py::TestLlama3_8BInstructGradient1048k::test_long_context_ppl SKIP (https://nvbugs/5413362)