diff --git a/tests/integration/defs/accuracy/test_cli_flow.py b/tests/integration/defs/accuracy/test_cli_flow.py
index 0dab161a8b8..85b168d4535 100644
--- a/tests/integration/defs/accuracy/test_cli_flow.py
+++ b/tests/integration/defs/accuracy/test_cli_flow.py
@@ -987,7 +987,8 @@ def test_tp2(self):
     @pytest.mark.parametrize(
         "moe_tp_size", [1, 4, 8],
         ids=['expert_parallel', 'mixed_parallel', 'tensor_parallel'])
-    def test_ootb_except_mha_tp8(self, moe_tp_size):
+    def test_ootb_except_mha_tp8(self, moe_tp_size, mocker):
+        mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 1)
         self.run(tp_size=8,
                  extra_convert_args=[
                      f"--moe_tp_size={moe_tp_size}",
diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index d97f518616e..8e64ffacf87 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -113,6 +113,7 @@ class TestMixtral8x7B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"
     MODEL_PATH = f"{llm_models_root()}/Mixtral-8x7B-v0.1"

+    @pytest.mark.skip_less_device_memory(80000)
     @pytest.mark.skip_less_device(2)
     def test_tp2(self):
         with LLM(self.MODEL_PATH, tensor_parallel_size=2) as llm:
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 4b09d0d94d5..3a0781802a0 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -970,6 +970,7 @@ class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):
     MODEL_PATH = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1"

     @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_less_device_memory(80000)
     def test_auto_dtype_tp2(self):
         with LLM(self.MODEL_PATH, tensor_parallel_size=2) as llm:
             task = MMLU(self.MODEL_NAME)
diff --git a/tests/integration/defs/examples/test_eagle.py b/tests/integration/defs/examples/test_eagle.py
index a83be7556ae..985de7dc095 100644
--- a/tests/integration/defs/examples/test_eagle.py
+++ b/tests/integration/defs/examples/test_eagle.py
@@ -304,6 +304,7 @@ def test_mistral_eagle_1gpu(llm_mistral_model_root,


 @skip_pre_ada
+@pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("use_dynamic_tree", [False, True],
                          ids=['eagle1', 'eagle2'])
 @pytest.mark.parametrize("mistral_nemo_model_root", ['Mistral-Nemo-12b-Base'],
diff --git a/tests/integration/defs/examples/test_multimodal.py b/tests/integration/defs/examples/test_multimodal.py
index 7e8628340bd..eb88267780b 100644
--- a/tests/integration/defs/examples/test_multimodal.py
+++ b/tests/integration/defs/examples/test_multimodal.py
@@ -81,6 +81,8 @@ def _test_llm_multimodal_general(llm_venv,

     if "neva-22b" in tllm_model_name and get_device_memory() < 80000:
         pytest.skip("GPU memory is insufficient.")
+    if "Mistral-Small" in tllm_model_name and get_device_memory() < 80000:
+        pytest.skip("GPU memory is insufficient.")

     print("Converting huggingface model into binary format...")
     # ckpt from llm_models/ --> cmodels//
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index 17de97de744..59b64ad2db4 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1318,7 +1318,6 @@ def test_ptp_quickstart(llm_root, llm_venv):
     ("Llama3.2-11B-BF16", "llama-3.2-models/Llama-3.2-11B-Vision"),
     ("Nemotron4_4B-BF16", "nemotron/Minitron-4B-Base"),
     ("Nemotron-H-8B", "Nemotron-H-8B-Base-8K"),
-    ("Qwen3-30B-A3B", "Qwen3/Qwen3-30B-A3B"),
     pytest.param('Llama3.1-8B-NVFP4',
                  'nvfp4-quantized/Meta-Llama-3.1-8B',
                  marks=skip_pre_blackwell),
@@ -1343,6 +1342,9 @@ def test_ptp_quickstart(llm_root, llm_venv):
     pytest.param('Mixtral-8x7B-FP8',
                  'Mixtral-8x7B-Instruct-v0.1-fp8',
                  marks=skip_pre_blackwell),
+    pytest.param('Qwen3-30B-A3B',
+                 'Qwen3/Qwen3-30B-A3B',
+                 marks=pytest.mark.skip_less_device_memory(80000)),
 ])
 def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
     print(f"Testing {model_name}.")
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index df349636472..d4583e06469 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -420,7 +420,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mt
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5294983)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5239087)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5239087)
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5234002)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[-] SKIP (https://nvbugs/5234002)
 examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-27b-it-fp8-bfloat16-8] SKIP (https://nvbugs/5234164)
 full::GH200/examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only] SKIP (https://nvbugs/5250460)