From c2f9cd630e30032bf23a5aac01f5cbb0a640418f Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Tue, 26 Aug 2025 08:30:00 +0000
Subject: [PATCH 01/14] add TestQwQ_32B

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index cce443bf7ba..dbcbd7680ac 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2845,3 +2845,27 @@ def test_auto_dtype(self):
                  kv_cache_config=self.kv_cache_config) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.skip_less_host_memory(100000)
+class TestQwQ_32B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen/QwQ-32B"
+    MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
+
+    # NOTE: according to Sampling Parameters section
+    sampling_params = SamplingParams(
+        temperature=0.6,
+        top_p=0.95,
+        top_k=30,
+        presence_penalty=1.0,
+    )
+
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=16384,
+                 kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
\ No newline at end of file

From 3e3d3a48b2b17fbbb0776a55a55abf1a11e53309 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Tue, 26 Aug 2025 11:40:53 +0000
Subject: [PATCH 02/14] add reference

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/references/cnn_dailymail.yaml | 2 ++
 tests/integration/defs/accuracy/references/mmlu.yaml          | 2 ++
 tests/integration/defs/accuracy/test_llm_api_pytorch.py       | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index dbf2be50f39..392987af33f 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -322,6 +322,8 @@ Qwen/Qwen2.5-7B-Instruct:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 33.248
+Qwen/QwQ-32B:
+  - accuracy: 0.0
 nvidia/Nemotron-Mini-4B-Instruct:
   - quant_algo: FP8
     accuracy: 25.247
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 05816c0613d..953434983b8 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -141,6 +141,8 @@ Qwen/Qwen2.5-7B-Instruct:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 75.32
+Qwen/QwQ-32B:
+  - accuracy: 0.0
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 71.40
   - quant_algo: NVFP4
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index dbcbd7680ac..64c70645dc8 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2868,4 +2868,4 @@ def test_auto_dtype(self):
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm, sampling_params=self.sampling_params)
\ No newline at end of file
+            task.evaluate(llm)
\ No newline at end of file

From 243809ce8b3b75bfc511786530fbbccf680fbe4d Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 03:02:48 +0000
Subject: [PATCH 03/14] add test_auto_dtype_tp2

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/references/mmlu.yaml        |  2 +-
 .../defs/accuracy/test_llm_api_pytorch.py     | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 953434983b8..8d84ebda779 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -142,7 +142,7 @@ Qwen/Qwen2.5-7B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 75.32
 Qwen/QwQ-32B:
-  - accuracy: 0.0
+  - accuracy: 82.60
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 71.40
   - quant_algo: NVFP4
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 64c70645dc8..c7c1c6ced3f 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2859,13 +2859,26 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
         presence_penalty=1.0,
     )
 
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
-
     def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=self.kv_cache_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device(2)
+    def test_auto_dtype_tp2(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
+
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=16384,
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=2,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
\ No newline at end of file

From 1b6410e9009ebbe82b17ded036581b50452699a0 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 03:09:14 +0000
Subject: [PATCH 04/14] fix stale self.kv_cache_config reference

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index c7c1c6ced3f..3f8a45cd23b 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2863,7 +2863,7 @@ def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
-                 kv_cache_config=self.kv_cache_config) as llm:
+                 kv_cache_config=kv_cache_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)

From 4082ad627c2f42e0b4b29b10814a531e80c8cd14 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 03:15:19 +0000
Subject: [PATCH 05/14] lower free_gpu_memory_fraction to 0.9

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 3f8a45cd23b..55596279efa 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2860,7 +2860,7 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     )
 
     def test_auto_dtype(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=kv_cache_config) as llm:
@@ -2871,7 +2871,7 @@ def test_auto_dtype(self):
 
     @pytest.mark.skip_less_device(2)
     def test_auto_dtype_tp2(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,

From 1595670a3be64b32f7345fe3190488a935e989aa Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 04:54:05 +0000
Subject: [PATCH 06/14] adjust KV cache fraction to 0.5

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 55596279efa..c133309b2e7 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2860,18 +2860,19 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     )
 
     def test_auto_dtype(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=kv_cache_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+            print("end cnndaily\n")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
     @pytest.mark.skip_less_device(2)
     def test_auto_dtype_tp2(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
@@ -2880,5 +2881,6 @@ def test_auto_dtype_tp2(self):
                  max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+            print("end cnndaily\n")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
\ No newline at end of file

From 8e927ddac0a3b78feab7d5dfddb1b602551ef3d8 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 05:11:35 +0000
Subject: [PATCH 07/14] add cnn_dailymail accuracy reference

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/references/cnn_dailymail.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 392987af33f..a9ad9a5da81 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -323,7 +323,7 @@ Qwen/Qwen2.5-7B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 33.248
 Qwen/QwQ-32B:
-  - accuracy: 0.0
+  - accuracy: 30.358
 nvidia/Nemotron-Mini-4B-Instruct:
   - quant_algo: FP8
     accuracy: 25.247

From 5ef70e99220739c482bb34d4f355180f15328a0f Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 05:59:40 +0000
Subject: [PATCH 08/14] add tp4 and tp8

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 37 +++++++++++--------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index c133309b2e7..a5d2910bc5d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2846,41 +2846,48 @@ def test_auto_dtype(self):
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
 @pytest.mark.skip_less_device_memory(80000)
-@pytest.mark.skip_less_host_memory(100000)
 class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
-    # NOTE: according to Sampling Parameters section
-    sampling_params = SamplingParams(
-        temperature=0.6,
-        top_p=0.95,
-        top_k=30,
-        presence_penalty=1.0,
-    )
+    @pytest.mark.skip_less_device(2)
+    def test_auto_dtype_tp2(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
-    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=16384,
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=2,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device(4)
+    def test_auto_dtype_tp4(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
-                 kv_cache_config=kv_cache_config) as llm:
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=4,
+                 max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            print("end cnndaily\n")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
-    @pytest.mark.skip_less_device(2)
-    def test_auto_dtype_tp2(self):
+    @pytest.mark.skip_less_device(8)
+    def test_auto_dtype_tp8(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=2,
+                 tensor_parallel_size=8,
                  max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            print("end cnndaily\n")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
\ No newline at end of file

From 57a90dba88118c34e6516a9eb833042800d25731 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 06:28:47 +0000
Subject: [PATCH 09/14] add tp parameter

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index a5d2910bc5d..de4ae68b94f 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2850,6 +2850,24 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
+    @pytest.mark.parametrize(
+        "tp_size", [8, 4, 2],
+        ids=["tp8", "tp4", "tp2"])
+    def test_auto_dtype(self, tp_size):
+        if get_device_count() != tp_size:
+            pytest.skip("Device count mismatch with world size")
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=16384,
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(2)
     def test_auto_dtype_tp2(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)

From f9c5f9ba57b527bd5c8ba5d099c8423be92d6e3d Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 06:37:48 +0000
Subject: [PATCH 10/14] remove per-TP tests superseded by the parametrized test

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 42 -------------------
 1 file changed, 42 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index de4ae68b94f..56987069513 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2866,46 +2866,4 @@ def test_auto_dtype(self, tp_size):
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    @pytest.mark.skip_less_device(2)
-    def test_auto_dtype_tp2(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-
-        with LLM(self.MODEL_PATH,
-                 max_num_tokens=16384,
-                 kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=2,
-                 max_batch_size=8) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    @pytest.mark.skip_less_device(4)
-    def test_auto_dtype_tp4(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-
-        with LLM(self.MODEL_PATH,
-                 max_num_tokens=16384,
-                 kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=4,
-                 max_batch_size=8) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    @pytest.mark.skip_less_device(8)
-    def test_auto_dtype_tp8(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-
-        with LLM(self.MODEL_PATH,
-                 max_num_tokens=16384,
-                 kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=8,
-                 max_batch_size=8) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
\ No newline at end of file

From 92bf1859270e03e0c0f922955394e9ef7e7c035e Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 06:53:26 +0000
Subject: [PATCH 11/14] clean up formatting and restore trailing newline

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 56987069513..fb3389b587a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2845,14 +2845,14 @@ def test_auto_dtype(self):
                  kv_cache_config=self.kv_cache_config) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
 @pytest.mark.skip_less_device_memory(80000)
 class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
-    @pytest.mark.parametrize(
-        "tp_size", [8, 4, 2],
-        ids=["tp8", "tp4", "tp2"])
+    @pytest.mark.parametrize("tp_size", [8, 4, 2], ids=["tp8", "tp4", "tp2"])
     def test_auto_dtype(self, tp_size):
         if get_device_count() != tp_size:
             pytest.skip("Device count mismatch with world size")
@@ -2866,4 +2866,4 @@ def test_auto_dtype(self, tp_size):
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
\ No newline at end of file
+            task.evaluate(llm)

From 86d702a2ca22c4e101256b4dc6aa9c36abb4e85d Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 09:37:11 +0000
Subject: [PATCH 12/14] test only tensor_parallel_size=4 and add to QA list

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 9 ++++-----
 tests/integration/test_lists/qa/llm_function_nim.txt    | 1 +
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index fb3389b587a..1791b02ca5a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2852,16 +2852,15 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
-    @pytest.mark.parametrize("tp_size", [8, 4, 2], ids=["tp8", "tp4", "tp2"])
-    def test_auto_dtype(self, tp_size):
-        if get_device_count() != tp_size:
-            pytest.skip("Device count mismatch with world size")
+    @pytest.mark.skip_less_device_memory(320000)
+    @pytest.mark.skip_less_device(4)
+    def test_auto_dtype_tp4(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=tp_size,
+                 tensor_parallel_size=4,
                  max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
index 90b6406806b..71bd82043b9 100644
--- a/tests/integration/test_lists/qa/llm_function_nim.txt
+++ b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -21,6 +21,7 @@ accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cu
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4
 accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
 accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8
 accuracy/test_llm_api.py::TestCodestral_22B_V01::test_auto_dtype

From 641c3f4acdccb7c58dab451a7ca49dfeefd30faa Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 10:06:53 +0000
Subject: [PATCH 13/14] remove redundant pytest skip

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 1791b02ca5a..fe1282578d8 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2847,7 +2847,6 @@ def test_auto_dtype(self):
             task.evaluate(llm, sampling_params=self.sampling_params)
 
 
-@pytest.mark.skip_less_device_memory(80000)
 class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"

From 4fd4966c5aadf8d5928408705f2ae84fe9fed83c Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 10:44:34 +0000
Subject: [PATCH 14/14] decrease skip_less_device_memory

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index fe1282578d8..8573cf1f726 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2851,7 +2851,7 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
-    @pytest.mark.skip_less_device_memory(320000)
+    @pytest.mark.skip_less_device_memory(80000)
     @pytest.mark.skip_less_device(4)
     def test_auto_dtype_tp4(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
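
Reviewer note: applying patches 01-14 in order leaves the following net addition in tests/integration/defs/accuracy/test_llm_api_pytorch.py. This is a sketch reconstructed from the hunks above, assuming pytest, LLM, KvCacheConfig, llm_models_root, CnnDailymail, MMLU, and LlmapiAccuracyTestHarness are already imported in that file, as the surrounding diff context implies:

class TestQwQ_32B(LlmapiAccuracyTestHarness):
    MODEL_NAME = "Qwen/QwQ-32B"
    MODEL_PATH = f"{llm_models_root()}/QwQ-32B"

    @pytest.mark.skip_less_device_memory(80000)
    @pytest.mark.skip_less_device(4)
    def test_auto_dtype_tp4(self):
        # Give only half of the free GPU memory to the KV cache so the
        # bf16 32B weights and activations fit alongside it; the series
        # walked this value down from 1 to 0.5 across patches 03-06.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)

        with LLM(self.MODEL_PATH,
                 max_num_tokens=16384,
                 kv_cache_config=kv_cache_config,
                 tensor_parallel_size=4,
                 max_batch_size=8) as llm:
            # Scores are checked against the references added earlier in
            # the series: cnn_dailymail 30.358 (patch 07) and mmlu 82.60
            # (patch 03) for Qwen/QwQ-32B.
            task = CnnDailymail(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

Patch 12 also registers the test as accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 in tests/integration/test_lists/qa/llm_function_nim.txt, so it runs under that QA list on 4-GPU machines that pass the skip_less_device and skip_less_device_memory gates.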