From c2f9cd630e30032bf23a5aac01f5cbb0a640418f Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Tue, 26 Aug 2025 08:30:00 +0000
Subject: [PATCH 01/14] add TestQwQ_32B

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 24 +++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index cce443bf7ba..dbcbd7680ac 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2845,3 +2845,27 @@ def test_auto_dtype(self):
                  kv_cache_config=self.kv_cache_config) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+@pytest.mark.skip_less_device_memory(80000)
+@pytest.mark.skip_less_host_memory(100000)
+class TestQwQ_32B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen/QwQ-32B"
+    MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
+
+    # NOTE: according to Sampling Parameters section
+    sampling_params = SamplingParams(
+        temperature=0.6,
+        top_p=0.95,
+        top_k=30,
+        presence_penalty=1.0,
+    )
+
+    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+
+    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=16384,
+                 kv_cache_config=self.kv_cache_config) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
\ No newline at end of file

From 3e3d3a48b2b17fbbb0776a55a55abf1a11e53309 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Tue, 26 Aug 2025 11:40:53 +0000
Subject: [PATCH 02/14] add reference

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/references/cnn_dailymail.yaml | 2 ++
 tests/integration/defs/accuracy/references/mmlu.yaml          | 2 ++
 tests/integration/defs/accuracy/test_llm_api_pytorch.py       | 2 +-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index dbf2be50f39..392987af33f 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -322,6 +322,8 @@ Qwen/Qwen2.5-7B-Instruct:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 33.248
+Qwen/QwQ-32B:
+  - accuracy: 0.0
 nvidia/Nemotron-Mini-4B-Instruct:
   - quant_algo: FP8
     accuracy: 25.247
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 05816c0613d..953434983b8 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -141,6 +141,8 @@ Qwen/Qwen2.5-7B-Instruct:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 75.32
+Qwen/QwQ-32B:
+  - accuracy: 0.0
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 71.40
   - quant_algo: NVFP4
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index dbcbd7680ac..64c70645dc8 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2868,4 +2868,4 @@ def test_auto_dtype(self):
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm, sampling_params=self.sampling_params)
\ No newline at end of file
+            task.evaluate(llm)
\ No newline at end of file

From 243809ce8b3b75bfc511786530fbbccf680fbe4d Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 03:02:48 +0000
Subject: [PATCH 03/14] add test_auto_dtype_tp2

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/references/mmlu.yaml        |  2 +-
 .../defs/accuracy/test_llm_api_pytorch.py     | 17 +++++++++++++++--
 2 files changed, 16 insertions(+), 3 deletions(-)

diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 953434983b8..8d84ebda779 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -142,7 +142,7 @@ Qwen/Qwen2.5-7B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 75.32
 Qwen/QwQ-32B:
-  - accuracy: 0.0
+  - accuracy: 82.60
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 71.40
   - quant_algo: NVFP4
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 64c70645dc8..c7c1c6ced3f 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2859,13 +2859,26 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
         presence_penalty=1.0,
     )
 
-    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
-
     def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=self.kv_cache_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device(2)
+    def test_auto_dtype_tp2(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
+
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=16384,
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=2,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
\ No newline at end of file

From 1b6410e9009ebbe82b17ded036581b50452699a0 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 03:09:14 +0000
Subject: [PATCH 04/14] fix stale self.kv_cache_config reference

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index c7c1c6ced3f..3f8a45cd23b 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2863,7 +2863,7 @@ def test_auto_dtype(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
-                 kv_cache_config=self.kv_cache_config) as llm:
+                 kv_cache_config=kv_cache_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)

From 4082ad627c2f42e0b4b29b10814a531e80c8cd14 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 03:15:19 +0000
Subject: [PATCH 05/14] lower free_gpu_memory_fraction to 0.9

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 3f8a45cd23b..55596279efa 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2860,7 +2860,7 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     )
 
     def test_auto_dtype(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=kv_cache_config) as llm:
@@ -2871,7 +2871,7 @@ def test_auto_dtype(self):
 
     @pytest.mark.skip_less_device(2)
     def test_auto_dtype_tp2(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=1)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,

From 1595670a3be64b32f7345fe3190488a935e989aa Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 04:54:05 +0000
Subject: [PATCH 06/14] adjust KV cache fraction to 0.5

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 55596279efa..c133309b2e7 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2860,18 +2860,19 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     )
 
     def test_auto_dtype(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=kv_cache_config) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+            print("end cnndaily\n")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
     @pytest.mark.skip_less_device(2)
     def test_auto_dtype_tp2(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
@@ -2880,5 +2881,6 @@ def test_auto_dtype_tp2(self):
                  max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
+            print("end cnndaily\n")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
\ No newline at end of file

From 8e927ddac0a3b78feab7d5dfddb1b602551ef3d8 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 05:11:35 +0000
Subject: [PATCH 07/14] add cnn_dailymail accuracy reference

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/references/cnn_dailymail.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index 392987af33f..a9ad9a5da81 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -323,7 +323,7 @@ Qwen/Qwen2.5-7B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 33.248
 Qwen/QwQ-32B:
-  - accuracy: 0.0
+  - accuracy: 30.358
 nvidia/Nemotron-Mini-4B-Instruct:
   - quant_algo: FP8
     accuracy: 25.247

From 5ef70e99220739c482bb34d4f355180f15328a0f Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 05:59:40 +0000
Subject: [PATCH 08/14] add tp4 and tp8

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 37 +++++++++++--------
 1 file changed, 22 insertions(+), 15 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index c133309b2e7..a5d2910bc5d 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2846,41 +2846,48 @@ def test_auto_dtype(self):
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
 @pytest.mark.skip_less_device_memory(80000)
-@pytest.mark.skip_less_host_memory(100000)
 class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
-    # NOTE: according to Sampling Parameters section
-    sampling_params = SamplingParams(
-        temperature=0.6,
-        top_p=0.95,
-        top_k=30,
-        presence_penalty=1.0,
-    )
+    @pytest.mark.skip_less_device(2)
+    def test_auto_dtype_tp2(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
-    def test_auto_dtype(self):
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=16384,
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=2,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device(4)
+    def test_auto_dtype_tp4(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
-                 kv_cache_config=kv_cache_config) as llm:
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=4,
+                 max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            print("end cnndaily\n")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
-    @pytest.mark.skip_less_device(2)
-    def test_auto_dtype_tp2(self):
+    @pytest.mark.skip_less_device(8)
+    def test_auto_dtype_tp8(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=2,
+                 tensor_parallel_size=8,
                  max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
-            print("end cnndaily\n")
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
\ No newline at end of file

From 57a90dba88118c34e6516a9eb833042800d25731 Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 06:28:47 +0000
Subject: [PATCH 09/14] add tp parameter

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index a5d2910bc5d..de4ae68b94f 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2850,6 +2850,24 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
+    @pytest.mark.parametrize(
+        "tp_size", [8, 4, 2],
+        ids=["tp8", "tp4", "tp2"])
+    def test_auto_dtype(self, tp_size):
+        if get_device_count() != tp_size:
+            pytest.skip("Device count mismatch with world size")
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
+
+        with LLM(self.MODEL_PATH,
+                 max_num_tokens=16384,
+                 kv_cache_config=kv_cache_config,
+                 tensor_parallel_size=tp_size,
+                 max_batch_size=8) as llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_less_device(2)
     def test_auto_dtype_tp2(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)

From f9c5f9ba57b527bd5c8ba5d099c8423be92d6e3d Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 06:37:48 +0000
Subject: [PATCH 10/14] remove per-TP tests superseded by the parametrized test

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 .../defs/accuracy/test_llm_api_pytorch.py | 42 -------------------
 1 file changed, 42 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index de4ae68b94f..56987069513 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2866,46 +2866,4 @@ def test_auto_dtype(self, tp_size):
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    @pytest.mark.skip_less_device(2)
-    def test_auto_dtype_tp2(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-
-        with LLM(self.MODEL_PATH,
-                 max_num_tokens=16384,
-                 kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=2,
-                 max_batch_size=8) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    @pytest.mark.skip_less_device(4)
-    def test_auto_dtype_tp4(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-
-        with LLM(self.MODEL_PATH,
-                 max_num_tokens=16384,
-                 kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=4,
-                 max_batch_size=8) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
-
-    @pytest.mark.skip_less_device(8)
-    def test_auto_dtype_tp8(self):
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
-
-        with LLM(self.MODEL_PATH,
-                 max_num_tokens=16384,
-                 kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=8,
-                 max_batch_size=8) as llm:
-            task = CnnDailymail(self.MODEL_NAME)
-            task.evaluate(llm)
-            task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
\ No newline at end of file

From 92bf1859270e03e0c0f922955394e9ef7e7c035e Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 06:53:26 +0000
Subject: [PATCH 11/14] clean up formatting and restore trailing newline

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 56987069513..fb3389b587a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2845,14 +2845,14 @@ def test_auto_dtype(self):
                  kv_cache_config=self.kv_cache_config) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
 @pytest.mark.skip_less_device_memory(80000)
 class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
-    @pytest.mark.parametrize(
-        "tp_size", [8, 4, 2],
-        ids=["tp8", "tp4", "tp2"])
+    @pytest.mark.parametrize("tp_size", [8, 4, 2], ids=["tp8", "tp4", "tp2"])
     def test_auto_dtype(self, tp_size):
         if get_device_count() != tp_size:
             pytest.skip("Device count mismatch with world size")
@@ -2866,4 +2866,4 @@ def test_auto_dtype(self, tp_size):
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
-            task.evaluate(llm)
\ No newline at end of file
+            task.evaluate(llm)

From 86d702a2ca22c4e101256b4dc6aa9c36abb4e85d Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 09:37:11 +0000
Subject: [PATCH 12/14] test only tensor_parallel_size=4 and add to QA list

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 9 ++++-----
 tests/integration/test_lists/qa/llm_function_nim.txt    | 1 +
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index fb3389b587a..1791b02ca5a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2852,16 +2852,15 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
-    @pytest.mark.parametrize("tp_size", [8, 4, 2], ids=["tp8", "tp4", "tp2"])
-    def test_auto_dtype(self, tp_size):
-        if get_device_count() != tp_size:
-            pytest.skip("Device count mismatch with world size")
+    @pytest.mark.skip_less_device_memory(320000)
+    @pytest.mark.skip_less_device(4)
+    def test_auto_dtype_tp4(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
 
         with LLM(self.MODEL_PATH,
                  max_num_tokens=16384,
                  kv_cache_config=kv_cache_config,
-                 tensor_parallel_size=tp_size,
+                 tensor_parallel_size=4,
                  max_batch_size=8) as llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
index 90b6406806b..71bd82043b9 100644
--- a/tests/integration/test_lists/qa/llm_function_nim.txt
+++ b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -21,6 +21,7 @@ accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cu
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4
 accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
 accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8
 accuracy/test_llm_api.py::TestCodestral_22B_V01::test_auto_dtype

From 641c3f4acdccb7c58dab451a7ca49dfeefd30faa Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 10:06:53 +0000
Subject: [PATCH 13/14] remove redundant pytest skip

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 1791b02ca5a..fe1282578d8 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2847,7 +2847,6 @@ def test_auto_dtype(self):
             task.evaluate(llm, sampling_params=self.sampling_params)
 
 
-@pytest.mark.skip_less_device_memory(80000)
 class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"

From 4fd4966c5aadf8d5928408705f2ae84fe9fed83c Mon Sep 17 00:00:00 2001
From: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
Date: Wed, 27 Aug 2025 10:44:34 +0000
Subject: [PATCH 14/14] decrease skip_less_device_memory

Signed-off-by: Yaran Wu <28771492+aalanwyr@users.noreply.github.com>
---
 tests/integration/defs/accuracy/test_llm_api_pytorch.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index fe1282578d8..8573cf1f726 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -2851,7 +2851,7 @@ class TestQwQ_32B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen/QwQ-32B"
     MODEL_PATH = f"{llm_models_root()}/QwQ-32B"
 
-    @pytest.mark.skip_less_device_memory(320000)
+    @pytest.mark.skip_less_device_memory(80000)
     @pytest.mark.skip_less_device(4)
     def test_auto_dtype_tp4(self):
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)
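
Reviewer note: applying patches 01-14 in order leaves the following net addition in tests/integration/defs/accuracy/test_llm_api_pytorch.py. This is a sketch reconstructed from the hunks above, assuming pytest, LLM, KvCacheConfig, llm_models_root, CnnDailymail, MMLU, and LlmapiAccuracyTestHarness are already imported in that file, as the surrounding diff context implies:

class TestQwQ_32B(LlmapiAccuracyTestHarness):
    MODEL_NAME = "Qwen/QwQ-32B"
    MODEL_PATH = f"{llm_models_root()}/QwQ-32B"

    @pytest.mark.skip_less_device_memory(80000)
    @pytest.mark.skip_less_device(4)
    def test_auto_dtype_tp4(self):
        # Give only half of the free GPU memory to the KV cache so the
        # bf16 32B weights and activations fit alongside it; the series
        # walked this value down from 1 to 0.5 across patches 03-06.
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5)

        with LLM(self.MODEL_PATH,
                 max_num_tokens=16384,
                 kv_cache_config=kv_cache_config,
                 tensor_parallel_size=4,
                 max_batch_size=8) as llm:
            # Scores are checked against the references added earlier in
            # the series: cnn_dailymail 30.358 (patch 07) and mmlu 82.60
            # (patch 03) for Qwen/QwQ-32B.
            task = CnnDailymail(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

Patch 12 also registers the test as accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 in tests/integration/test_lists/qa/llm_function_nim.txt, so it runs under that QA list on 4-GPU machines that pass the skip_less_device and skip_less_device_memory gates.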