233 changes: 107 additions & 126 deletions tests/integration/defs/agg_unit_mem_df.csv

Large diffs are not rendered by default.

5 changes: 3 additions & 2 deletions tests/integration/defs/test_unittests.py
@@ -13,6 +13,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import warnings
 from subprocess import CalledProcessError
 
 from defs.conftest import tests_path
@@ -100,8 +101,8 @@ def test_unittests_v2(llm_root, llm_venv, case: str, output_dir, request):
         num_workers = parallel_dict[cur_key]
         num_workers = min(num_workers, 8)
     else:
-        print(
-            f'unittest {case} on "{gpu_name}" is not recorded in parallel config. Need to profile.'
+        warnings.warn(
+            f'Cannot find parallel config entry for unittest {case} on "{gpu_name}". Fallback to serial test. Please add config entry to agg_unit_mem_df.csv.'
         )
 
     num_workers = max(1, num_workers)
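For context, a minimal sketch (not the repository's code) of the lookup-and-fallback behaviour this change adjusts: when a test/GPU pair has no entry in agg_unit_mem_df.csv, the runner now emits a warning and falls back to a single worker instead of printing a profiling note. The key shape and helper name below are illustrative assumptions.

import warnings


def resolve_num_workers(parallel_dict: dict, case: str, gpu_name: str) -> int:
    # Hypothetical key shape; the real lookup key in test_unittests.py may differ.
    cur_key = (case, gpu_name)
    if cur_key in parallel_dict:
        num_workers = min(parallel_dict[cur_key], 8)
    else:
        warnings.warn(
            f'Cannot find parallel config entry for unittest {case} on '
            f'"{gpu_name}". Fallback to serial test.')
        num_workers = 1
    return max(1, num_workers)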
16 changes: 16 additions & 0 deletions tests/integration/defs/trt_test_alternative.py
@@ -93,16 +93,32 @@ def cleanup_process_tree(p: subprocess.Popen,
         time.sleep(5)
 
     lines = []
+    torch_inductors = []
+
     for pid in sorted(target_pids):
         try:
             sp = psutil.Process(pid)
             if verbose_message:
                 cmdline = sp.cmdline()
+
+                # Detect repetitive torch inductor worker processes
+                if len(cmdline) > 3 and \
+                   'python' in cmdline[0] and \
+                   'torch/_inductor/compile_worker/__main__.py' in cmdline[1] and \
+                   '--pickler=torch._inductor.compile_worker.subproc_pool.SubprocPickler' == cmdline[2]:
+                    torch_inductors.append(pid)
+                    continue
+
                 lines.append(f"{pid}: {cmdline}")
             persist_pids.append(pid)
         except psutil.Error:
             pass
 
+    if torch_inductors:
+        lines.append(
+            f"{len(torch_inductors)}*torch inductor workers: {torch_inductors}"
+        )
+
     if persist_pids:
         msg = f"Found leftover subprocesses: {persist_pids} launched by {p.args}"
         if verbose_message:
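The detection above keys off the distinctive command line of torch's inductor compile workers. A standalone sketch of the same idea, assuming only psutil; the PID list and the summary formatting are illustrative, not the repository's API:

import psutil


def find_inductor_workers(pids):
    # Group torch inductor compile-worker PIDs by inspecting their command lines.
    workers = []
    for pid in pids:
        try:
            cmdline = psutil.Process(pid).cmdline()
        except psutil.Error:
            continue
        if len(cmdline) > 1 and 'python' in cmdline[0] and \
                'torch/_inductor/compile_worker/__main__.py' in cmdline[1]:
            workers.append(pid)
    return workers


# Example: summarize the workers instead of listing each one in the leftover report.
# workers = find_inductor_workers(target_pids)
# print(f"{len(workers)}*torch inductor workers: {workers}")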
2 changes: 1 addition & 1 deletion tests/unittest/_torch/speculative/test_draft_target.py
@@ -28,7 +28,7 @@ def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str):
 
     max_batch_size = 2
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=False, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
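This and the following speculative-decoding tests switch the KV cache from a fraction of free GPU memory to a fixed token budget, which keeps the cache footprint stable regardless of how much memory happens to be free on the test GPU. A hedged sketch of the two configuration styles; the import path is an assumption based on the LLM API these tests use:

from tensorrt_llm.llmapi import KvCacheConfig

# Before: the cache is sized from whatever GPU memory is free at setup time,
# so the footprint varies between machines and between co-located tests.
fraction_based = KvCacheConfig(enable_block_reuse=False,
                               free_gpu_memory_fraction=0.5)

# After: a fixed budget of 8192 cached tokens, as in the tests in this PR.
token_based = KvCacheConfig(enable_block_reuse=False, max_tokens=8192)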
3 changes: 1 addition & 2 deletions tests/unittest/_torch/speculative/test_dynamic_spec_decode.py
@@ -26,8 +26,7 @@ def test_dynamic_spec_decode():
 
     max_batch_size = 1
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=True,
-                                    free_gpu_memory_fraction=0.5)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(batch_sizes=[1])
 
     llm_common_config = dict(
4 changes: 2 additions & 2 deletions tests/unittest/_torch/speculative/test_eagle3.py
@@ -46,7 +46,7 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
     max_batch_size = 1
     max_draft_len = 4
     kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
-                                    free_gpu_memory_fraction=0.5)
+                                    max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
@@ -190,7 +190,7 @@ def test_deepseek_eagle3():
     max_batch_size = 16
     max_draft_len = 3
     kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
-                                    free_gpu_memory_fraction=0.5)
+                                    max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
3 changes: 1 addition & 2 deletions tests/unittest/_torch/speculative/test_kv_cache_reuse.py
@@ -32,8 +32,7 @@ def test_kv_cache_reuse(use_cuda_graph: bool, attn_backend: str):
     # that ref and spec does not match 100%
     max_batch_size = 1
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=True,
-                                    free_gpu_memory_fraction=0.5)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
2 changes: 1 addition & 1 deletion tests/unittest/_torch/speculative/test_ngram.py
@@ -27,7 +27,7 @@ def test_llama_ngram(disable_overlap_scheduler: bool, use_cuda_graph: bool,
 
     max_batch_size = 2
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=False, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
2 changes: 1 addition & 1 deletion tests/unittest/_torch/speculative/test_user_provided.py
@@ -28,7 +28,7 @@ def test_llama_user_provided(disable_overlap_scheduler: bool,
 
     max_batch_size = 2
     max_draft_len = 4
-    kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+    kv_cache_config = KvCacheConfig(enable_block_reuse=False, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
 
30 changes: 29 additions & 1 deletion tests/unittest/conftest.py
@@ -13,7 +13,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # # Force resource release after test
-
+import os
+import sys
 import traceback
 from typing import Any
 
@@ -100,6 +101,33 @@ def pytest_sessionstart(session):
     import torch._inductor.async_compile  # noqa: F401
 
 
+@pytest.fixture(autouse=True)
+def cuda_error_early_quit(capfd):
+    """
+    Fixture to handle CUDA errors.
+
+    CUDA errors are usually persistent and require a process restart to recover.
+    Immediately stop the current worker when a CUDA error occurs; it will then
+    be restarted by the master process.
+    """
+    if torch.cuda.is_available() and os.environ.get("PYTEST_XDIST_WORKER",
+                                                    None):
+        try:
+            yield
+            torch.cuda.synchronize()
+        except RuntimeError as e:
+            msg = str(e)
+            if 'CUDA error:' in msg:
+                with capfd.disabled():
+                    traceback.print_exception(e, file=sys.stderr)
+                    print("CUDA Error occurred, worker must quit now",
+                          file=sys.stderr)
+                    os._exit(1)
+            raise
+    else:
+        yield
+
+
 @pytest.fixture(autouse=True)
 def torch_empty_cache() -> None:
     """
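The new cuda_error_early_quit fixture only arms itself when PYTEST_XDIST_WORKER is set, i.e. when the suite runs under pytest-xdist, whose master process manages the workers. A hypothetical illustration (not part of the PR) of invoking the tests that way; the test path and worker count are assumptions:

import pytest

if __name__ == "__main__":
    # "-n 4" (pytest-xdist) spawns 4 workers and sets PYTEST_XDIST_WORKER in each,
    # so a worker hitting a persistent CUDA error exits via os._exit(1); per the
    # fixture's docstring, the master process then restarts it and the remaining
    # tests continue.
    raise SystemExit(pytest.main(["-n", "4", "tests/unittest"]))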