
Commit 97d4b18

modified the test params
Signed-off-by: Eran Geva <[email protected]>
1 parent 4daae98 commit 97d4b18

File tree

tensorrt_llm/llmapi/llm.py
tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py

2 files changed: +4 -4 lines changed

tensorrt_llm/llmapi/llm.py

Lines changed: 2 additions & 2 deletions
@@ -932,8 +932,8 @@ def _build_model(self):
             max_batch_size=max_batch_size,
             max_num_tokens=max_num_tokens,
             gather_generation_logits=self.args.gather_generation_logits,
-            fail_fast_on_attention_window_too_large=getattr(
-                self.args, 'fail_fast_on_attention_window_too_large', False),
+            # fail_fast_on_attention_window_too_large=getattr(
+            #     self.args, 'fail_fast_on_attention_window_too_large', False),
             **kwargs)

         if self.args.kv_cache_config is not None:
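The llm.py change disables the fail-fast kwarg rather than deleting it, so _build_model now falls back to whatever default the underlying executor declares for fail_fast_on_attention_window_too_large. A minimal sketch of the getattr pattern that was commented out (the SimpleNamespace stand-in for self.args is an assumption for illustration, not TensorRT-LLM code):

from types import SimpleNamespace

# An args object that may or may not carry the flag.
args = SimpleNamespace()

# getattr with a default returns False instead of raising AttributeError
# when the attribute is missing, so the kwarg was always passed explicitly.
flag = getattr(args, "fail_fast_on_attention_window_too_large", False)
print(flag)  # False

# With the lines commented out, the kwarg is omitted entirely and the
# callee's own default takes effect instead.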

tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_trtllm_bench.py

Lines changed: 2 additions & 2 deletions
@@ -448,7 +448,7 @@ def print_kv_cache_metrics(kv_cache_metrics):
 def trtllm_bench_unified_comparison(
     llm_root,  # noqa: F811
     comparison_mode="backend",
-    free_mem_ratio=0.5,
+    free_mem_ratio=0.1,
     num_hidden_layers=2,
     max_batch_size=32,  # below this value the kv cache resizing is skipped
     golden_tokens_per_sec=1400,
@@ -483,7 +483,7 @@ def trtllm_bench_unified_comparison(
         yaml.dump(
             {
                 "model_kwargs": {"num_hidden_layers": num_hidden_layers},
-                # "cuda_graph_batch_sizes": [1, 2],
+                "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32],
                 "compile_backend": "torch-opt",
                 "free_mem_ratio": free_mem_ratio,
                 "runtime": "trtllm",
