From 78d1d3ce49ff2b41d4a82637b1066eb3ac34c5f8 Mon Sep 17 00:00:00 2001 From: Pengyun Lin <81065165+LinPoly@users.noreply.github.com> Date: Thu, 26 Jun 2025 14:44:58 +0000 Subject: [PATCH 1/4] Remove untouched branch Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 5a8c68643ee..42fb2183215 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -530,8 +530,6 @@ def _check_arguments(self, prompt_len: int, query_len: int, f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead." ) return - elif self.args.backend == "_autodeploy": - return build_config = self.args.build_config From ae5248db360e3ff0f6a6787077f8b847688c4af5 Mon Sep 17 00:00:00 2001 From: Pengyun Lin <81065165+LinPoly@users.noreply.github.com> Date: Fri, 27 Jun 2025 07:27:09 +0000 Subject: [PATCH 2/4] Enhance _check_arguments to filter illegal requests Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com> --- tensorrt_llm/llmapi/llm.py | 8 +++++++- tests/unittest/llmapi/test_llm.py | 9 ++++++--- tests/unittest/llmapi/test_llm_multi_gpu.py | 5 +++-- 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index 42fb2183215..355d17c65bd 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -529,6 +529,12 @@ def _check_arguments(self, prompt_len: int, query_len: int, raise ValueError( f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead." ) + # Check prompt length and query length against max_num_tokens to filter illegal requests. 
+            max_num_tokens = self.args.max_num_tokens
+            if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens:
+                raise ValueError(
+                    f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}) and query length ({query_len}) should not exceed "
+                    f"max_num_tokens ({max_num_tokens})")
             return
 
         build_config = self.args.build_config
@@ -545,7 +551,7 @@ def _check_arguments(self, prompt_len: int, query_len: int,
                 (sampling_params.max_tokens or 0) > max_seq_len):
             raise ValueError(
                 f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}) and query length ({query_len}) max_tokens ({sampling_params.max_tokens}) should not exceed "
-                f"max_seq_len ({build_config.max_seq_len})")
+                f"max_seq_len ({max_seq_len})")
 
         if sampling_params.use_beam_search and sampling_params.best_of > build_config.max_beam_width:
             if sampling_params.n == sampling_params.best_of:
diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py
index 043c13c22f3..79bb8d4cfce 100644
--- a/tests/unittest/llmapi/test_llm.py
+++ b/tests/unittest/llmapi/test_llm.py
@@ -2061,13 +2061,15 @@ def success_path():
     success_path()
 
 
-def _test_llm_capture_request_error(tp_size: int = 1):
+def _test_llm_capture_request_error(backend: Optional[str], tp_size: int = 1):
     build_config = BuildConfig()
     build_config.max_num_tokens = 64
 
     llm = LLM(
         model=llama_model_path,
         build_config=build_config,
+        tensor_parallel_size=tp_size,
+        backend=backend,
         fast_build=True,
     )
 
@@ -2077,8 +2079,9 @@ def _test_llm_capture_request_error(tp_size: int = 1):
         llm.generate(prompt)
 
 
-def test_llm_capture_request_error():
-    _test_llm_capture_request_error(tp_size=1)
+@pytest.mark.parametrize('backend', [None, 'pytorch'])
+def test_llm_capture_request_error(backend: Optional[str]):
+    _test_llm_capture_request_error(backend=backend, tp_size=1)
 
 
 def test_llm_api_jupyter_scenario():
diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py
index f14b358f63a..f3812f0cd1a 100644
--- a/tests/unittest/llmapi/test_llm_multi_gpu.py
+++ b/tests/unittest/llmapi/test_llm_multi_gpu.py
@@ -451,8 +451,9 @@ def test_llm_get_stats_async_tp2(pytorch_backend):
     llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=pytorch_backend)
 
 
-def test_llm_capture_request_error():
-    _test_llm_capture_request_error(tp_size=2)
+@pytest.mark.parametrize('backend', [None, 'pytorch'])
+def test_llm_capture_request_error(backend: Optional[str]):
+    _test_llm_capture_request_error(backend=backend, tp_size=2)
 
 
 def test_llm_with_postprocess_parallel_tp2():

From 41eece1d9e680d6e8072ca66bba289bbd134d865 Mon Sep 17 00:00:00 2001
From: Pengyun Lin <81065165+LinPoly@users.noreply.github.com>
Date: Fri, 27 Jun 2025 11:16:51 +0000
Subject: [PATCH 3/4] Fix chunked prefill failure

Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com>
---
 tensorrt_llm/llmapi/llm.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index 355d17c65bd..979452d45fc 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -530,11 +530,12 @@ def _check_arguments(self, prompt_len: int, query_len: int,
                     f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
                )
            # Check prompt length and query length against max_num_tokens to filter illegal requests.
-            max_num_tokens = self.args.max_num_tokens
-            if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens:
-                raise ValueError(
-                    f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}) and query length ({query_len}) should not exceed "
-                    f"max_num_tokens ({max_num_tokens})")
+            if not self.args.enable_chunked_prefill:
+                max_num_tokens = self.args.max_num_tokens
+                if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens:
+                    raise ValueError(
+                        f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}) and query length ({query_len}) should not exceed "
+                        f"max_num_tokens ({max_num_tokens})")
             return
 
         build_config = self.args.build_config

From a0362837b36c6b08498eb179569d919e51beeeb1 Mon Sep 17 00:00:00 2001
From: Pengyun Lin <81065165+LinPoly@users.noreply.github.com>
Date: Wed, 2 Jul 2025 07:02:07 +0000
Subject: [PATCH 4/4] Refactor test

Signed-off-by: Pengyun Lin <81065165+LinPoly@users.noreply.github.com>
---
 tensorrt_llm/llmapi/llm.py                    |  2 +-
 tests/unittest/llmapi/test_llm.py             | 36 ++++++++++++-------
 tests/unittest/llmapi/test_llm_multi_gpu.py   |  5 ++-
 .../llmapi/test_llm_multi_gpu_pytorch.py      |  7 +++-
 tests/unittest/llmapi/test_llm_pytorch.py     | 15 ++++----
 5 files changed, 41 insertions(+), 24 deletions(-)

diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py
index 979452d45fc..1a590aaebad 100644
--- a/tensorrt_llm/llmapi/llm.py
+++ b/tensorrt_llm/llmapi/llm.py
@@ -530,7 +530,7 @@ def _check_arguments(self, prompt_len: int, query_len: int,
                    f"PyTorch backend currently only supports `logprobs=1`. Received `logprobs={sampling_params.logprobs}` (Top{sampling_params.logprobs} logprobs). Please set `logprobs=1` in `sampling_params` instead."
                )
            # Check prompt length and query length against max_num_tokens to filter illegal requests.
- if not self.args.enable_chunked_prefill: + if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill: max_num_tokens = self.args.max_num_tokens if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens: raise ValueError( diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index 79bb8d4cfce..0631496ab17 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -2061,27 +2061,37 @@ def success_path(): success_path() -def _test_llm_capture_request_error(backend: Optional[str], tp_size: int = 1): - build_config = BuildConfig() - build_config.max_num_tokens = 64 +def _test_llm_capture_request_error(pytorch_backend: bool, tp_size: int = 1): + llm_args_extra = {} + if pytorch_backend: + from tensorrt_llm._torch import LLM as LLM_torch + LLM_CLASS = LLM_torch + llm_args_extra["max_num_tokens"] = 64 + else: + LLM_CLASS = LLM + build_config = BuildConfig() + build_config.max_num_tokens = 64 + llm_args_extra["fast_build"] = True + llm_args_extra["build_config"] = build_config - llm = LLM( + llm = LLM_CLASS( model=llama_model_path, - build_config=build_config, tensor_parallel_size=tp_size, - backend=backend, - fast_build=True, + **llm_args_extra, ) prompt = 'A ' * 65 # the minimum max_num_tokens is 64 - - with pytest.raises(RequestError): - llm.generate(prompt) + if pytorch_backend: + # pytorch backend will raise ValueError for max_num_tokens + with pytest.raises(ValueError): + llm.generate(prompt) + else: + with pytest.raises(RequestError): + llm.generate(prompt) -@pytest.mark.parametrize('backend', [None, 'pytorch']) -def test_llm_capture_request_error(backend: Optional[str]): - _test_llm_capture_request_error(backend=backend, tp_size=1) +def test_llm_capture_request_error(): + _test_llm_capture_request_error(pytorch_backend=False, tp_size=1) def test_llm_api_jupyter_scenario(): diff --git a/tests/unittest/llmapi/test_llm_multi_gpu.py b/tests/unittest/llmapi/test_llm_multi_gpu.py index f3812f0cd1a..55b9d6f4c25 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu.py @@ -451,9 +451,8 @@ def test_llm_get_stats_async_tp2(pytorch_backend): llm_get_stats_async_test_harness(tp_size=2, pytorch_backend=pytorch_backend) -@pytest.mark.parametrize('backend', [None, 'pytorch']) -def test_llm_capture_request_error(backend: Optional[str]): - _test_llm_capture_request_error(backend=backend, tp_size=2) +def test_llm_capture_request_error(): + _test_llm_capture_request_error(pytorch_backend=False, tp_size=2) def test_llm_with_postprocess_parallel_tp2(): diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py index 8dc1450f339..55ba1927eea 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py @@ -5,12 +5,17 @@ from tensorrt_llm.llmapi import KvCacheConfig from .test_llm_pytorch import (llama_v2_13b_lora_test_harness, llama_7b_multi_lora_test_harness) - +from .test_llm import _test_llm_capture_request_error # isort: on global_kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4) +@pytest.mark.gpu2 +def test_llm_capture_request_error(): + _test_llm_capture_request_error(pytorch_backend=True, tp_size=2) + + @pytest.mark.gpu4 def test_tinyllama_logits_processor_tp2pp2(): tinyllama_logits_processor_test_harness(backend="pytorch", diff --git a/tests/unittest/llmapi/test_llm_pytorch.py 
b/tests/unittest/llmapi/test_llm_pytorch.py index 65f3d16ac69..411ccfb8158 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -4,12 +4,11 @@ from tensorrt_llm.sampling_params import SamplingParams # isort: off -from .test_llm import (get_model_path, global_kvcache_config, llama_model_path, - llm_get_stats_async_test_harness, - llm_get_stats_test_harness, prompts, - run_llm_abort_request, - run_llm_with_postprocess_parallel_and_result_handler, - tinyllama_logits_processor_test_harness) +from .test_llm import ( + get_model_path, global_kvcache_config, llama_model_path, + llm_get_stats_async_test_harness, llm_get_stats_test_harness, prompts, + run_llm_abort_request, run_llm_with_postprocess_parallel_and_result_handler, + tinyllama_logits_processor_test_harness, _test_llm_capture_request_error) from utils.util import force_ampere, similar, skip_gpu_memory_less_than_40gb, skip_gpu_memory_less_than_80gb, skip_gpu_memory_less_than_138gb from utils.llm_data import llm_models_root from tensorrt_llm.lora_manager import LoraConfig @@ -64,6 +63,10 @@ def test_llm_get_stats_async(return_context_logits, use_overlap, enable_iter_req_stats=enable_iter_req_stats) +def test_llm_capture_request_error(): + _test_llm_capture_request_error(pytorch_backend=True, tp_size=1) + + @force_ampere @pytest.mark.parametrize( "sampling_params",
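A minimal usage sketch of the behavior introduced by these patches (not part of the patch series itself): with the PyTorch backend and chunked prefill disabled, a prompt longer than max_num_tokens is rejected by _check_arguments with a ValueError before the request reaches the executor, mirroring _test_llm_capture_request_error(pytorch_backend=True) above. The checkpoint path is a placeholder, and the assumption that 'A ' * 65 tokenizes to more than 64 tokens is taken from the test.

# Illustrative sketch only; model_path is a placeholder, not a real checkpoint.
import pytest

from tensorrt_llm._torch import LLM as LLM_torch


def demo_max_num_tokens_guard(model_path: str = "/path/to/llama-checkpoint"):
    # max_num_tokens caps how many tokens the engine schedules per iteration;
    # the new guard in _check_arguments rejects oversized prompts up front.
    llm = LLM_torch(model=model_path, max_num_tokens=64)

    prompt = "A " * 65  # assumed to tokenize to more than 64 tokens, as in the test
    with pytest.raises(ValueError):
        llm.generate(prompt)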