
Commit bcb72e0

Auto-enable ngram with concurrency <= 32.

Signed-off-by: Simeng Liu <[email protected]>
1 parent: 9645814

4 files changed: +46 -8 lines

examples/llm-api/quickstart_advanced.py
Lines changed: 7 additions & 2 deletions

```diff
@@ -108,9 +108,9 @@ def add_llm_args(parser):
 
     # Speculative decoding
     parser.add_argument('--spec_decode_algo', type=str, default=None)
-    parser.add_argument('--spec_decode_max_draft_len', type=int, default=1)
+    parser.add_argument('--spec_decode_max_draft_len', type=int, default=0)
     parser.add_argument('--draft_model_dir', type=str, default=None)
-    parser.add_argument('--max_matching_ngram_size', type=int, default=5)
+    parser.add_argument('--max_matching_ngram_size', type=int, default=0)
     parser.add_argument('--use_one_model', default=False, action='store_true')
 
     # Relaxed acceptance
@@ -152,6 +152,11 @@ def setup_llm(args, **kwargs):
     spec_decode_algo = args.spec_decode_algo.upper(
     ) if args.spec_decode_algo is not None else None
 
+    # Update spec_decode_max_draft_len to 1 if unset by the user for a non-NGRAM spec_decode_algo.
+    # The NGRAM algorithm uses the default heuristic to set spec_decode_max_draft_len and max_matching_ngram_size.
+    if spec_decode_algo != "NGRAM" and args.spec_decode_max_draft_len == 0:
+        args.spec_decode_max_draft_len = 1
+
     if spec_decode_algo == 'MTP':
         if not args.use_one_model:
             print(
```
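Changing these CLI defaults from 1 and 5 to 0 turns 0 into a sentinel meaning "not set by the user": non-NGram algorithms immediately restore the old draft length of 1, while NGram leaves the 0 in place so the heuristic in llm.py can fill both fields later. A minimal self-contained sketch of that sentinel pattern (the parsed argument values below are illustrative, not from the repo):

```python
import argparse

# Sentinel-default pattern: 0 means "the user did not pass this flag".
parser = argparse.ArgumentParser()
parser.add_argument('--spec_decode_algo', type=str, default=None)
parser.add_argument('--spec_decode_max_draft_len', type=int, default=0)

args = parser.parse_args(['--spec_decode_algo', 'mtp'])
spec_decode_algo = args.spec_decode_algo.upper() if args.spec_decode_algo else None

# Non-NGram algorithms get the old default of 1; for NGram, 0 survives so
# downstream code can tell "unset" apart from an explicit user value.
if spec_decode_algo != "NGRAM" and args.spec_decode_max_draft_len == 0:
    args.spec_decode_max_draft_len = 1

print(args.spec_decode_max_draft_len)  # -> 1
```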

tensorrt_llm/_torch/pyexecutor/py_executor.py
Lines changed: 4 additions & 3 deletions

```diff
@@ -922,7 +922,7 @@ def _executor_loop(self):
                 self._pad_attention_dp_dummy_request()
 
                 if self.drafter is not None:
-                    self._prepare_draft_requests(self.active_requests)
+                    self._prepare_draft_requests()
 
                 scheduled_batch, fitting_disagg_gen_init_requests, num_fitting_reqs = self._schedule(
                 )
@@ -1009,14 +1009,15 @@ def _executor_loop(self):
                         iter_stats=iter_stats,
                         iter_start_time=iter_start_time))
 
-    def _prepare_draft_requests(self, requests):
+    def _prepare_draft_requests(self):
         try:
             # Set draft tokens here to make the KV cache manager
             # and scheduler aware of them.
-            for req in requests:
+            for req in self.active_requests:
                 if req.state not in (LlmRequestState.GENERATION_IN_PROGRESS,
                                      LlmRequestState.DISAGG_GENERATION_INIT):
                     continue
+
                 req.py_last_draft_tokens = req.py_draft_tokens
                 max_draft_len = self.model_engine.spec_config.max_draft_len
```
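The second hunk drops the `requests` parameter: the only call site passed `self.active_requests`, so the method now reads it directly. The hunk is truncated right after `max_draft_len`, so the padding step and fill value in this self-contained sketch of the loop's pattern are assumptions for illustration:

```python
from dataclasses import dataclass, field

@dataclass
class FakeRequest:
    # Stand-in for LlmRequest; only the fields the loop touches.
    state: str = "GENERATION_IN_PROGRESS"
    py_draft_tokens: list = field(default_factory=list)
    py_last_draft_tokens: list = None

def prepare_draft_requests(active_requests, max_draft_len):
    for req in active_requests:
        # Only in-progress generation (or disagg-init) requests carry draft tokens.
        if req.state not in ("GENERATION_IN_PROGRESS", "DISAGG_GENERATION_INIT"):
            continue
        req.py_last_draft_tokens = req.py_draft_tokens
        # Reserve worst-case space so the scheduler and KV cache manager
        # account for every potential draft token (assumed fill value).
        req.py_draft_tokens = [0] * max_draft_len

reqs = [FakeRequest(), FakeRequest(state="CONTEXT_INIT")]
prepare_draft_requests(reqs, max_draft_len=3)
print(reqs[0].py_draft_tokens)  # -> [0, 0, 0]
print(reqs[1].py_draft_tokens)  # -> [] (skipped)
```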

tensorrt_llm/llmapi/llm.py
Lines changed: 29 additions & 1 deletion

```diff
@@ -964,13 +964,41 @@ def _build_model(self):
             self._executor_config.cache_transceiver_config = PybindMirror.maybe_to_pybind(
                 self.args.cache_transceiver_config)
         from tensorrt_llm._torch.pyexecutor.config import update_executor_config
+
+        spec_config = self.args.speculative_config
+        max_batch_size = self._executor_config.max_batch_size
+        # Apply a heuristic to an incomplete NGramDecodingConfig, based on benchmark results:
+        #   with concurrency <= 4,  max_draft_len = 5, max_matching_ngram_size = 3
+        #   with concurrency <= 32, max_draft_len = 3, max_matching_ngram_size = 5
+        if spec_config.spec_dec_mode() == "NGRAM" and max_batch_size <= 32:
+            if not self.args.disable_overlap_scheduler:
+                logger.info(
+                    "Disable overlap scheduler to enable NGram speculative decoding."
+                )
+                # Benchmark results show that NGram speculative decoding outperforms the overlap scheduler at low concurrency (<= 32).
+                # Therefore, the overlap scheduler is disabled to enable NGram speculative decoding.
+                self.args.disable_overlap_scheduler = True
+
+            if spec_config.max_draft_len != 0 and spec_config.max_matching_ngram_size != 0:
+                pass
+            else:
+                if max_batch_size <= 4:
+                    spec_config.max_draft_len = 5 if spec_config.max_draft_len == 0 else spec_config.max_draft_len
+                    spec_config.max_matching_ngram_size = 3 if spec_config.max_matching_ngram_size == 0 else spec_config.max_matching_ngram_size
+                elif max_batch_size <= 32:
+                    spec_config.max_draft_len = 3 if spec_config.max_draft_len == 0 else spec_config.max_draft_len
+                    spec_config.max_matching_ngram_size = 5 if spec_config.max_matching_ngram_size == 0 else spec_config.max_matching_ngram_size
+                logger.info(
+                    f"Apply heuristic to incomplete NGramDecodingConfig: max_draft_len={spec_config.max_draft_len}, max_matching_ngram_size={spec_config.max_matching_ngram_size}"
+                )
+
         update_executor_config(
             self._executor_config,
             backend=self.args.backend,
             pytorch_backend_config=self.args.get_pytorch_backend_config()
             if self.args.backend in ["pytorch", "_autodeploy"] else None,
             mapping=self.args.parallel_config.to_mapping(),
-            speculative_config=self.args.speculative_config,
+            speculative_config=spec_config,
             hf_model_dir=self._hf_model_dir,
             max_input_len=self.args.max_input_len,
             max_seq_len=max_seq_len,
```
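The user-wins resolution above is easier to reason about as a small pure function. A standalone sketch of the same rule, assuming 0 marks a field the user left unset (`resolve_ngram_defaults` is an illustrative name, not an API in the repo):

```python
def resolve_ngram_defaults(max_batch_size: int,
                           max_draft_len: int = 0,
                           max_matching_ngram_size: int = 0) -> tuple:
    """Fill unset (0) NGram fields from the concurrency-based heuristic."""
    if max_batch_size <= 4:
        default_draft, default_ngram = 5, 3
    elif max_batch_size <= 32:
        default_draft, default_ngram = 3, 5
    else:
        # The heuristic only applies at low concurrency; leave values as-is.
        return max_draft_len, max_matching_ngram_size
    # `or` keeps any explicit user value and replaces only the 0 sentinel.
    return (max_draft_len or default_draft,
            max_matching_ngram_size or default_ngram)

assert resolve_ngram_defaults(4) == (5, 3)
assert resolve_ngram_defaults(16) == (3, 5)
assert resolve_ngram_defaults(16, max_draft_len=7) == (7, 5)  # user value wins
```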

tensorrt_llm/llmapi/llm_args.py
Lines changed: 6 additions & 2 deletions

```diff
@@ -385,8 +385,12 @@ class NGramDecodingConfig(DecodingBaseConfig):
     is_public_pool: bool = True
         Whether to use a common pool for all requests, or the pool is private for each request if False.
     """
-
-    max_matching_ngram_size: int = 4
+    # If max_draft_len or max_matching_ngram_size is not set by the user,
+    # the default heuristic will be used:
+    #   with concurrency <= 4,  max_draft_len = 5, max_matching_ngram_size = 3
+    #   with concurrency <= 32, max_draft_len = 3, max_matching_ngram_size = 5
+    max_draft_len: int = 0
+    max_matching_ngram_size: int = 0
     is_keep_all: bool = True
     is_use_oldest: bool = True
     is_public_pool: bool = True
```
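With both fields defaulting to 0, a config constructed without them now opts into the heuristic. A hedged usage sketch of the LLM API path this commit targets; the import location and the LLM keyword arguments (model path, max_batch_size) are assumptions for illustration:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import NGramDecodingConfig  # assumed export path

# Leaving max_draft_len / max_matching_ngram_size at their 0 defaults
# marks them "unset", so _build_model applies the concurrency heuristic.
spec = NGramDecodingConfig()

llm = LLM(model="<path/to/model>",  # placeholder path
          speculative_config=spec,
          max_batch_size=16)        # <= 32, so NGram resolves to (3, 5)
```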
