
Commit da386ad

xq25478 <huwen.hu@antgroup.com> authored and committed
imp(torchsampler):support openai stop in text level
Signed-off-by: xq25478 <[email protected]>
1 parent 69e9f6d commit da386ad

9 files changed: +120 −31 lines changed

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 12 additions & 4 deletions
@@ -34,6 +34,8 @@
                             SimpleScheduler)
 from .seq_slot_manager import SeqSlotManager
 
+from transformers import PreTrainedTokenizerBase
+
 GB = 1 << 30
 
 
@@ -542,7 +544,8 @@ def create_py_executor_instance(
 
 
 def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
-                              *, max_seq_len: int, enable_mixed_sampler: bool):
+                              *, max_seq_len: int, enable_mixed_sampler: bool,
+                              tokenizer: PreTrainedTokenizerBase):
     max_num_sequences = executor_config.max_batch_size * mapping.pp_size
     max_draft_len = (0 if executor_config.speculative_config is None else
                      executor_config.speculative_config.max_draft_len)
@@ -552,18 +555,22 @@ def create_torch_sampler_args(executor_config: ExecutorConfig, mapping: Mapping,
         max_num_sequences=max_num_sequences,
         max_beam_width=executor_config.max_beam_width,
         enable_mixed_sampler=enable_mixed_sampler,
+        tokenizer=tokenizer
     )
 
 
 def instantiate_sampler(engine: PyTorchModelEngine,
                         executor_config: ExecutorConfig,
                         pytorch_backend_config: PyTorchConfig,
-                        mapping: Mapping):
+                        mapping: Mapping,
+                        tokenizer: Optional[PreTrainedTokenizerBase]):
    sampler_args = create_torch_sampler_args(
        executor_config,
        mapping,
        max_seq_len=engine.max_seq_len,
-       enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler)
+       enable_mixed_sampler=pytorch_backend_config.enable_mixed_sampler,
+       tokenizer=tokenizer
+   )
    if mapping.cp_config.get('cp_type') == 'star_attention':
        assert pytorch_backend_config.attn_backend == "FLASHINFER_STAR_ATTENTION", "attention backend of star attention should be 'FLASHINFER_STAR_ATTENTION'"
        return TorchSampler(sampler_args)
@@ -574,7 +581,8 @@ def instantiate_sampler(engine: PyTorchModelEngine,
        decoding_mode = get_decoding_mode(executor_config)
        return TRTLLMSampler(executor_config, engine.model, engine.dtype,
                             mapping, decoding_mode,
-                            pytorch_backend_config.disable_overlap_scheduler)
+                            pytorch_backend_config.disable_overlap_scheduler,
+                            tokenizer)
    if not engine.model.model_config.is_generation:
        # NOTE: choose sampler based on model type
        return EarlyStopSampler()

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 4 additions & 2 deletions
@@ -28,6 +28,7 @@
 from .model_engine import PyTorchModelEngine
 from .py_executor import PyExecutor
 
+from transformers import PreTrainedTokenizerBase
 
 class _ExecutorCreationStage(enum.Enum):
     SAMPLER = "Sampler"
@@ -185,7 +186,8 @@ def create_py_executor(
         executor_config: ExecutorConfig,
         checkpoint_dir: str = None,
         lora_config: Optional[LoraConfig] = None,
-        garbage_collection_gen0_threshold: Optional[int] = None) -> PyExecutor:
+        garbage_collection_gen0_threshold: Optional[int] = None,
+        tokenizer: PreTrainedTokenizerBase = None) -> PyExecutor:
     _mangle_executor_config(executor_config)
     pytorch_backend_config = executor_config.pytorch_backend_config
 
@@ -327,7 +329,7 @@ def create_py_executor(
 
     with mem_monitor.observe_creation_stage(_ExecutorCreationStage.SAMPLER):
         sampler = instantiate_sampler(model_engine, executor_config,
-                                      pytorch_backend_config, mapping)
+                                      pytorch_backend_config, mapping, tokenizer)
 
     guided_decoder: Optional[GuidedDecoder] = None
     if executor_config.guided_decoding_config is not None:
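
For context, here is a minimal wiring sketch of how a caller might use the new `tokenizer` parameter of `create_py_executor`. It is not runnable on its own: `executor_config` is assumed to be a fully built `ExecutorConfig`, and the helper name `build_executor_with_tokenizer` is purely illustrative, not part of this commit.

```python
# Hypothetical wiring sketch (not part of this commit): load the tokenizer that
# lives next to the checkpoint and hand it to create_py_executor so the sampler
# can decode tokens for text-level stop matching.
from transformers import AutoTokenizer
from tensorrt_llm._torch.pyexecutor.py_executor_creator import create_py_executor


def build_executor_with_tokenizer(executor_config, checkpoint_dir: str):
    # Assumes the checkpoint directory also contains the tokenizer files.
    tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    return create_py_executor(executor_config,
                              checkpoint_dir=checkpoint_dir,
                              tokenizer=tokenizer)
```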

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 27 additions & 4 deletions
@@ -1,7 +1,7 @@
 from abc import ABC, abstractmethod
 from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import Literal
+from typing import Literal,Union,List
 
 import torch
 
@@ -26,6 +26,7 @@
 from .llm_request import LlmRequest, LlmRequestState
 from .scheduler import ScheduledRequests
 
+from transformers import PreTrainedTokenizerBase
 
 @dataclass(kw_only=True)
 class SampleStateTensors:
@@ -224,13 +225,15 @@ class Args:
         max_num_sequences: int
         max_beam_width: int
         enable_mixed_sampler: bool
+        tokenizer: PreTrainedTokenizerBase
 
     def __init__(self, args: Args):
         self.max_seq_len = args.max_seq_len
         self.enable_mixed_sampler = args.enable_mixed_sampler
         self.max_tokens = args.max_draft_len + 1
         assert args.max_beam_width == self.MAX_BEAM_WIDTH, "TorchSampler only supports beam_width = 1"
         self.num_seq_slots = args.max_num_sequences
+        self.tokenizer = args.tokenizer
 
         self.NEW_TOKENS_SHAPE = (self.max_tokens, self.num_seq_slots,
                                  self.MAX_BEAM_WIDTH)
@@ -247,22 +250,39 @@ def _meet_max_token_stop_criteria(self, request: LlmRequest):
                 >= self.max_seq_len)
 
     @staticmethod
-    def _meet_stop_token_criteria(request: LlmRequest):
+    def _meet_stop_token_criteria(
+            request: LlmRequest,
+            tokenizer: PreTrainedTokenizerBase,
+            new_token: Union[int, List[int], torch.Tensor]
+    ):
         if request.py_stop_words_list:
             assert isinstance(
                 request.py_stop_words_list,
                 list), "request.py_stop_words_list should be a list"
+
             stop_words_list, prefix_sum = request.py_stop_words_list
             tokens = request.get_tokens(0)
+            try:
+                new_words = tokenizer.decode(new_token,skip_special_tokens=False,clean_up_tokenization_spaces=False)
+            except Exception:
+                # If decode fails, fall back to token-based matching only
+                new_words = ""
             offset = 0
             for i, offset_end in enumerate(prefix_sum):
                 if i > 0:
                     offset = prefix_sum[i - 1]
                 stop_word = stop_words_list[offset:offset_end]
+                try:
+                    stop_text = tokenizer.decode(stop_word, skip_special_tokens=False, clean_up_tokenization_spaces=False)
+                except Exception:
+                    continue
                 if len(stop_word) > len(tokens):
                     continue
                 if tokens[-len(stop_word):] == stop_word:
                     return True
+                if stop_text in new_words:
+                    return True
+
         return False
 
     def _handle_stop_criteria(self, request: LlmRequest,
@@ -277,7 +297,7 @@ def _handle_stop_criteria(self, request: LlmRequest,
             request.finish_by(FinishReason.LENGTH, self.BEAM)
             return True
 
-        if self._meet_stop_token_criteria(request):
+        if self._meet_stop_token_criteria(request, self.tokenizer, new_token):
             request.finish_by(FinishReason.STOP_WORDS, self.BEAM)
             return True
 
@@ -365,6 +385,7 @@ def gen_logits_host(self, requests: Iterable[LlmRequest], vocab_size: int):
 
     def sample_async(self, scheduled_requests: ScheduledRequests,
                      model_outputs: dict[str, torch.Tensor]) -> SampleState:
+
         requests = scheduled_requests.all_requests()
         new_tokens = self.store.new_tokens
         vocab_size = model_outputs["logits"].shape[-1]
@@ -492,6 +513,7 @@ def __init__(
         mapping: Mapping,
         decoding_mode: DecodingMode,
         disable_overlap_scheduler: bool,
+        tokenizer: PreTrainedTokenizerBase
     ):
 
         vocab_size = model.config.vocab_size
@@ -520,6 +542,8 @@ def __init__(
                                            num_hidden_layers, 0, num_heads,
                                            hidden_size, self.model_datatype)
 
+        self.tokenizer = tokenizer
+
         self._initialize_store()
         self._instantiate_algorithms()
 
@@ -625,7 +649,6 @@ def _update_cache_indirection_buffer(self,
     @nvtx_range("sample_async")
     def sample_async(self, scheduled_requests: ScheduledRequests,
                      model_outputs) -> SampleStateTRTLLM:
-
         batch_size = scheduled_requests.batch_size
         beam_width = self.beam_width(scheduled_requests.all_requests())
         if (batch_size > 1 and beam_width > 1
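
To make the new control flow in `_meet_stop_token_criteria` easier to follow, here is a self-contained sketch of the same idea: keep the original token-ID suffix match, but additionally decode the newly sampled token and the stop words and compare them as text. The function name `meets_stop_criteria` and the choice of the GPT-2 tokenizer are illustrative only, not part of the TensorRT-LLM API.

```python
# Minimal sketch of text-level stop matching, mirroring the logic this commit adds
# to TorchSampler._meet_stop_token_criteria (names here are illustrative only).
from typing import List

from transformers import AutoTokenizer


def meets_stop_criteria(tokens: List[int], stop_words: List[List[int]],
                        new_token: int, tokenizer) -> bool:
    try:
        # Text produced by the token sampled in this step.
        new_text = tokenizer.decode([new_token], skip_special_tokens=False,
                                    clean_up_tokenization_spaces=False)
    except Exception:
        new_text = ""  # decode failed: fall back to token-ID matching only
    for stop_word in stop_words:
        # Original criterion: the generated token IDs end with the stop-word IDs.
        if len(stop_word) <= len(tokens) and tokens[-len(stop_word):] == stop_word:
            return True
        # New criterion: the decoded stop word appears in the newly decoded text.
        try:
            stop_text = tokenizer.decode(stop_word, skip_special_tokens=False,
                                         clean_up_tokenization_spaces=False)
        except Exception:
            continue
        if stop_text in new_text:
            return True
    return False


if __name__ == "__main__":
    tok = AutoTokenizer.from_pretrained("gpt2")  # any Hugging Face tokenizer works
    generated = tok.encode("The answer is 42. DONE")
    stop_words = [generated[-2:]]  # stop sequence given in token-ID form
    print(meets_stop_criteria(generated, stop_words, generated[-1], tok))  # True
```

Comparing decoded text lets an OpenAI-style string stop fire even when the stop string's own tokenization does not line up with how the model happened to tokenize it during generation.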

tensorrt_llm/executor/executor.py

Lines changed: 11 additions & 8 deletions
@@ -35,6 +35,8 @@
 from .result import GenerationResult, IterationResult
 from .utils import IntraProcessQueue, ProcessPoolExecutorSession, RequestError
 
+from transformers import PreTrainedTokenizerBase
+
 if TYPE_CHECKING:
     from .proxy import GenerationExecutorProxy
     from .worker import GenerationExecutorWorker
@@ -352,6 +354,7 @@ def create(
         is_llm_executor: Optional[bool] = None,
         lora_config: Optional[LoraConfig] = None,
         garbage_collection_gen0_threshold: Optional[int] = None,
+        tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> Union["GenerationExecutorProxy", "GenerationExecutorWorker"]:
         # local imports to avoid cyclic importing
         from .proxy import GenerationExecutorProxy
@@ -396,8 +399,8 @@ def create(
                 mpi_session=mpi_session,
                 postproc_worker_config=postproc_worker_config,
                 is_llm_executor=is_llm_executor,
-                garbage_collection_gen0_threshold=
-                garbage_collection_gen0_threshold)
+                garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
+                tokenizer=tokenizer)
 
         # WAR: For the performance of gathering logits, we use single process worker
         # for TP1 to avoid the large overhead of IPC.
@@ -409,8 +412,8 @@ def create(
             )
             return GenerationExecutorWorker(**worker_kwargs,
                                             is_llm_executor=is_llm_executor,
-                                            garbage_collection_gen0_threshold=
-                                            garbage_collection_gen0_threshold)
+                                            garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
+                                            tokenizer=tokenizer)
 
         # For single-gpu case:
         # Partition the workload to multiple process for streaming performance.
@@ -423,8 +426,8 @@ def create(
                 mpi_session=None,  # use mpi4py
                 postproc_worker_config=postproc_worker_config,
                 is_llm_executor=is_llm_executor,
-                garbage_collection_gen0_threshold=
-                garbage_collection_gen0_threshold)
+                garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
+                tokenizer=tokenizer)
         else:
             ctx = multiprocessing.get_context("spawn")
             # The ProcessPoolExecutorSession is used to support Windows, as mpi4py cannot.
@@ -436,8 +439,8 @@ def create(
                 mpi_session=mpi_session,
                 postproc_worker_config=postproc_worker_config,
                 is_llm_executor=is_llm_executor,
-                garbage_collection_gen0_threshold=
-                garbage_collection_gen0_threshold)
+                garbage_collection_gen0_threshold=garbage_collection_gen0_threshold,
+                tokenizer=tokenizer)
 
     def wait_first_completed(
             self, futures: List[GenerationResult]

tensorrt_llm/executor/proxy.py

Lines changed: 9 additions & 2 deletions
@@ -28,6 +28,8 @@
                     is_llm_response, print_alive_threads)
 from .worker import GenerationExecutorWorker, worker_main
 
+from transformers import PreTrainedTokenizerBase
+
 __all__ = [
     "GenerationExecutorProxy",
 ]
@@ -46,6 +48,7 @@ def __init__(
         postproc_worker_config: Optional[PostprocWorkerConfig] = None,
         is_llm_executor: Optional[bool] = None,
         garbage_collection_gen0_threshold: Optional[int] = None,
+        tokenizer: Optional[PreTrainedTokenizerBase] = None,
    ) -> None:
        postproc_worker_config = postproc_worker_config or PostprocWorkerConfig(
        )
@@ -59,6 +62,7 @@ def __init__(
 
        self.workers_started = False
        self.worker_cls = worker_cls
+       self.tokenizer = tokenizer
 
        mpi_process_pre_spawned: bool = get_spawn_proxy_process_env()
 
@@ -94,7 +98,8 @@ def __init__(
            postproc_worker_config=postproc_worker_config,
            is_llm_executor=False,
            garbage_collection_gen0_threshold=self.
-           garbage_collection_gen0_threshold)
+           garbage_collection_gen0_threshold,
+           tokenizer=tokenizer)
 
        if "log_level" not in worker_kwargs:
            worker_kwargs["log_level"] = logger.level
@@ -410,7 +415,9 @@ def submit(self, request: GenerationRequest) -> GenerationResult:
            background_error_handler=self._handle_background_error,
            executor=self,
            disaggregated_params=request.disaggregated_params,
-           logprob_params=logprob_params)
+           logprob_params=logprob_params,
+           tokenizer = self.tokenizer
+        )
        self._results[request.id] = result
 
        with nvtx_range_debug("request_queue.put"):

tensorrt_llm/executor/result.py

Lines changed: 39 additions & 6 deletions
@@ -18,6 +18,8 @@
 from ..sampling_params import LogprobParams, SamplingParams
 from .utils import ErrorResponse, has_event_loop, is_llm_response
 
+from transformers import PreTrainedTokenizerBase
+
 if TYPE_CHECKING:
     from .executor import GenerationExecutor
     from .postproc_worker import PostprocParams, PostprocWorker
@@ -139,13 +141,16 @@ def __init__(self,
                  id: int,
                  sampling_params: SamplingParams,
                  background_error_handler: Optional[Callable] = None,
-                 postproc_params: "Optional[PostprocParams]" = None):
+                 postproc_params: "Optional[PostprocParams]" = None,
+                 tokenizer: Optional[PreTrainedTokenizerBase] = None):
         self.id = id
         self.sampling_params = sampling_params
         self.postproc_params = postproc_params
         self.disaggregated_params = None
         self.decoding_iter = 0
         self._done = False
+        self.tokenizer = tokenizer
+
 
         if has_event_loop():
             self.aqueue = AsyncQueue()
@@ -197,6 +202,28 @@ def outputs(self) -> List[CompletionOutput]:
     def context_logits(self) -> Optional[torch.Tensor]:
         return self._context_logits
 
+    def _check_text_stop_criteria(self, output, stop_reason: str, stop_ids: list) -> bool:
+        """Check if the stop text is found in newly generated tokens."""
+        now_token_ids_len = len(output.token_ids)
+        new_generated_token_ids = output.token_ids[output._last_token_ids_len:now_token_ids_len]
+
+        for idx in range(len(new_generated_token_ids)):
+            if self.tokenizer is None:
+                continue
+            new_generated_text = self.tokenizer.decode(
+                new_generated_token_ids[idx],
+                skip_special_tokens=False,
+                clean_up_tokenization_spaces=False
+            )
+            if stop_reason in new_generated_text:
+                output.stop_reason = stop_reason
+                if not self.sampling_params.include_stop_str_in_output:
+                    output.token_ids = output.token_ids[:output._last_token_ids_len + idx]
+                else:
+                    output.token_ids = output.token_ids[:output._last_token_ids_len + idx] + stop_ids
+                return True
+        return False
+
     def _handle_sequence(self,
                          finish_reasons,
                          response_tensors,
@@ -249,11 +276,15 @@ def _handle_sequence(self,
             output.finish_reason = 'stop'
             for stop_reason, stop_ids in self.sampling_params._get_stop_reasons_and_words(
             ):
-                if output.token_ids[-len(stop_ids):] == stop_ids:
-                    output.stop_reason = stop_reason
-                    if not self.sampling_params.include_stop_str_in_output:
-                        output.token_ids = output.token_ids[:-len(stop_ids)]
-                    break
+                if isinstance(stop_reason, str):
+                    if self._check_text_stop_criteria(output, stop_reason, stop_ids):
+                        break
+                else:
+                    if output.token_ids[-len(stop_ids):] == stop_ids:
+                        output.stop_reason = stop_reason
+                        if not self.sampling_params.include_stop_str_in_output:
+                            output.token_ids = output.token_ids[:-len(stop_ids)]
+                        break
         elif finish_reasons[src_idx] == tllm.FinishReason.LENGTH:
             output.finish_reason = 'length'
         elif finish_reasons[src_idx] == tllm.FinishReason.TIMED_OUT:
@@ -412,12 +443,14 @@ def __init__(
         executor: Optional["GenerationExecutor"] = None,
         disaggregated_params: Optional[DisaggregatedParams] = None,
         logprob_params: Optional[LogprobParams] = None,
+        tokenizer: Optional[PreTrainedTokenizerBase] = None
     ) -> None:
         super().__init__(
             generation_request.id,
             generation_request.sampling_params,
             background_error_handler,
             postproc_params=generation_request.postproc_params,
+            tokenizer=tokenizer
         )
         self._generation_request = generation_request
         self._streaming = generation_request.streaming
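
As a companion illustration, here is a standalone sketch of the detokenize-and-truncate behaviour that `_check_text_stop_criteria` implements: walk the tokens produced since the last update, decode each one, and cut the output at the first token whose text contains the stop string, optionally re-appending the stop IDs. The `FakeTokenizer` stub and the function name `truncate_at_stop_text` are illustrative; only a `decode` method is assumed.

```python
# Standalone sketch of the text-level truncation added to GenerationResult handling
# in this commit; names and the stub tokenizer are illustrative only.
from typing import List, Optional, Tuple


def truncate_at_stop_text(token_ids: List[int], last_len: int, stop_text: str,
                          stop_ids: List[int], include_stop: bool,
                          tokenizer) -> Tuple[List[int], Optional[str]]:
    """Return (possibly truncated token_ids, matched stop string or None)."""
    for idx, tok_id in enumerate(token_ids[last_len:]):
        piece = tokenizer.decode([tok_id], skip_special_tokens=False,
                                 clean_up_tokenization_spaces=False)
        if stop_text in piece:
            kept = token_ids[:last_len + idx]
            return (kept + stop_ids if include_stop else kept), stop_text
    return token_ids, None


class FakeTokenizer:
    """Maps token IDs straight to strings; enough to exercise the control flow."""
    vocab = {0: "Hello", 1: " world", 2: "!", 3: "<STOP>"}

    def decode(self, ids, **kwargs):
        ids = ids if isinstance(ids, list) else [ids]
        return "".join(self.vocab[i] for i in ids)


ids, matched = truncate_at_stop_text([0, 1, 2, 3], last_len=2, stop_text="<STOP>",
                                     stop_ids=[3], include_stop=False,
                                     tokenizer=FakeTokenizer())
print(ids, matched)  # [0, 1, 2] <STOP>
```

Decoding token by token keeps the check cheap on the streaming path while still letting `include_stop_str_in_output` decide whether the stop IDs remain in the returned sequence.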
