@@ -391,6 +391,58 @@ def append_eagle3(tokens: torch.Tensor, model_outputs):
             d2t = model_outputs["d2t"][tokens]
             tokens += d2t
 
+    @staticmethod
+    def _apply_embedding_bias(
+            logits: torch.Tensor,
+            requests: list[LlmRequest],
+            steps_per_request: list[int] = None) -> torch.Tensor:
+        """Apply embedding bias (aka logit bias) to logits.
+        If steps_per_request is None, assumes 1 step per request (non-batched path).
+        """
+        # Collect biases and their associated data
+        bias_list = []
+        bias_data = []  # Either indices (fast path) or steps (batched path)
+
+        for i, req in enumerate(requests):
+            bias = req._py_embedding_bias_1d
+            if bias is not None:
+                bias_list.append(bias)
+                bias_data.append(i if steps_per_request is
+                                 None else steps_per_request[i])
+
+        if not bias_list:
+            return logits
+
+        bias_tensor = torch.stack(bias_list).to(logits.device,
+                                                non_blocking=True)
+        logits = logits.clone()
+
+        if steps_per_request is None:
+            # Fast path: direct indexing
+            indices = torch.tensor(bias_data, device=logits.device)
+            logits[indices] += bias_tensor
+        else:
+            # Batched path: expand biases and use boolean mask
+            expanded_biases = torch.repeat_interleave(bias_tensor,
+                                                      torch.tensor(
+                                                          bias_data,
+                                                          device=logits.device),
+                                                      dim=0)
+
+            mask = torch.zeros(sum(steps_per_request),
+                               dtype=torch.bool,
+                               device=logits.device)
+            offset = 0
+            for i, req in enumerate(requests):
+                steps = steps_per_request[i]
+                if req._py_embedding_bias_1d is not None:
+                    mask[offset:offset + steps] = True
+                offset += steps
+
+            logits[mask] += expanded_biases
+
+        return logits
+
     def _process_requests(self,
                           requests: list[LlmRequest],
                           model_outputs: dict[str, torch.Tensor],
@@ -411,6 +463,7 @@ def _process_requests(self,
 
         if fast_path:
             logits = raw_logits[:len(requests)]
+            logits = self._apply_embedding_bias(logits, requests)
             next_tokens = torch.argmax(logits, dim=-1)
             self.append_eagle3(next_tokens, model_outputs)
             int_next_tokens = next_tokens.to(torch.int, non_blocking=True)
@@ -430,17 +483,29 @@ def _process_requests(self,
 
         if batched_strategy is not None:
             logits = raw_logits[:sum_steps]
+            # Collect steps per request for batched strategy
+            steps_per_request = [
+                1 + len(req.py_draft_tokens) for req in requests
+            ]
+            logits = self._apply_embedding_bias(logits, requests,
+                                                steps_per_request)
             batched_next_tokens, batched_softmax = sample(
                 batched_strategy, logits)
             self.append_eagle3(batched_next_tokens, model_outputs)
 
         offset = 0
-        for strategy, slot, steps in zip(strategies, seq_slots, num_steps):
+        for i, (strategy, slot,
+                steps) in enumerate(zip(strategies, seq_slots, num_steps)):
             input_slice = slice(offset, offset + steps)
             logits = raw_logits[input_slice]
+
+            req = requests[i]
+
             if batched_next_tokens is None:
+                logits = self._apply_embedding_bias(logits, [req])
                 next_tokens, softmax = sample(strategy, logits)
             else:
+                # Batched processing already applied bias, just use the results
                 next_tokens = batched_next_tokens[input_slice]
                 softmax = batched_softmax[input_slice]
             current_slice = slice(0, steps), slot, beam
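
Note (not part of the patch): below is a minimal, self-contained sketch that exercises the same two code paths as `_apply_embedding_bias`, useful for checking the indexing logic in isolation. `FakeRequest` and `apply_embedding_bias` are hypothetical stand-ins introduced only for this example; the real `LlmRequest` has many more fields than the single `_py_embedding_bias_1d` attribute the helper reads.

```python
import torch


class FakeRequest:
    """Hypothetical stand-in for LlmRequest; only holds the bias attribute."""

    def __init__(self, bias=None):
        self._py_embedding_bias_1d = bias


def apply_embedding_bias(logits, requests, steps_per_request=None):
    """Same two-path logic as the patch, reimplemented for illustration."""
    bias_list, bias_data = [], []
    for i, req in enumerate(requests):
        bias = req._py_embedding_bias_1d
        if bias is not None:
            bias_list.append(bias)
            bias_data.append(i if steps_per_request is None else steps_per_request[i])
    if not bias_list:
        return logits

    bias_tensor = torch.stack(bias_list).to(logits.device)
    logits = logits.clone()
    if steps_per_request is None:
        # Fast path: one logits row per request, index the biased rows directly.
        logits[torch.tensor(bias_data, device=logits.device)] += bias_tensor
    else:
        # Batched path: repeat each bias over that request's steps, mask the rows.
        expanded = torch.repeat_interleave(
            bias_tensor, torch.tensor(bias_data, device=logits.device), dim=0)
        mask = torch.zeros(sum(steps_per_request),
                           dtype=torch.bool,
                           device=logits.device)
        offset = 0
        for steps, req in zip(steps_per_request, requests):
            if req._py_embedding_bias_1d is not None:
                mask[offset:offset + steps] = True
            offset += steps
        logits[mask] += expanded
    return logits


vocab = 8
requests = [FakeRequest(torch.full((vocab,), -100.0)), FakeRequest(None)]

# Fast path: one row per request; only request 0 is biased.
fast = apply_embedding_bias(torch.zeros(2, vocab), requests)
assert torch.all(fast[0] == -100.0) and torch.all(fast[1] == 0.0)

# Batched path: request 0 contributes 3 rows, request 1 contributes 2; only rows 0-2 are biased.
batched = apply_embedding_bias(torch.zeros(5, vocab), requests, steps_per_request=[3, 2])
assert torch.all(batched[:3] == -100.0) and torch.all(batched[3:] == 0.0)
```

The fast path biases exactly one logits row per request, while the batched path repeats each request's bias over its `1 + len(py_draft_tokens)` rows, which is why `_process_requests` passes `steps_per_request` only on the batched-strategy branch.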