 from collections.abc import Iterable
 from dataclasses import dataclass
 from itertools import repeat
-from typing import Any, List, Literal, Optional, cast
+from typing import Any, List, Literal, Optional, TypeVar, cast

 import torch
 import torch.nn.functional as F
                                 GptDecoderBatched)
 from tensorrt_llm.executor.result import Logprob
 from tensorrt_llm.mapping import Mapping
+from tensorrt_llm.sampling_params import SamplingParams

 from ..speculative.spec_tree_manager import SpecTreeManager
 from .finish_reason import FinishedState
@@ -195,106 +196,104 @@ def is_generation_model(self) -> bool:

 def top_k_sampling_batch(
     logits,
-    top_k=50,
-    generator: Optional[torch.Generator] = None
+    *,
+    top_k: int,
+    temperature: float,
+    generator: Optional[torch.Generator] = None,
 ) -> tuple[torch.Tensor, torch.Tensor]:
-    logits_dim = logits.dim()
-    if logits_dim == 1:
-        logits = logits.unsqueeze(0)
-    # logits should be 2D: [batch_size, vocab_size]
-    batch_size, vocab_size = logits.size()
+    # NB: To be replaced by a more efficient implementation.
+    return top_k_top_p_sampling_batch(
+        logits,
+        top_k=top_k,
+        temperature=temperature,
+        generator=generator,
+        top_p=1,
+    )

-    # get first top_k logits of each sample and their indices
-    if top_k > 0:
-        values, indices = torch.topk(logits, top_k, dim=-1)
-        min_values = values[:, -1].unsqueeze(-1).expand(batch_size, vocab_size)

-        # set the logits who is less than first top_k logits to -inf
-        logits = torch.where(logits < min_values,
-                             torch.full_like(logits, float('-inf')), logits)
+def top_p_sampling_batch(
+    logits: torch.Tensor,
+    *,
+    top_p: float,
+    temperature: float,
+    generator: Optional[torch.Generator] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    # NB: To be replaced by a more efficient implementation.
+    return top_k_top_p_sampling_batch(
+        logits,
+        top_p=top_p,
+        top_k=logits.size(1),
+        temperature=temperature,
+        generator=generator,
+    )

-    # compute probability distribution
-    softmax = torch.softmax(logits, dim=-1)

-    # sample from the distribution and generate result of [batch_size, 1]
-    next_tokens = torch.multinomial(softmax, num_samples=1,
-                                    generator=generator).squeeze(-1)
-    return next_tokens, softmax
+def temperature_sampling_batch(
+    logits: torch.Tensor,
+    *,
+    temperature: float,
+    generator: Optional[torch.Generator] = None,
+) -> tuple[torch.Tensor, torch.Tensor]:
+    # NB: To be replaced by a more efficient implementation.
+    return top_k_top_p_sampling_batch(
+        logits,
+        top_p=1,
+        top_k=logits.size(1),
+        temperature=temperature,
+        generator=generator,
+    )


-def top_p_sampling_batch(
+def top_k_top_p_sampling_batch(
     logits: torch.Tensor,
     *,
-    top_p: float = 0.9,
-    temperature: float = 1.0,
+    top_k: int,
+    top_p: float,
+    temperature: float,
     generator: Optional[torch.Generator] = None
 ) -> tuple[torch.Tensor, torch.Tensor]:
     logits_dim = logits.dim()
     assert logits_dim == 2, "logits should be 2D: [batch_size, vocab_size]"
+    assert temperature > 0, "non-greedy sampling requires valid temperature"
+    logits = logits / max(temperature, 1e-5)
+    batch_size, vocab_size = logits.size()

-    if temperature != 0:
-        logits = logits / max(temperature, 1e-5)
-
-    # sort the logits of each sample in descending order
-    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
-
-    # compute cumulative probability distribution of each sample
-    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1),
-                                    dim=-1)
-    # get the location of top_p
-    sorted_indices_to_remove = cumulative_probs > top_p
-    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
-    sorted_indices_to_remove[:, 0] = 0
-
-    # set the logits to -inf whose is outside top_p
-    indices_to_remove = sorted_indices_to_remove.scatter(
-        1, sorted_indices, sorted_indices_to_remove)
-    logits = logits.masked_fill(indices_to_remove, float('-inf'))
-
-    # compute probability distribution
-    softmax = torch.softmax(logits, dim=-1)
-
-    # sample from the distribution and generate result of [batch_size, 1]
-    next_tokens = torch.multinomial(softmax, num_samples=1,
-                                    generator=generator).squeeze(-1)
-    return next_tokens, softmax
-
+    assert top_k > 1, "non-greedy sampling requires valid top_k"
+    need_top_k = top_k < vocab_size
+    assert top_p > 0, "non-greedy sampling requires valid top_p"
+    need_top_p = top_p < 1

-def top_k_top_p_sampling_batch(logits: torch.Tensor,
-                               *,
-                               top_k: int,
-                               top_p: float,
-                               temperature: float = 1.0,
-                               generator: Optional[torch.Generator] = None):
-    logits_dim = logits.dim()
-    assert logits_dim == 2, "logits should be 2D: [batch_size, vocab_size]"
-    if temperature != 0:
-        logits = logits / max(temperature, 1e-5)
-    batch_size, vocab_size = logits.size()
-    # get first top_k logits of each sample and their indices
-    if top_k > 0:
+    # top-K: mask out logits not belonging to the top-K for each sample
+    if need_top_k:
         values, _ = torch.topk(logits, top_k, dim=-1)
         min_values = values[:, -1].unsqueeze(-1).expand(batch_size, vocab_size)

         # set logits smaller than the k-th largest logit of each sample to -inf
         logits = torch.where(logits < min_values,
                              torch.full_like(logits, float('-inf')), logits)

-    sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
-
-    # compute cumulative probability distribution of each sample
-    cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1),
-                                    dim=-1)
-
-    # get the location of top_p
-    sorted_indices_to_remove = cumulative_probs > top_p
-    sorted_indices_to_remove[:, 1:] = sorted_indices_to_remove[:, :-1].clone()
-    sorted_indices_to_remove[:, 0] = 0
-
-    # set the logits to -inf whose is outside top_p
-    indices_to_remove = sorted_indices_to_remove.scatter(
-        1, sorted_indices, sorted_indices_to_remove)
-    logits = logits.masked_fill(indices_to_remove, float('-inf'))
+    # top-p: mask out logits outside the nucleus
+    if need_top_p:
+        sorted_logits, sorted_indices = torch.sort(logits,
+                                                   descending=True,
+                                                   dim=-1)
+
+        # compute cumulative probability distribution of each sample
+        cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1),
+                                        dim=-1)
+
+        # get the location of top_p
+        # NB: Currently selecting the smallest index with cumulative_probs > top_p.
+        #     Thus, top_p -> 0 resembles greedy; agreement requires torch.sort(..., stable=True).
+        sorted_indices_to_remove = cumulative_probs > top_p
+        sorted_indices_to_remove[:,
+                                 1:] = sorted_indices_to_remove[:, :-1].clone()
+        sorted_indices_to_remove[:, 0] = 0
+
+        # set the logits to -inf for token indices outside top_p
+        indices_to_remove = sorted_indices_to_remove.scatter(
+            1, sorted_indices, sorted_indices_to_remove)
+        logits = logits.masked_fill(indices_to_remove, float('-inf'))

     # compute probability distribution
     softmax = torch.softmax(logits, dim=-1)
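For reference, a minimal self-contained sketch of the masking that top_k_top_p_sampling_batch performs above: a top-k cutoff, followed by top-p (nucleus) truncation of the sorted cumulative distribution, followed by a multinomial draw. The batch shape, seed, and parameter values below are illustrative only.

import torch

torch.manual_seed(0)
logits = torch.randn(2, 8)  # [batch_size, vocab_size]
top_k, top_p, temperature = 4, 0.9, 0.8

logits = logits / temperature
# top-k: keep only the k largest logits per row
kth_largest = torch.topk(logits, top_k, dim=-1).values[:, -1:]
logits = logits.masked_fill(logits < kth_largest, float('-inf'))
# top-p: drop the tail of the sorted cumulative distribution
sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
cumulative_probs = torch.softmax(sorted_logits, dim=-1).cumsum(dim=-1)
remove = cumulative_probs > top_p
remove[:, 1:] = remove[:, :-1].clone()
remove[:, 0] = False
logits = logits.masked_fill(remove.scatter(1, sorted_indices, remove),
                            float('-inf'))
# sample one token per row from the renormalized distribution
probs = torch.softmax(logits, dim=-1)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(-1)  # shape [2]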
@@ -359,48 +358,78 @@ def sample_rejected(draft_probs: torch.Tensor, target_probs: torch.Tensor,
     return new_token


-TopK = tuple[Literal["top_k"], int]
+TemperatureOnly = tuple[Literal["temperature"], float]
+TopK = tuple[Literal["top_k"], int, float]
 TopP = tuple[Literal["top_p"], float, float]
 TopKTopP = tuple[Literal["top_k_top_p"], int, float, float]
 Greedy = tuple[Literal["greedy"], None]
 GREEDY: Greedy = ("greedy", None)
-Strategy = TopK | TopP | Greedy | TopKTopP
-
-
-def _request_strategy(request: LlmRequest) -> Strategy:
-    # top_p and top_K with temperature=0.0 reduces to greedy
-    # sampling
-    temperature = request.sampling_config.temperature
-    if temperature is not None:
-        temperature = temperature[0]
-        if temperature == 0.0:
-            return GREEDY
-
-    if request.sampling_config.top_k is not None and len(
-            request.sampling_config.top_k
-    ) > 0 and request.sampling_config.top_p is not None and len(
-            request.sampling_config.top_p) > 0:
-        return ("top_k_top_p", request.sampling_config.top_k[0],
-                request.sampling_config.top_p[0], temperature)
-    elif request.sampling_config.top_p is not None and len(
-            request.sampling_config.top_p) > 0:
-        top_p = request.sampling_config.top_p[0]
-        return ("top_p", top_p, temperature)
-    elif request.sampling_config.top_k is not None and len(
-            request.sampling_config.top_k) > 0:
-        return ("top_k", request.sampling_config.top_k[0])
-    else:
+Strategy = TopK | TopP | Greedy | TopKTopP | TemperatureOnly
+
+T = TypeVar('T')
+
+
+# Because tensorrt_llm::runtime::SamplingConfig uses vectors, params in
+# LlmRequest.sampling_params are either None or single-element lists.
+# This helper simplifies code that uses such params.
+def _unwrap_singleton(p: Optional[List[T]]) -> Optional[T]:
+    if p is None:
+        return None
+    t, = p
+    return t
+
+
+def _request_strategy(request: LlmRequest, *, vocab_size: int) -> Strategy:
+    # The semantics are specified in the doc-string of SamplingParams
+
+    sampling_config = request.sampling_config
+    temperature = _unwrap_singleton(
+        cast(Optional[List[float]], sampling_config.temperature))
+    top_p = _unwrap_singleton(cast(Optional[List[float]],
+                                   sampling_config.top_p))
+    top_k = _unwrap_singleton(cast(Optional[List[int]], sampling_config.top_k))
+
+    if SamplingParams.params_imply_greedy_decoding(
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+    ):
         return GREEDY

+    # --- resolving default values
+    # NB: not greedy, hence temperature != 0 if specified
+    temperature = temperature or 1.0
+
+    # NB: not greedy, hence top_p != 0 if specified
+    top_p = top_p or 1.0
+    # NB: not greedy, hence top_k != 1 if specified
+    #     (0 and vocab_size are equivalent)
+    top_k = top_k or vocab_size
+
+    assert top_k > 1, "non-greedy sampling requires valid top_k"
+    need_top_k = top_k < vocab_size
+    assert top_p > 0, "non-greedy sampling requires valid top_p"
+    need_top_p = top_p < 1
+
+    if need_top_p:
+        if need_top_k:
+            return ("top_k_top_p", top_k, top_p, temperature)
+        return ("top_p", top_p, temperature)
+    if need_top_k:
+        return ("top_k", top_k, temperature)
+    return ("temperature", temperature)
+

 def _group_requests_by_sampling_strategy(
         requests: Iterable[LlmRequest],
         *,
-        pin_memory: bool = False) -> dict[Strategy, torch.Tensor]:
+        pin_memory: bool = False,
+        vocab_size: int) -> dict[Strategy, torch.Tensor]:
     # NB: Client code relies on request indices in returned torch.Tensor being sorted.
     strategy_dict: dict[Strategy, list[int]] = defaultdict(list)
     for req_index, req in enumerate(requests):
-        strategy_dict[_request_strategy(req)].append(req_index)
+        strategy_dict[_request_strategy(
+            req, vocab_size=vocab_size)].append(req_index)
     return {
         strategy: torch.tensor(indices,
                                pin_memory=pin_memory,
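To illustrate the resolution order implemented in _request_strategy, here is a hypothetical standalone helper (the name resolve, the vocab_size default, and the omission of the greedy short-circuit are assumptions for illustration) together with the strategy tuples it yields:

def resolve(temperature, top_p, top_k, vocab_size=32000):
    # mirror the default resolution above: unset values fall back to "disabled"
    temperature = temperature or 1.0
    top_p = top_p or 1.0
    top_k = top_k or vocab_size
    need_top_k = top_k < vocab_size
    need_top_p = top_p < 1
    if need_top_p and need_top_k:
        return ("top_k_top_p", top_k, top_p, temperature)
    if need_top_p:
        return ("top_p", top_p, temperature)
    if need_top_k:
        return ("top_k", top_k, temperature)
    return ("temperature", temperature)

assert resolve(0.7, 0.9, 40) == ("top_k_top_p", 40, 0.9, 0.7)
assert resolve(None, 0.95, None) == ("top_p", 0.95, 1.0)
assert resolve(1.2, None, 0) == ("temperature", 1.2)  # top_k=0 disables top-k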
@@ -418,23 +447,32 @@ def sample(
 ) -> tuple[torch.Tensor, torch.Tensor]:
     filter_softmax = True
     match strategy:
-        case ("top_k", top_k):
-            tokens, softmax = top_k_sampling_batch(logits, top_k, generator)
+        case ("top_k", top_k, temperature):
+            tokens, softmax = top_k_sampling_batch(logits,
+                                                   top_k=top_k,
+                                                   temperature=temperature,
+                                                   generator=generator)
         case ("top_p", top_p, temperature):
             tokens, softmax = top_p_sampling_batch(
                 logits,
                 top_p=top_p,
                 generator=generator,
-                **(dict(temperature=temperature)
-                   if temperature is not None else dict()))
+                temperature=temperature,
+            )
         case ("top_k_top_p", top_k, top_p, temperature):
             tokens, softmax = top_k_top_p_sampling_batch(
                 logits,
                 top_k=top_k,
                 top_p=top_p,
+                temperature=temperature,
                 generator=generator,
-                **(dict(temperature=temperature)
-                   if temperature is not None else dict()))
+            )
+        case ("temperature", temperature):
+            tokens, softmax = temperature_sampling_batch(
+                logits,
+                temperature=temperature,
+                generator=generator,
+            )
         case ("greedy", None):
             tokens, softmax = greedy_search_sampling_batch(
                 logits, softmax_indices=softmax_indices)
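The generator argument threaded through every non-greedy case above is what makes the stochastic draws reproducible. A small sketch of the underlying torch.multinomial behaviour with an explicit torch.Generator (the seed and probabilities are illustrative):

import torch

probs = torch.tensor([[0.1, 0.2, 0.3, 0.4]])
generator_a = torch.Generator().manual_seed(1234)
generator_b = torch.Generator().manual_seed(1234)
tokens_a = torch.multinomial(probs, num_samples=1, generator=generator_a)
tokens_b = torch.multinomial(probs, num_samples=1, generator=generator_b)
assert torch.equal(tokens_a, tokens_b)  # identical seeds give identical draws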
@@ -1323,7 +1361,7 @@ def _sample_batched_by_strategy(
                                           dim=-1)

         requests_by_strategy = _group_requests_by_sampling_strategy(
-            requests, pin_memory=True)
+            requests, pin_memory=True, vocab_size=logits_cuda.size(1))
         generator_cuda = self.get_generator(cuda_device)

         # FIXME: This check should/could be performed in ModelDrafter.prepare_draft_tokens