From 0d5b43d133dcc80d59eba3dd25f113f1894001f2 Mon Sep 17 00:00:00 2001
From: Harry Mellor <19981378+hmellor@users.noreply.github.com>
Date: Fri, 23 May 2025 13:49:51 +0100
Subject: [PATCH] Include private attributes in API documentation

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
---
 mkdocs.yaml                                      |  1 +
 .../layers/rejection_sampler.py                  | 35 ++++++------
 .../layers/typical_acceptance_sampler.py         | 56 ++++++++-----------
 3 files changed, 43 insertions(+), 49 deletions(-)

diff --git a/mkdocs.yaml b/mkdocs.yaml
index a1c6319bb008..b6fabbeed15a 100644
--- a/mkdocs.yaml
+++ b/mkdocs.yaml
@@ -66,6 +66,7 @@ plugins:
           options:
             show_symbol_type_heading: true
             show_symbol_type_toc: true
+            filters: []
             summary:
               modules: true
             show_if_no_docstring: true
diff --git a/vllm/model_executor/layers/rejection_sampler.py b/vllm/model_executor/layers/rejection_sampler.py
index af82b9dc93b7..3db73495827c 100644
--- a/vllm/model_executor/layers/rejection_sampler.py
+++ b/vllm/model_executor/layers/rejection_sampler.py
@@ -262,16 +262,16 @@ def _get_accepted(

         True, then a token can be accepted, else it should be rejected.

-        Given {math}`q(\hat{x}_{n+1}|x_1, \dots, x_n)`, the probability of
-        {math}`\hat{x}_{n+1}` given context {math}`x_1, \dots, x_n` according
-        to the target model, and {math}`p(\hat{x}_{n+1}|x_1, \dots, x_n)`, the
+        Given $q(\hat{x}_{n+1}|x_1, \dots, x_n)$, the probability of
+        $\hat{x}_{n+1}$ given context $x_1, \dots, x_n$ according
+        to the target model, and $p(\hat{x}_{n+1}|x_1, \dots, x_n)$, the
         same conditional probability according to the draft model, the token
         is accepted with probability:

-        :::{math}
+        $$
         \min\left(1, \frac{q(\hat{x}_{n+1}|x_1, \dots, x_n)}
                          {p(\hat{x}_{n+1}|x_1, \dots, x_n)}\right)
-        :::
+        $$

         This implementation does not apply causality. When using the output,
         if a token is rejected, subsequent tokens should not be used.
@@ -314,30 +314,31 @@ def _get_recovered_probs(
         target model is recovered (within hardware numerics).

         The probability distribution used in this rejection case is constructed
-        as follows. Given {math}`q(x|x_1, \dots, x_n)`, the probability of
-        {math}`x` given context {math}`x_1, \dots, x_n` according to the target
-        model and {math}`p(x|x_1, \dots, x_n)`, the same conditional probability
+        as follows. Given $q(x|x_1, \dots, x_n)$, the probability of
+        $x$ given context $x_1, \dots, x_n$ according to the target
+        model and $p(x|x_1, \dots, x_n)$, the same conditional probability
         according to the draft model:

-        :::{math}
+        $$
         x_{n+1} \sim (q(x|x_1, \dots, x_n) - p(x|x_1, \dots, x_n))_+
-        :::
+        $$

-        where {math}`(f(x))_+` is defined as:
+        where $(f(x))_+$ is defined as:

-        :::{math}
+        $$
         (f(x))_+ = \frac{\max(0, f(x))}{\sum_x \max(0, f(x))}
-        :::
+        $$

         See https://github.com/vllm-project/vllm/pull/2336 for a visualization
         of the draft, target, and recovered probability distributions.

         Returns a tensor of shape [batch_size, k, vocab_size].

-        Note: This batches operations on GPU and thus constructs the recovered
-        distribution for all tokens, even if they are accepted. This causes
-        division-by-zero errors, so we use self._smallest_positive_value to
-        avoid that. This introduces some drift to the distribution.
+        Note:
+            This batches operations on GPU and thus constructs the recovered
+            distribution for all tokens, even if they are accepted. This causes
+            division-by-zero errors, so we use self._smallest_positive_value to
+            avoid that. This introduces some drift to the distribution.
         """

         _, k, _ = draft_probs.shape
diff --git a/vllm/model_executor/layers/typical_acceptance_sampler.py b/vllm/model_executor/layers/typical_acceptance_sampler.py
index 527a301cd8e2..a14c86148e73 100644
--- a/vllm/model_executor/layers/typical_acceptance_sampler.py
+++ b/vllm/model_executor/layers/typical_acceptance_sampler.py
@@ -93,29 +93,27 @@ def _evaluate_accepted_tokens(self, target_probs, draft_token_ids):
         Evaluates and returns a mask of accepted tokens based on the
         posterior probabilities.

-        Parameters:
-        ----------
-        target_probs : torch.Tensor
-            A tensor of shape (batch_size, k, vocab_size) representing
-            the probabilities of each token in the vocabulary for each
-            position in the proposed sequence. This is the distribution
-            generated by the target model.
-        draft_token_ids : torch.Tensor
-            A tensor of shape (batch_size, k) representing the proposed
-            token ids.
+        Args:
+            target_probs (torch.Tensor): A tensor of shape
+                (batch_size, k, vocab_size) representing the probabilities of
+                each token in the vocabulary for each position in the proposed
+                sequence. This is the distribution generated by the target
+                model.
+            draft_token_ids (torch.Tensor): A tensor of shape (batch_size, k)
+                representing the proposed token ids.

         A draft token_id x_{n+k} is accepted if it satisfies the
         following condition

-        :::{math}
+        $$
         p_{\text{original}}(x_{n+k} | x_1, x_2, \dots, x_{n+k-1}) >
         \min \left( \epsilon, \delta * \exp \left(
         -H(p_{\text{original}}(
         \cdot | x_1, x_2, \ldots, x_{n+k-1})) \right) \right)
-        :::
+        $$

-        where {math}`p_{\text{original}}` corresponds to target_probs
-        and {math}`\epsilon` and {math}`\delta` correspond to hyperparameters
+        where $p_{\text{original}}$ corresponds to target_probs
+        and $\epsilon$ and $\delta$ correspond to hyperparameters
         specified using self._posterior_threshold and self._posterior_alpha

         This method computes the posterior probabilities for the given
@@ -126,13 +124,10 @@ def _evaluate_accepted_tokens(self, target_probs, draft_token_ids):
         returns a boolean mask indicating which tokens can be accepted.

         Returns:
-        -------
-        torch.Tensor
-            A boolean tensor of shape (batch_size, k) where each element
-            indicates whether the corresponding draft token has been accepted
-            or rejected. True indicates acceptance and false indicates
-            rejection.
-
+            torch.Tensor: A boolean tensor of shape (batch_size, k) where each
+                element indicates whether the corresponding draft token has
+                been accepted or rejected. True indicates acceptance and false
+                indicates rejection.
         """
         device = target_probs.device
         candidates_prob = torch.gather(
@@ -156,17 +151,14 @@ def _get_recovered_token_ids(self, target_probs):
         The recovered token ids will fill the first unmatched token by the
         target token.

-        Parameters
-        ----------
-        target_probs : torch.Tensor
-            A tensor of shape (batch_size, k, vocab_size) containing
-            the target probability distribution
-
-        Returns
-        -------
-        torch.Tensor
-            A tensor of shape (batch_size, k) with the recovered token
-            ids which are selected from target probs.
+        Args:
+            target_probs (torch.Tensor): A tensor of shape
+                (batch_size, k, vocab_size) containing the target probability
+                distribution.
+
+        Returns:
+            torch.Tensor: A tensor of shape (batch_size, k) with the recovered
+                token ids which are selected from target probs.

         """
         max_indices = torch.argmax(target_probs, dim=-1)