diff --git a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py index 06dd336bf9cf..d083ece892d5 100644 --- a/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py +++ b/vllm/entrypoints/openai/tool_parsers/minimax_m2_tool_parser.py @@ -2,11 +2,12 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import json -import re import uuid from collections.abc import Sequence from typing import Any +import regex as re + from vllm.entrypoints.openai.protocol import ( ChatCompletionRequest, DeltaFunctionCall, diff --git a/vllm/model_executor/layers/utils.py b/vllm/model_executor/layers/utils.py index 925f9ac0a16e..1bd1159b321e 100644 --- a/vllm/model_executor/layers/utils.py +++ b/vllm/model_executor/layers/utils.py @@ -74,6 +74,10 @@ def apply_penalties( frequency_penalties: The frequency penalties of shape (num_seqs, ) repetition_penalties: The repetition penalties of shape (num_seqs, ) """ + if prompt_tokens_tensor is None or output_tokens_tensor is None: + # If either tensor is None, return logits unchanged + # (cannot apply penalties without token information) + return logits num_seqs, vocab_size = logits.shape _, prompt_mask = get_token_bin_counts_and_mask( prompt_tokens_tensor, vocab_size, num_seqs diff --git a/vllm/utils/torch_utils.py b/vllm/utils/torch_utils.py index adcacb34cb7c..e09e029564cf 100644 --- a/vllm/utils/torch_utils.py +++ b/vllm/utils/torch_utils.py @@ -317,14 +317,23 @@ def make_tensor_with_pad( The padding is applied to the end of each inner list until it reaches `max_len`. """ - np_dtype = TORCH_DTYPE_TO_NUMPY_DTYPE[dtype] - padded_x = make_ndarray_with_pad(x, pad, np_dtype, max_len=max_len) + if max_len is None: + max_len = max(len(row) for row in x) if x else 0 + + padded_tensor = torch.full( + (len(x), max_len), fill_value=pad, dtype=dtype, device=device + ) + + for i, row in enumerate(x): + row_len = len(row) + if row_len > 0: + row_tensor = torch.as_tensor(row, dtype=dtype, device=device) + padded_tensor[i, :row_len] = row_tensor - tensor = torch.from_numpy(padded_x).to(device) - if pin_memory: - tensor = tensor.pin_memory() + if pin_memory and padded_tensor.device.type == "cpu": + return padded_tensor.pin_memory() - return tensor + return padded_tensor prev_set_stream = torch.cuda.set_stream diff --git a/vllm/v1/sample/sampler.py b/vllm/v1/sample/sampler.py index 39c63fe31ad2..7b45d6803548 100644 --- a/vllm/v1/sample/sampler.py +++ b/vllm/v1/sample/sampler.py @@ -305,7 +305,6 @@ def apply_penalties( if sampling_metadata.no_penalties: return logits - assert sampling_metadata.prompt_token_ids is not None return apply_all_penalties( logits, sampling_metadata.prompt_token_ids,