Skip to content

Commit 4c34704

Browse files
Isotr0py and ywang96
authored and committed
[VLM] Update Qwen3-VL max_num_video_tokens calculation for configurable video profiling (#25557)
Signed-off-by: Isotr0py <[email protected]> Signed-off-by: Roger Wang <[email protected]> Co-authored-by: Roger Wang <[email protected]> Signed-off-by: simon-mo <[email protected]>
1 parent 19e7ab7 commit 4c34704

File tree

2 files changed

+74
-9
lines changed

2 files changed

+74
-9
lines changed

vllm/model_executor/models/qwen2_vl.py

Lines changed: 9 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@
7979
logger = init_logger(__name__)
8080

8181
# For profile run
82-
_MAX_FRAMES_PER_VIDEO = 32
82+
_MAX_FRAMES_PER_VIDEO = 14
8383

8484
# === Vision Inputs === #
8585

@@ -932,6 +932,7 @@ def get_num_image_tokens(
932932
_, num_image_tokens = self._get_vision_info(
933933
image_width=image_width,
934934
image_height=image_height,
935+
num_frames=1,
935936
image_processor=image_processor,
936937
)
937938
return num_image_tokens
@@ -956,6 +957,7 @@ def get_image_size_with_most_features(self) -> ImageSize:
956957
max_image_size, _ = self._get_vision_info(
957958
image_width=9999999,
958959
image_height=9999999,
960+
num_frames=1,
959961
image_processor=None,
960962
)
961963
return max_image_size
@@ -969,10 +971,12 @@ def get_max_image_tokens(self) -> int:
969971
image_processor=None,
970972
)
971973

972-
def _get_max_video_frames(self, max_tokens: int) -> int:
974+
def _get_max_video_frames(self,
975+
max_tokens: int,
976+
start_num_frames: int = 1) -> int:
973977
target_width, target_height = self.get_image_size_with_most_features()
974978

975-
num_frames = 0
979+
num_frames = start_num_frames
976980

977981
while True:
978982
next_num_frames = num_frames + 1
@@ -994,12 +998,13 @@ def get_num_frames_with_most_features(
994998
self,
995999
seq_len: int,
9961000
mm_counts: Mapping[str, int],
1001+
max_frames_per_video: int = _MAX_FRAMES_PER_VIDEO,
9971002
) -> int:
9981003
max_videos = mm_counts.get("video", 0)
9991004

10001005
max_total_frames = self._get_max_video_frames(seq_len)
10011006
max_frames_per_video = min(max_total_frames // max(max_videos, 1),
1002-
_MAX_FRAMES_PER_VIDEO)
1007+
max_frames_per_video)
10031008

10041009
return max(max_frames_per_video, 1)
10051010

vllm/model_executor/models/qwen3_vl.py

Lines changed: 65 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,11 +33,14 @@
3333
import torch.nn.functional as F
3434
from transformers import BatchFeature
3535
from transformers.models.qwen2_vl import Qwen2VLImageProcessorFast
36-
from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
36+
from transformers.models.qwen2_vl.image_processing_qwen2_vl import (
37+
smart_resize as image_smart_resize)
3738
from transformers.models.qwen3_vl import (Qwen3VLProcessor,
3839
Qwen3VLVideoProcessor)
3940
from transformers.models.qwen3_vl.configuration_qwen3_vl import (
4041
Qwen3VLConfig, Qwen3VLVisionConfig)
42+
from transformers.models.qwen3_vl.video_processing_qwen3_vl import (
43+
smart_resize as video_smart_resize)
4144
from transformers.video_utils import VideoMetadata
4245

4346
from vllm.attention.layer import check_upstream_fa_availability
@@ -84,6 +87,9 @@
8487

8588
logger = init_logger(__name__)
8689

90+
# Official recommended max pixels is 24576 * 32 * 32
91+
_MAX_FRAMES_PER_VIDEO = 24576
92+
8793

8894
class Qwen3_VisionPatchEmbed(nn.Module):
8995

@@ -592,24 +598,39 @@ def _get_vision_info(
592598
image_height: int,
593599
num_frames: int = 2,
594600
do_resize: bool = True,
595-
image_processor: Optional[Qwen2VLImageProcessorFast],
601+
image_processor: Optional[Union[Qwen2VLImageProcessorFast,
602+
Qwen3VLVideoProcessor]],
596603
) -> tuple[ImageSize, int]:
597-
if image_processor is None:
604+
if image_processor is None and num_frames > 1:
605+
image_processor = self.get_video_processor()
606+
elif image_processor is None:
598607
image_processor = self.get_image_processor()
599608

609+
is_video = isinstance(image_processor, Qwen3VLVideoProcessor)
610+
600611
hf_config = self.get_hf_config()
601612
vision_config = hf_config.vision_config
602613
patch_size = vision_config.patch_size
603614
merge_size = vision_config.spatial_merge_size
604615
temporal_patch_size = vision_config.temporal_patch_size
605616

606617
if do_resize:
618+
if is_video:
619+
smart_resize = video_smart_resize
620+
extra_kwargs = {
621+
"num_frames": num_frames,
622+
"temporal_factor": temporal_patch_size
623+
}
624+
else:
625+
smart_resize = image_smart_resize
626+
extra_kwargs = {}
607627
resized_height, resized_width = smart_resize(
608628
height=image_height,
609629
width=image_width,
610630
factor=patch_size * merge_size,
611631
min_pixels=image_processor.size["shortest_edge"],
612632
max_pixels=image_processor.size["longest_edge"],
633+
**extra_kwargs,
613634
)
614635
preprocessed_size = ImageSize(width=resized_width,
615636
height=resized_height)
@@ -628,6 +649,39 @@ def _get_vision_info(
628649

629650
return preprocessed_size, num_vision_tokens
630651

652+
def _get_max_video_frames(self,
653+
max_tokens: int,
654+
start_num_frames: int = 2) -> int:
655+
return super()._get_max_video_frames(max_tokens,
656+
start_num_frames=start_num_frames)
657+
658+
def get_num_frames_with_most_features(
659+
self,
660+
seq_len: int,
661+
mm_counts: Mapping[str, int],
662+
) -> int:
663+
return super().get_num_frames_with_most_features(
664+
seq_len, mm_counts, max_frames_per_video=_MAX_FRAMES_PER_VIDEO)
665+
666+
def get_max_video_tokens(
667+
self,
668+
seq_len: int,
669+
mm_counts: Mapping[str, int],
670+
) -> int:
671+
target_width, target_height = self.get_image_size_with_most_features()
672+
video_soft_tokens = self.get_num_video_tokens(
673+
image_width=target_width,
674+
image_height=target_height,
675+
num_frames=self.get_num_frames_with_most_features(
676+
seq_len, mm_counts),
677+
image_processor=None,
678+
)
679+
680+
# NOTE: By default in Qwen3-VL, one video token is converted to
681+
# "<{timestamp} seconds>" (on average 9.5 tokens) + vision_start_token + video_token + vision_end_token # noqa: E501
682+
formatted_video_soft_tokens = video_soft_tokens * 12.5
683+
return int(formatted_video_soft_tokens)
684+
631685
def _calculate_timestamps(self, indices: list[int] | torch.Tensor,
632686
video_fps: float, merge_size: int):
633687
if not isinstance(indices, list):
@@ -697,15 +751,21 @@ def get_dummy_mm_data(
697751
self.info.get_image_size_with_most_features())
698752
target_num_frames = self.info.get_num_frames_with_most_features(
699753
seq_len, mm_counts)
754+
target_video_size, _ = self.info._get_vision_info(
755+
image_width=target_width,
756+
image_height=target_height,
757+
num_frames=target_num_frames,
758+
image_processor=self.info.get_video_processor(),
759+
)
700760
return {
701761
"image":
702762
self._get_dummy_images(width=target_width,
703763
height=target_height,
704764
num_images=num_images),
705765
"video":
706766
self._get_dummy_videos(
707-
width=target_width,
708-
height=target_height,
767+
width=target_video_size.width,
768+
height=target_video_size.height,
709769
num_frames=target_num_frames,
710770
num_videos=num_videos,
711771
),

0 commit comments

Comments (0)