1919from  vllm .multimodal  import  MULTIMODAL_REGISTRY 
2020from  vllm .multimodal .inputs  import  (MultiModalFieldConfig , MultiModalKwargs ,
2121                                    NestedTensors )
22- from  vllm .multimodal .parse  import  (ImageSize ,  MultiModalDataItems ,
23-                                    VideoEmbeddingItems ,  VideoProcessorItems )
22+ from  vllm .multimodal .parse  import  (MultiModalDataItems ,  VideoEmbeddingItems ,
23+                                    VideoProcessorItems )
2424from  vllm .multimodal .processing  import  PromptReplacement 
2525from  vllm .multimodal .profiling  import  ProcessorInputs 
2626from  vllm .sequence  import  IntermediateTensors 
@@ -109,7 +109,7 @@ def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
109109
110110    # Based on: https://github.com/huggingface/text-generation-inference/blob/v3.0.1/server/text_generation_server/models/vlm_causal_lm.py#L86 
111111    # with additional logic afterwards taken from LlavaOnevisionProcessor 
112-     def  get_num_unpadded_features (
112+     def  _get_num_unpadded_features (
113113        self ,
114114        * ,
115115        original_height : int ,
@@ -145,23 +145,7 @@ def get_num_unpadded_features(
145145
146146        return  (unpadded_features , newline_features )
147147
148-     def  get_image_size_with_most_features (self ) ->  ImageSize :
149-         hf_config  =  self .get_hf_config ()
150-         largest_feature_size , largest_feature_pinpoint  =  0 , None 
151-         for  (height , width ) in  hf_config .image_grid_pinpoints :
152-             feat_size  =  self .get_num_image_tokens (image_width = width ,
153-                                                   image_height = height )
154-             if  feat_size  >  largest_feature_size :
155-                 largest_feature_size  =  feat_size 
156-                 largest_feature_pinpoint  =  ImageSize (width = width ,
157-                                                      height = height )
158- 
159-         if  largest_feature_size  ==  0  or  largest_feature_pinpoint  is  None :
160-             raise  ValueError ("Cannot have a largest feature size of 0!" )
161- 
162-         return  largest_feature_pinpoint 
163- 
164-     def  get_num_frame_tokens (
148+     def  _get_num_frame_tokens (
165149        self ,
166150        * ,
167151        image_width : int ,
@@ -183,14 +167,14 @@ def get_num_video_tokens(
183167        image_height : int ,
184168        num_frames : int ,
185169    ) ->  int :
186-         num_frame_tokens  =  self .get_num_frame_tokens (
170+         num_frame_tokens  =  self ._get_num_frame_tokens (
187171            image_width = image_width ,
188172            image_height = image_height ,
189173        )
190174
191175        return  num_frame_tokens  *  num_frames  +  1   # Newline token 
192176
193-     def  get_max_video_frames (self , max_tokens : int ) ->  int :
177+     def  _get_max_video_frames (self , max_tokens : int ) ->  int :
194178        target_width , target_height  =  self .get_image_size_with_most_features ()
195179
196180        num_frames  =  0 
@@ -210,14 +194,14 @@ def get_max_video_frames(self, max_tokens: int) -> int:
210194
211195        return  num_frames 
212196
213-     def  get_max_num_frames (self , seq_len : int ) ->  int :
197+     def  get_num_frames_with_most_features (self , seq_len : int ) ->  int :
214198        mm_config  =  self .ctx .get_mm_config ()
215199        max_images  =  mm_config .limit_per_prompt .get ("image" , 1 )
216200        max_videos  =  mm_config .limit_per_prompt .get ("video" , 1 )
217201
218202        max_image_tokens  =  self .get_max_image_tokens () *  max_images 
219-         max_total_frames  =  self .get_max_video_frames (seq_len  - 
220-                                                      max_image_tokens )
203+         max_total_frames  =  self ._get_max_video_frames (seq_len  - 
204+                                                        max_image_tokens )
221205        max_frames_per_video  =  min (max_total_frames  //  max (max_videos , 1 ),
222206                                   _MAX_FRAMES_PER_VIDEO )
223207
@@ -229,7 +213,7 @@ def get_max_video_tokens(self, seq_len: int) -> int:
229213        return  self .get_num_video_tokens (
230214            image_width = target_width ,
231215            image_height = target_height ,
232-             num_frames = self .get_max_num_frames (seq_len ),
216+             num_frames = self .get_num_frames_with_most_features (seq_len ),
233217        )
234218
235219
@@ -247,8 +231,11 @@ def get_dummy_processor_inputs(
247231        processor  =  self .info .get_hf_processor ()
248232        image_token  =  processor .image_token 
249233        video_token  =  processor .video_token 
234+ 
250235        target_width , target_height  =  \
251236            self .info .get_image_size_with_most_features ()
237+         target_num_frames  =  \
238+             self .info .get_num_frames_with_most_features (seq_len )
252239
253240        mm_data  =  {
254241            "image" :
@@ -259,7 +246,7 @@ def get_dummy_processor_inputs(
259246            self ._get_dummy_videos (
260247                width = target_width ,
261248                height = target_height ,
262-                 num_frames = self . info . get_max_num_frames ( seq_len ) ,
249+                 num_frames = target_num_frames ,
263250                num_videos = num_videos ,
264251            )
265252        }
0 commit comments