@@ -50,6 +50,24 @@ class BaseMultimodalInputProcessor:
5050 models. Specific processors can override these methods if they need custom logic.
5151 """
5252
@property
def get_num_multimodal_tokens(self):
    """Return the wrapped Hugging Face processor's ``_get_num_multimodal_tokens``.

    This is a property, so attribute access yields the callable itself and
    callers invoke it directly, e.g. ``self.get_num_multimodal_tokens(...)``.

    The wrapped processor is looked up on ``self`` under the attribute names
    ``processor`` and then ``_processor``; the first one that exposes
    ``_get_num_multimodal_tokens`` wins.

    Returns:
        The ``_get_num_multimodal_tokens`` attribute of the first matching
        processor object.

    Raises:
        NotImplementedError: If neither ``processor`` nor ``_processor``
            exists on ``self`` with a ``_get_num_multimodal_tokens``
            attribute.
    """
    # A private sentinel distinguishes "attribute missing" from an attribute
    # that is present but set to None, matching what hasattr() would report.
    missing = object()

    # Order matters: ``processor`` takes precedence over ``_processor``.
    for attr_name in ("processor", "_processor"):
        hf_processor = getattr(self, attr_name, missing)
        if hf_processor is missing:
            continue
        token_counter = getattr(hf_processor, "_get_num_multimodal_tokens",
                                missing)
        if token_counter is not missing:
            return token_counter

    raise NotImplementedError(
        f"get_num_multimodal_tokens not implemented for {self.__class__.__name__}. "
        "Please override this method or ensure the processor has _get_num_multimodal_tokens method."
    )
70+
5371 def get_num_tokens_per_image (
5472 self ,
5573 * ,
@@ -60,30 +78,14 @@ def get_num_tokens_per_image(
6078 """
6179 Calculate the number of tokens generated for an image.
6280
63- Default implementation assumes the processor has either:
64- 1. A 'processor' attribute with _get_num_multimodal_tokens method
65- 2. A '_processor' attribute with _get_num_multimodal_tokens method
81+ This (default) method delegates to the Hugging Face processor's '_get_num_multimodal_tokens' method.
82+ Returns the token count for the given image.
6683
67- Override this method for custom implementations .
84+ Subclasses can override this method to provide custom logic to calculate the number of tokens .
6885 """
69- if hasattr (self , 'processor' ) and hasattr (self .processor ,
70- '_get_num_multimodal_tokens' ):
71- image_size = (image_height , image_width )
72- num_image_tokens = self .processor ._get_num_multimodal_tokens (
73- [image_size ], ** kwargs )["num_image_tokens" ][0 ]
74- return num_image_tokens
75- # Check for _processor attribute (e.g., Mistral3)
76- elif hasattr (self , '_processor' ) and hasattr (
77- self ._processor , '_get_num_multimodal_tokens' ):
78- image_size = (image_height , image_width )
79- num_image_tokens = self ._processor ._get_num_multimodal_tokens (
80- [image_size ], ** kwargs )["num_image_tokens" ][0 ]
81- return num_image_tokens
82- else :
83- raise NotImplementedError (
84- f"get_num_tokens_per_image not implemented for { self .__class__ .__name__ } . "
85- "Please override this method or ensure the processor has _get_num_multimodal_tokens method."
86- )
86+ image_size = (image_height , image_width )
87+ return self .get_num_multimodal_tokens ([image_size ],
88+ ** kwargs )["num_image_tokens" ][0 ]
8789
8890 def get_num_tokens_per_video (
8991 self ,
@@ -96,53 +98,23 @@ def get_num_tokens_per_video(
9698 """
9799 Calculate the number of tokens generated for a video.
98100
99- Default implementation assumes the processor has either:
100- 1. A 'processor' attribute with _get_num_multimodal_tokens method
101- 2. A '_processor' attribute with _get_num_multimodal_tokens method
101+ This (default) method delegates to the Hugging Face processor's '_get_num_multimodal_tokens' method.
102+ Returns the token count for the given video.
102103
103- Override this method for custom implementations .
104+ Subclasses can override this method to provide custom logic to calculate the number of tokens .
104105 """
105- if hasattr (self , 'processor' ) and hasattr (self .processor ,
106- '_get_num_multimodal_tokens' ):
107- video_size = (num_frames , video_height , video_width )
108- # Try to get video tokens directly
109- try :
110- num_video_tokens = self .processor ._get_num_multimodal_tokens (
111- video_sizes = [video_size ], ** kwargs )["num_video_tokens" ][0 ]
112- return num_video_tokens
113- except Exception :
114- # Fallback: treat video as sequence of frames
115- num_tokens_per_frame = self .get_num_tokens_per_image (
116- image_width = video_width ,
117- image_height = video_height ,
118- ** kwargs )
119- temporal_patch_size = self .temporal_patch_size if hasattr (
120- self , 'temporal_patch_size' ) else 1
121- return num_tokens_per_frame * num_frames // temporal_patch_size
122- # Check for _processor attribute (e.g., Mistral3)
123- # TODO: unify the naming convention for the processor attribute
124- elif hasattr (self , '_processor' ) and hasattr (
125- self ._processor , '_get_num_multimodal_tokens' ):
126- video_size = (num_frames , video_height , video_width )
127- # Try to get video tokens directly
128- try :
129- num_video_tokens = self ._processor ._get_num_multimodal_tokens (
130- video_sizes = [video_size ], ** kwargs )["num_video_tokens" ][0 ]
131- return num_video_tokens
132- except Exception :
133- # Fallback: treat video as sequence of frames
134- num_tokens_per_frame = self .get_num_tokens_per_image (
135- image_width = video_width ,
136- image_height = video_height ,
137- ** kwargs )
138- temporal_patch_size = self .temporal_patch_size if hasattr (
139- self , 'temporal_patch_size' ) else 1
140- return num_tokens_per_frame * num_frames // temporal_patch_size
141- else :
142- raise NotImplementedError (
143- f"get_num_tokens_per_video not implemented for { self .__class__ .__name__ } . "
144- "Please override this method or ensure the processor has _get_num_multimodal_tokens method."
145- )
106+ video_size = (num_frames , video_height , video_width )
107+ try :
108+ num_video_tokens = self .get_num_multimodal_tokens (
109+ video_sizes = [video_size ], ** kwargs )["num_video_tokens" ][0 ]
110+ return num_video_tokens
111+ except Exception :
112+ # Fallback: treat video as sequence of frames
113+ num_tokens_per_frame = self .get_num_tokens_per_image (
114+ image_width = video_width , image_height = video_height , ** kwargs )
115+ temporal_patch_size = self .temporal_patch_size if hasattr (
116+ self , 'temporal_patch_size' ) else 1
117+ return num_tokens_per_frame * num_frames // temporal_patch_size
146118
147119
148120class DefaultInputProcessor (InputProcessor ):
0 commit comments