From f7b66a96758f2e87bd759c07db02f44dcff5a6ef Mon Sep 17 00:00:00 2001 From: jp1924 Date: Tue, 3 Dec 2024 01:02:37 +0000 Subject: [PATCH 01/12] Add: num_additional_image_tokens to models --- src/transformers/models/llava/configuration_llava.py | 3 +++ src/transformers/models/llava/modeling_llava.py | 4 +++- .../models/llava_next/configuration_llava_next.py | 3 +++ src/transformers/models/llava_next/modeling_llava_next.py | 3 ++- .../llava_next_video/configuration_llava_next_video.py | 2 ++ .../models/llava_next_video/modeling_llava_next_video.py | 3 ++- .../llava_onevision/configuration_llava_onevision.py | 2 ++ .../models/llava_onevision/modeling_llava_onevision.py | 5 +++-- src/transformers/models/vipllava/configuration_vipllava.py | 2 ++ src/transformers/models/vipllava/modeling_vipllava.py | 7 ++++++- 10 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index 05034f5cfcf6..673d6cddfdad 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -50,6 +50,7 @@ class LlavaConfig(PretrainedConfig): The index of the layer to select the vision feature. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. + num_additional_image_tokens (``, *optional*, defaults to 0): Example: @@ -85,6 +86,7 @@ def __init__( vision_feature_select_strategy="default", vision_feature_layer=-2, image_seq_length=576, + num_additional_image_tokens=0, **kwargs, ): self.ignore_index = ignore_index @@ -127,5 +129,6 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config + self.num_additional_image_tokens = num_additional_image_tokens super().__init__(**kwargs) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index e8536ee50f94..df0d91e4243e 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -240,6 +240,8 @@ def __init__(self, config: LlavaConfig): self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self.num_additional_image_tokens = config.num_additional_image_tokens + self.post_init() def get_input_embeddings(self): @@ -291,7 +293,7 @@ def get_image_features( # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. selected_image_feature = image_outputs.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] + selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature else: diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index 54616edbf96d..09f2475f5a18 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -55,6 +55,7 @@ class LlavaNextConfig(PretrainedConfig): Whether the model's input and output word embeddings should be tied. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. 
+ num_additional_image_tokens (``, *optional*, defaults to 0): Example: @@ -92,6 +93,7 @@ def __init__( image_grid_pinpoints=None, tie_word_embeddings=False, image_seq_length=576, + num_additional_image_tokens=0, **kwargs, ): self.ignore_index = ignore_index @@ -140,5 +142,6 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config + self.num_additional_image_tokens = num_additional_image_tokens super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 269663c7d614..6ac0f1df4f02 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -356,6 +356,7 @@ def __init__(self, config: LlavaNextConfig): self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides + self.num_additional_image_tokens = config.num_additional_image_tokens self.post_init() @property @@ -749,7 +750,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] + selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 2fe889da6033..3e55f00e1aeb 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -104,6 +104,7 @@ def __init__( spatial_pool_stride=2, image_seq_length=576, video_seq_length=288, + num_additional_image_tokens=0, **kwargs, ): self.video_token_index = video_token_index @@ -156,5 +157,6 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config + self.num_additional_image_tokens = num_additional_image_tokens super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index b0a20d6c5ccd..0204ff99af22 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -396,6 +396,7 @@ def __init__( self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides + self.num_additional_image_tokens = config.num_additional_image_tokens self.vision_resampler = LlavaNextVideoPooler(config) self.post_init() @@ -782,7 +783,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = 
image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] + selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index 46b65b35b1a5..de61e86bfcb4 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -95,6 +95,7 @@ def __init__( vision_aspect_ratio="anyres_max_9", image_grid_pinpoints=None, tie_word_embeddings=False, + num_additional_image_tokens=0, **kwargs, ): self.image_token_index = image_token_index @@ -179,5 +180,6 @@ def __init__( text_config = CONFIG_MAPPING["qwen2"]() self.text_config = text_config + self.num_additional_image_tokens = num_additional_image_tokens super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 626db4d96aae..19cd6a0bf26f 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -371,6 +371,7 @@ def __init__(self, config: LlavaOnevisionConfig): self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) + self.num_additional_image_tokens = config.num_additional_image_tokens self.post_init() # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_input_embeddings @@ -525,7 +526,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] + selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) @@ -556,7 +557,7 @@ def get_video_features( selected_video_feature = video_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_video_feature = selected_video_feature[:, 1:] + selected_video_feature = selected_video_feature[:, self.num_additional_image_tokens :] elif vision_feature_select_strategy == "full": selected_video_feature = selected_video_feature video_features = self.multi_modal_projector(selected_video_feature) diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index f26c2b2f50fb..2f3e25f7db87 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -84,6 +84,7 @@ def __init__( projector_layernorm_eps=1e-5, vision_feature_layers=[-2, -5, -8, -11, 6], image_seq_length=576, + num_additional_image_tokens=0, **kwargs, ): self.ignore_index = ignore_index @@ -118,5 +119,6 @@ def __init__( text_config = 
CONFIG_MAPPING["llama"]() self.text_config = text_config + self.num_additional_image_tokens = num_additional_image_tokens super().__init__(**kwargs) diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index b45325d2194e..9471f90d5414 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -243,6 +243,8 @@ def __init__(self, config: VipLlavaConfig): self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 + self.num_additional_image_tokens = config.num_additional_image_tokens + self.post_init() def get_input_embeddings(self): @@ -290,7 +292,10 @@ def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_lay # For VIP-llava, the image features are computed this way # We select the features from index 1: for the layers -2, -5, -8, -11 and 6 - image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers] + image_features = [ + image_outputs.hidden_states[index][:, self.num_additional_image_tokens :] + for index in vision_feature_layers + ] image_features = torch.cat(image_features, dim=-1) image_features = self.multi_modal_projector(image_features) return image_features From 17ca2855b94c7fe560e7872483107a7433b48f1b Mon Sep 17 00:00:00 2001 From: jp1924 Date: Tue, 3 Dec 2024 01:17:01 +0000 Subject: [PATCH 02/12] docs: update docstring for num_additional_image_tokens in configuration files --- src/transformers/models/llava/configuration_llava.py | 4 +++- .../models/llava_next/configuration_llava_next.py | 4 +++- .../models/llava_next_video/configuration_llava_next_video.py | 3 +++ .../models/llava_onevision/configuration_llava_onevision.py | 3 +++ src/transformers/models/vipllava/configuration_vipllava.py | 3 +++ 5 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index 673d6cddfdad..d9b19cecdc39 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -50,7 +50,9 @@ class LlavaConfig(PretrainedConfig): The index of the layer to select the vision feature. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. - num_additional_image_tokens (``, *optional*, defaults to 0): + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. Example: diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index 09f2475f5a18..ba88c5659fed 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -55,7 +55,9 @@ class LlavaNextConfig(PretrainedConfig): Whether the model's input and output word embeddings should be tied. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. 
- num_additional_image_tokens (``, *optional*, defaults to 0): + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. Example: diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 3e55f00e1aeb..b349cfc01360 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -65,6 +65,9 @@ class LlavaNextVideoConfig(PretrainedConfig): Sequence length of one image embedding. video_seq_length (`int`, *optional*, defaults to 288): Sequence length of one video embedding. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. Example: diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index de61e86bfcb4..49afdd14edc5 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -58,6 +58,9 @@ class LlavaOnevisionConfig(PretrainedConfig): of the form `(height, width)`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. Example: diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index 2f3e25f7db87..7410f8f92b9e 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -49,6 +49,9 @@ class VipLlavaConfig(PretrainedConfig): The list of layers to select the vision features from. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. 
Example: From 5093d49cfaccd5c70045d6d792ecc18d1fab400a Mon Sep 17 00:00:00 2001 From: jp1924 Date: Tue, 3 Dec 2024 01:38:54 +0000 Subject: [PATCH 03/12] Add num_additional_image_tokens to LlavaNextVideo model and update feature selection logic --- .../llava_next_video/modeling_llava_next_video.py | 2 +- .../llava_next_video/modular_llava_next_video.py | 10 ++++++++-- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 0204ff99af22..a145883b7f2a 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1149,7 +1149,7 @@ def get_video_features( video_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_video_features = video_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_video_features = selected_video_features[:, 1:] + selected_video_features = selected_video_features[:, self.num_additional_image_tokens :] elif vision_feature_select_strategy == "full": selected_video_features = selected_video_features diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 3d6431d7ea29..4cebb880b911 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -78,6 +78,9 @@ class LlavaNextVideoConfig(PretrainedConfig): Sequence length of one image embedding. video_seq_length (`int`, *optional*, defaults to 288): Sequence length of one video embedding. + num_additional_image_tokens (`int`, *optional*, defaults to 0): + Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other + extra tokens appended, no need to set this arg. 
Example: @@ -117,6 +120,7 @@ def __init__( spatial_pool_stride=2, image_seq_length=576, video_seq_length=288, + num_additional_image_tokens=0, **kwargs, ): self.video_token_index = video_token_index @@ -169,6 +173,7 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config + self.num_additional_image_tokens = num_additional_image_tokens super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -221,6 +226,7 @@ def forward(self, image_features): class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): def __init__(self, config: LlavaNextVideoConfig, **super_kwargs): super().__init__(config, **super_kwargs) + self.num_additional_image_tokens = config.num_additional_image_tokens self.vision_resampler = LlavaNextVideoPooler(config) self.post_init() @@ -268,7 +274,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, 1:] + selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) @@ -298,7 +304,7 @@ def get_video_features( video_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_video_features = video_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_video_features = selected_video_features[:, 1:] + selected_video_features = selected_video_features[:, self.num_additional_image_tokens :] elif vision_feature_select_strategy == "full": selected_video_features = selected_video_features From 264f84892387d1d15029137a7e9e2a2eafcfd171 Mon Sep 17 00:00:00 2001 From: jp1924 Date: Fri, 6 Dec 2024 06:42:00 +0000 Subject: [PATCH 04/12] revert --- src/transformers/models/llava/configuration_llava.py | 5 ----- src/transformers/models/llava/modeling_llava.py | 3 +-- .../models/llava_next/configuration_llava_next.py | 4 ---- src/transformers/models/llava_next/modeling_llava_next.py | 3 +-- .../llava_next_video/configuration_llava_next_video.py | 4 ---- .../models/llava_next_video/modeling_llava_next_video.py | 3 +-- .../models/llava_next_video/modular_llava_next_video.py | 6 ++---- .../models/llava_onevision/configuration_llava_onevision.py | 4 ---- .../models/llava_onevision/modeling_llava_onevision.py | 5 ++--- src/transformers/models/vipllava/configuration_vipllava.py | 5 ----- src/transformers/models/vipllava/modeling_vipllava.py | 6 +----- 11 files changed, 8 insertions(+), 40 deletions(-) diff --git a/src/transformers/models/llava/configuration_llava.py b/src/transformers/models/llava/configuration_llava.py index d9b19cecdc39..05034f5cfcf6 100644 --- a/src/transformers/models/llava/configuration_llava.py +++ b/src/transformers/models/llava/configuration_llava.py @@ -50,9 +50,6 @@ class LlavaConfig(PretrainedConfig): The index of the layer to select the vision feature. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other - extra tokens appended, no need to set this arg. 
Example: @@ -88,7 +85,6 @@ def __init__( vision_feature_select_strategy="default", vision_feature_layer=-2, image_seq_length=576, - num_additional_image_tokens=0, **kwargs, ): self.ignore_index = ignore_index @@ -131,6 +127,5 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config - self.num_additional_image_tokens = num_additional_image_tokens super().__init__(**kwargs) diff --git a/src/transformers/models/llava/modeling_llava.py b/src/transformers/models/llava/modeling_llava.py index df0d91e4243e..ea04d4ce8020 100644 --- a/src/transformers/models/llava/modeling_llava.py +++ b/src/transformers/models/llava/modeling_llava.py @@ -240,7 +240,6 @@ def __init__(self, config: LlavaConfig): self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 - self.num_additional_image_tokens = config.num_additional_image_tokens self.post_init() @@ -293,7 +292,7 @@ def get_image_features( # this is not memory efficient at all (output_hidden_states=True) will save all the hidden stated. selected_image_feature = image_outputs.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] + selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature else: diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index ba88c5659fed..8435093e6e88 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -55,9 +55,6 @@ class LlavaNextConfig(PretrainedConfig): Whether the model's input and output word embeddings should be tied. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other - extra tokens appended, no need to set this arg. 
Example: @@ -144,6 +141,5 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config - self.num_additional_image_tokens = num_additional_image_tokens super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 6ac0f1df4f02..45e6bc9f773c 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -356,7 +356,6 @@ def __init__(self, config: LlavaNextConfig): self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides - self.num_additional_image_tokens = config.num_additional_image_tokens self.post_init() @property @@ -750,7 +749,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] + selected_image_feature = selected_image_feature[:, :] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index b349cfc01360..5c965ad5fc75 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -65,9 +65,6 @@ class LlavaNextVideoConfig(PretrainedConfig): Sequence length of one image embedding. video_seq_length (`int`, *optional*, defaults to 288): Sequence length of one video embedding. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other - extra tokens appended, no need to set this arg. 
Example: @@ -160,6 +157,5 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config - self.num_additional_image_tokens = num_additional_image_tokens super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index a145883b7f2a..1448f41a6895 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -396,7 +396,6 @@ def __init__( self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 self._padding_side = "left" # set it to left by default, user can use setter to change padding_sides - self.num_additional_image_tokens = config.num_additional_image_tokens self.vision_resampler = LlavaNextVideoPooler(config) self.post_init() @@ -783,7 +782,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] + selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 4cebb880b911..59bcea63c7f8 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -173,7 +173,6 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config - self.num_additional_image_tokens = num_additional_image_tokens super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) @@ -226,7 +225,6 @@ def forward(self, image_features): class LlavaNextVideoForConditionalGeneration(LlavaNextForConditionalGeneration): def __init__(self, config: LlavaNextVideoConfig, **super_kwargs): super().__init__(config, **super_kwargs) - self.num_additional_image_tokens = config.num_additional_image_tokens self.vision_resampler = LlavaNextVideoPooler(config) self.post_init() @@ -274,7 +272,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] + selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) @@ -304,7 +302,7 @@ def get_video_features( video_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_video_features = video_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_video_features = selected_video_features[:, self.num_additional_image_tokens :] + selected_video_features = selected_video_features[:, 1:] elif vision_feature_select_strategy == "full": 
selected_video_features = selected_video_features diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index 49afdd14edc5..627bf76a4bb6 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -58,9 +58,6 @@ class LlavaOnevisionConfig(PretrainedConfig): of the form `(height, width)`. tie_word_embeddings (`bool`, *optional*, defaults to `False`): Whether the model's input and output word embeddings should be tied. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other - extra tokens appended, no need to set this arg. Example: @@ -183,6 +180,5 @@ def __init__( text_config = CONFIG_MAPPING["qwen2"]() self.text_config = text_config - self.num_additional_image_tokens = num_additional_image_tokens super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs) diff --git a/src/transformers/models/llava_onevision/modeling_llava_onevision.py b/src/transformers/models/llava_onevision/modeling_llava_onevision.py index 19cd6a0bf26f..626db4d96aae 100644 --- a/src/transformers/models/llava_onevision/modeling_llava_onevision.py +++ b/src/transformers/models/llava_onevision/modeling_llava_onevision.py @@ -371,7 +371,6 @@ def __init__(self, config: LlavaOnevisionConfig): self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) - self.num_additional_image_tokens = config.num_additional_image_tokens self.post_init() # Copied from transformers.models.llava_next.modeling_llava_next.LlavaNextForConditionalGeneration.get_input_embeddings @@ -526,7 +525,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature = selected_image_feature[:, self.num_additional_image_tokens :] + selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) @@ -557,7 +556,7 @@ def get_video_features( selected_video_feature = video_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_video_feature = selected_video_feature[:, self.num_additional_image_tokens :] + selected_video_feature = selected_video_feature[:, 1:] elif vision_feature_select_strategy == "full": selected_video_feature = selected_video_feature video_features = self.multi_modal_projector(selected_video_feature) diff --git a/src/transformers/models/vipllava/configuration_vipllava.py b/src/transformers/models/vipllava/configuration_vipllava.py index 7410f8f92b9e..f26c2b2f50fb 100644 --- a/src/transformers/models/vipllava/configuration_vipllava.py +++ b/src/transformers/models/vipllava/configuration_vipllava.py @@ -49,9 +49,6 @@ class VipLlavaConfig(PretrainedConfig): The list of layers to select the vision features from. image_seq_length (`int`, *optional*, defaults to 576): Sequence length of one image embedding. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). 
If the backbone has no CLS or other - extra tokens appended, no need to set this arg. Example: @@ -87,7 +84,6 @@ def __init__( projector_layernorm_eps=1e-5, vision_feature_layers=[-2, -5, -8, -11, 6], image_seq_length=576, - num_additional_image_tokens=0, **kwargs, ): self.ignore_index = ignore_index @@ -122,6 +118,5 @@ def __init__( text_config = CONFIG_MAPPING["llama"]() self.text_config = text_config - self.num_additional_image_tokens = num_additional_image_tokens super().__init__(**kwargs) diff --git a/src/transformers/models/vipllava/modeling_vipllava.py b/src/transformers/models/vipllava/modeling_vipllava.py index 9471f90d5414..84c9a302772e 100644 --- a/src/transformers/models/vipllava/modeling_vipllava.py +++ b/src/transformers/models/vipllava/modeling_vipllava.py @@ -243,7 +243,6 @@ def __init__(self, config: VipLlavaConfig): self.vocab_size = config.text_config.vocab_size self.language_model = AutoModelForCausalLM.from_config(config.text_config) self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1 - self.num_additional_image_tokens = config.num_additional_image_tokens self.post_init() @@ -292,10 +291,7 @@ def get_image_features(self, pixel_values: torch.FloatTensor, vision_feature_lay # For VIP-llava, the image features are computed this way # We select the features from index 1: for the layers -2, -5, -8, -11 and 6 - image_features = [ - image_outputs.hidden_states[index][:, self.num_additional_image_tokens :] - for index in vision_feature_layers - ] + image_features = [image_outputs.hidden_states[index][:, 1:] for index in vision_feature_layers] image_features = torch.cat(image_features, dim=-1) image_features = self.multi_modal_projector(image_features) return image_features From b666b291c61d74b51847831b9a10e18d6bdf6320 Mon Sep 17 00:00:00 2001 From: jp1924 Date: Fri, 6 Dec 2024 06:42:09 +0000 Subject: [PATCH 05/12] Fix: adjust num_image_tokens calculation in LlavaProcessor --- src/transformers/models/llava/processing_llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 08caa3d1d8a7..bb4a5c7130cd 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -162,7 +162,7 @@ def __call__( width // self.patch_size ) + self.num_additional_image_tokens if self.vision_feature_select_strategy == "default": - num_image_tokens -= self.num_additional_image_tokens + num_image_tokens -= 1 prompt_strings = [] for sample in text: From e17d95e6a675a255e3e5d16e5c47af9299ea34be Mon Sep 17 00:00:00 2001 From: jp1924 Date: Fri, 6 Dec 2024 06:52:22 +0000 Subject: [PATCH 06/12] Remove num_additional_image_tokens initialization from configuration files --- src/transformers/models/gemma/configuration_gemma.py | 1 - src/transformers/models/llava_next/configuration_llava_next.py | 1 - .../models/llava_next_video/configuration_llava_next_video.py | 1 - .../models/llava_next_video/modeling_llava_next_video.py | 2 +- .../models/llava_onevision/configuration_llava_onevision.py | 1 - 5 files changed, 1 insertion(+), 5 deletions(-) diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index e170803cccab..346f386ba698 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -20,7 +20,6 @@ # See the License for the specific language governing 
permissions and # limitations under the License. - from ...configuration_utils import PretrainedConfig diff --git a/src/transformers/models/llava_next/configuration_llava_next.py b/src/transformers/models/llava_next/configuration_llava_next.py index 8435093e6e88..54616edbf96d 100644 --- a/src/transformers/models/llava_next/configuration_llava_next.py +++ b/src/transformers/models/llava_next/configuration_llava_next.py @@ -92,7 +92,6 @@ def __init__( image_grid_pinpoints=None, tie_word_embeddings=False, image_seq_length=576, - num_additional_image_tokens=0, **kwargs, ): self.ignore_index = ignore_index diff --git a/src/transformers/models/llava_next_video/configuration_llava_next_video.py b/src/transformers/models/llava_next_video/configuration_llava_next_video.py index 5c965ad5fc75..2fe889da6033 100644 --- a/src/transformers/models/llava_next_video/configuration_llava_next_video.py +++ b/src/transformers/models/llava_next_video/configuration_llava_next_video.py @@ -104,7 +104,6 @@ def __init__( spatial_pool_stride=2, image_seq_length=576, video_seq_length=288, - num_additional_image_tokens=0, **kwargs, ): self.video_token_index = video_token_index diff --git a/src/transformers/models/llava_next_video/modeling_llava_next_video.py b/src/transformers/models/llava_next_video/modeling_llava_next_video.py index 1448f41a6895..b0a20d6c5ccd 100644 --- a/src/transformers/models/llava_next_video/modeling_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modeling_llava_next_video.py @@ -1148,7 +1148,7 @@ def get_video_features( video_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_video_features = video_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_video_features = selected_video_features[:, self.num_additional_image_tokens :] + selected_video_features = selected_video_features[:, 1:] elif vision_feature_select_strategy == "full": selected_video_features = selected_video_features diff --git a/src/transformers/models/llava_onevision/configuration_llava_onevision.py b/src/transformers/models/llava_onevision/configuration_llava_onevision.py index 627bf76a4bb6..46b65b35b1a5 100644 --- a/src/transformers/models/llava_onevision/configuration_llava_onevision.py +++ b/src/transformers/models/llava_onevision/configuration_llava_onevision.py @@ -95,7 +95,6 @@ def __init__( vision_aspect_ratio="anyres_max_9", image_grid_pinpoints=None, tie_word_embeddings=False, - num_additional_image_tokens=0, **kwargs, ): self.image_token_index = image_token_index From 925eaefb9cb94fe7294dd0867ea098c9b861bf08 Mon Sep 17 00:00:00 2001 From: jp1924 Date: Fri, 6 Dec 2024 07:06:06 +0000 Subject: [PATCH 07/12] Fix test error --- src/transformers/models/llava_next/modeling_llava_next.py | 2 +- src/transformers/models/llava_next/processing_llava_next.py | 2 +- .../models/llava_next_video/modular_llava_next_video.py | 4 ---- 3 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py index 45e6bc9f773c..269663c7d614 100644 --- a/src/transformers/models/llava_next/modeling_llava_next.py +++ b/src/transformers/models/llava_next/modeling_llava_next.py @@ -749,7 +749,7 @@ def get_image_features( image_features = self.vision_tower(pixel_values, output_hidden_states=True) selected_image_feature = image_features.hidden_states[vision_feature_layer] if vision_feature_select_strategy == "default": - selected_image_feature 
= selected_image_feature[:, :] + selected_image_feature = selected_image_feature[:, 1:] elif vision_feature_select_strategy == "full": selected_image_feature = selected_image_feature image_features = self.multi_modal_projector(selected_image_feature) diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 38173cbd861f..5b5a2b4d85d6 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -169,7 +169,7 @@ def __call__( orig_height, orig_width = image_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": - num_image_tokens -= self.num_additional_image_tokens + num_image_tokens -= 1 sample = sample.replace(self.image_token, "" * num_image_tokens, 1) prompt_strings.append(sample) prompt_strings = [sample.replace("", self.image_token) for sample in prompt_strings] diff --git a/src/transformers/models/llava_next_video/modular_llava_next_video.py b/src/transformers/models/llava_next_video/modular_llava_next_video.py index 59bcea63c7f8..3d6431d7ea29 100644 --- a/src/transformers/models/llava_next_video/modular_llava_next_video.py +++ b/src/transformers/models/llava_next_video/modular_llava_next_video.py @@ -78,9 +78,6 @@ class LlavaNextVideoConfig(PretrainedConfig): Sequence length of one image embedding. video_seq_length (`int`, *optional*, defaults to 288): Sequence length of one video embedding. - num_additional_image_tokens (`int`, *optional*, defaults to 0): - Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other - extra tokens appended, no need to set this arg. Example: @@ -120,7 +117,6 @@ def __init__( spatial_pool_stride=2, image_seq_length=576, video_seq_length=288, - num_additional_image_tokens=0, **kwargs, ): self.video_token_index = video_token_index From 0eb8415c04767e738028a31e35ac9b3158edd669 Mon Sep 17 00:00:00 2001 From: jp1924 Date: Fri, 6 Dec 2024 07:11:22 +0000 Subject: [PATCH 08/12] revert --- src/transformers/models/gemma/configuration_gemma.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 346f386ba698..75d0096d4811 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -19,7 +19,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
- from ...configuration_utils import PretrainedConfig From 301a6693634b063eed82ebf3d6fd1f9f968575d8 Mon Sep 17 00:00:00 2001 From: jp1924 Date: Sun, 8 Dec 2024 09:22:25 +0000 Subject: [PATCH 09/12] Fix: adjust num_image_tokens calculation in LlavaNextVideoProcessor --- .../models/llava_next_video/processing_llava_next_video.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index 65195b772407..36a4048463c0 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -196,7 +196,7 @@ def __call__( orig_height, orig_width = image_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) if self.vision_feature_select_strategy == "default": - num_image_tokens -= self.num_additional_image_tokens + num_image_tokens -= 1 sample = sample.replace(self.image_token, "" * num_image_tokens, 1) prompt_strings.append(sample) text = [sample.replace("", self.image_token) for sample in prompt_strings] From 8ee4024d2877ca13e58b993165c5b6eea9aa73a6 Mon Sep 17 00:00:00 2001 From: jp1924 Date: Mon, 9 Dec 2024 00:29:41 +0000 Subject: [PATCH 10/12] fix conflict --- src/transformers/models/gemma/configuration_gemma.py | 1 + 1 file changed, 1 insertion(+) diff --git a/src/transformers/models/gemma/configuration_gemma.py b/src/transformers/models/gemma/configuration_gemma.py index 75d0096d4811..346f386ba698 100644 --- a/src/transformers/models/gemma/configuration_gemma.py +++ b/src/transformers/models/gemma/configuration_gemma.py @@ -19,6 +19,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ from ...configuration_utils import PretrainedConfig From 114283b8109644a7395f889f3ac4887c12522986 Mon Sep 17 00:00:00 2001 From: jp1924 Date: Mon, 9 Dec 2024 12:30:29 +0000 Subject: [PATCH 11/12] Fix: adjust num_image_tokens calculation in VideoLlavaProcessor --- src/transformers/models/video_llava/processing_video_llava.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index 3e1884271efe..52f583716803 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -187,7 +187,7 @@ def __call__( ) + self.num_additional_image_tokens num_video_tokens = num_image_tokens * num_frames if self.vision_feature_select_strategy == "default": - num_image_tokens -= self.num_additional_image_tokens + num_image_tokens -= 1 prompt_strings = [] for sample in text: From bb73040e5f0584cc0250f17846933c36bb7e3f8e Mon Sep 17 00:00:00 2001 From: raushan Date: Wed, 8 Jan 2025 15:50:09 +0100 Subject: [PATCH 12/12] make style --- src/transformers/models/llava/processing_llava.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index ea4349992ad3..630ccdce1434 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -158,8 +158,8 @@ def __call__( pixel_values = image_inputs["pixel_values"] height, width = get_image_size(to_numpy_array(pixel_values[0])) num_image_tokens = (height // self.patch_size) * ( - width // self.patch_size - ) + self.num_additional_image_tokens + width // self.patch_size + ) + self.num_additional_image_tokens if self.vision_feature_select_strategy == "default": num_image_tokens -= 1
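
Note on the logic this series converges on: after the revert in patch 04 and the follow-up processor fixes, the modeling files keep the hard-coded `[:, 1:]` slice for the "default" feature-selection strategy, while the processors use `num_additional_image_tokens` when computing how many image placeholder tokens to expand in the prompt. Below is a minimal Python sketch of that placeholder count, assuming a CLIP-style backbone that prepends a single CLS token; the helper name and standalone form are illustrative and not part of the transformers API.

    # Sketch of the token count computed in processing_llava.py at the end of this series.
    def count_image_placeholder_tokens(
        height: int,
        width: int,
        patch_size: int,
        num_additional_image_tokens: int = 0,
        vision_feature_select_strategy: str = "default",
    ) -> int:
        # One token per vision patch, plus any extra tokens the backbone appends (e.g. CLS).
        num_image_tokens = (height // patch_size) * (width // patch_size) + num_additional_image_tokens
        # The "default" strategy drops one token in get_image_features ([:, 1:]),
        # so the prompt needs one placeholder fewer.
        if vision_feature_select_strategy == "default":
            num_image_tokens -= 1
        return num_image_tokens

    # A 336x336 image with 14x14 patches and one CLS token -> 576 placeholders,
    # matching the image_seq_length=576 default in the configurations above.
    assert count_image_placeholder_tokens(336, 336, 14, num_additional_image_tokens=1) == 576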