@@ -52,50 +52,70 @@ def __init__(
         self,
         parent,
         ignore_index=-100,
-        image_token_index=0,
+        pad_token_id=2,
         projector_hidden_act="gelu",
-        seq_length=25,
+        seq_length=20,
         vision_feature_select_strategy="default",
         vision_feature_layer=-1,
         projection_dim=32,
         is_training=False,
         use_cache=False,
         vlm_config={
             "_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
-            "bos_token_id": 151643,
-            "eos_token_id": 151645,
+            "bos_token_id": 0,
+            "eos_token_id": 1,
+            "vision_start_token_id": 3,
+            "image_token_id": 4,
+            "video_token_id": 5,
             "hidden_size": 1536,
-            "image_token_id": 151655,
             "intermediate_size": 2,
             "max_window_layers": 2,
             "model_type": "qwen2_vl",
             "num_attention_heads": 2,
             "num_hidden_layers": 2,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-06,
-            "rope_scaling": {"mrope_section": [16, 24, 24], "rope_type": "default", "type": "default"},
+            "rope_scaling": {"mrope_section": [1, 1, 1], "rope_type": "default", "type": "default"},
             "sliding_window": 32768,
             "tie_word_embeddings": True,
             "torch_dtype": "bfloat16",
-            "video_token_id": 151656,
-            "vision_config": {"hidden_size": 2, "in_chans": 3, "spatial_patch_size": 14},
+            "vision_config": {
+                "depth": 2,
+                "embed_dim": 32,
+                "hidden_act": "quick_gelu",
+                "hidden_size": 32,
+                "mlp_ratio": 4,
+                "num_heads": 4,
+                "patch_size": 14,
+                "in_chans": 3,
+                "spatial_merge_size": 1,
+                "temporal_patch_size": 1,
+            },
             "vision_end_token_id": 151653,
-            "vision_start_token_id": 151652,
             "vision_token_id": 151654,
             "vocab_size": 99,
         },
-        embedding_dim=128,
+        embedding_dim=32,
     ):
         self.parent = parent
         self.ignore_index = ignore_index
+        self.pad_token_id = pad_token_id
+
         # `image_token_index` is set to 0 to pass "resize_embeddings" test, do not modify
-        self.image_token_index = image_token_index
+        self.image_token_index = 0
+
+        self.image_token_id = vlm_config["image_token_id"]
+        self.video_token_id = vlm_config["video_token_id"]
+        self.pad_token_id = vlm_config["eos_token_id"]
         self.projector_hidden_act = projector_hidden_act
         self.vision_feature_select_strategy = vision_feature_select_strategy
         self.vision_feature_layer = vision_feature_layer
-        self.seq_length = seq_length
+
+        self.image_size = 14
+        self.num_image_tokens = 32
+
+        self.seq_length = seq_length + self.num_image_tokens
         self.projection_dim = projection_dim
-        self.pad_token_id = vlm_config["eos_token_id"]

         self.num_hidden_layers = vlm_config["num_hidden_layers"]
         self.vocab_size = vlm_config["vocab_size"]
@@ -105,7 +125,7 @@ def __init__(

         self.batch_size = 3
         self.num_channels = vlm_config["vision_config"]["in_chans"]
-        self.image_size = 56
+
         self.encoder_seq_length = seq_length
         self.use_cache = use_cache

@@ -119,15 +139,23 @@ def get_config(self):
         )

     def prepare_config_and_inputs(self):
+        # pixel_values = floats_tensor(
+        #     [
+        #         self.batch_size,
+        #         self.num_channels,
+        #         self.image_size,
+        #         self.image_size,
+        #     ]
+        # )
+        config = self.get_config()
+        patch_size = config.vlm_config.vision_config.patch_size
+        temporal_patch_size = config.vlm_config.vision_config.temporal_patch_size
         pixel_values = floats_tensor(
             [
-                self.batch_size,
-                self.num_channels,
-                self.image_size,
-                self.image_size,
+                self.batch_size * (self.image_size**2) // (patch_size**2),
+                self.num_channels * (patch_size**2) * temporal_patch_size,
             ]
         )
-        config = self.get_config()

         return config, pixel_values

@@ -137,16 +165,16 @@ def prepare_config_and_inputs_for_common(self):
         input_ids = (
             ids_tensor(shape=[self.batch_size, self.seq_length], vocab_size=config.vlm_config.vocab_size - 1) + 1
         )
-        attention_mask = input_ids.ne(1).to(torch_device)
+        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

-        # (image_size // image_patch // spatial_merge_size) ** 2 = (56 // 14 // 2) ** 2 = 4
-        # Set the 4 first tokens to be image, and ensure that no other tokens are image tokens.
         # Do not change this unless you modified image size or patch size.
-        input_ids[input_ids == self.image_token_index] = self.pad_token_id
-        input_ids[:, :4] = self.image_token_index
+        input_ids[:, -1] = self.pad_token_id
+        input_ids[input_ids == self.video_token_id] = self.pad_token_id
+        input_ids[input_ids == self.image_token_id] = self.pad_token_id
+        input_ids[:, self.num_image_tokens] = self.image_token_id

         # Hardcoded image grid size: do not change unless you modified image size or patch size!
-        image_grid_thw = torch.tensor([1, 4, 4]).repeat(self.batch_size, 1)
+        image_grid_thw = torch.tensor([1, 1, 1]).repeat(self.batch_size, 1)
         inputs_dict = {
             "input_ids": input_ids,
             "pixel_values": pixel_values,
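As a sanity check on the new dummy inputs, here is a small standalone sketch of the flattened patch layout the patched prepare_config_and_inputs now builds, where pixel_values has shape (num_patches, patch_dim) rather than (batch, channels, height, width). The literal values below simply mirror the tester defaults above (batch_size=3, image_size=14, num_channels=3, patch_size=14, temporal_patch_size=1); they are assumptions for illustration, not library constants.

# Hypothetical standalone check of the pixel_values shape used in the test above.
batch_size = 3
num_channels = 3
image_size = 14          # one dummy image is exactly one patch wide and tall
patch_size = 14
temporal_patch_size = 1

patches_per_image = (image_size**2) // (patch_size**2)             # 196 // 196 = 1
num_patches = batch_size * patches_per_image                        # 3 * 1 = 3
patch_dim = num_channels * (patch_size**2) * temporal_patch_size    # 3 * 196 * 1 = 588

# Matches the floats_tensor([...]) call in prepare_config_and_inputs: shape (3, 588).
print(num_patches, patch_dim)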
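The hard-coded image_grid_thw has to stay consistent with that patch count: with a grid of (t, h, w) = (1, 1, 1) and spatial_merge_size = 1, each sample contributes one image embedding, so prepare_config_and_inputs_for_common writes exactly one image placeholder token per row of input_ids. A minimal sketch of that bookkeeping, again using the tester defaults as assumed values and the Qwen2-VL convention (as I read it) of prod(grid_thw) // spatial_merge_size**2 placeholder tokens per image:

import torch

batch_size = 3
seq_length = 20 + 32                    # seq_length + num_image_tokens in the tester
image_token_id = 4                      # placeholder id from vlm_config
pad_token_id = 1                        # eos_token_id reused as pad
spatial_merge_size = 1

image_grid_thw = torch.tensor([1, 1, 1]).repeat(batch_size, 1)
# Placeholder tokens the model will replace with image embeddings, per sample:
tokens_per_image = image_grid_thw.prod(dim=-1) // (spatial_merge_size**2)   # tensor([1, 1, 1])

input_ids = torch.randint(5, 99, (batch_size, seq_length))
input_ids[input_ids == image_token_id] = pad_token_id    # scrub accidental placeholders
input_ids[:, 32] = image_token_id                         # one placeholder per row, as in the test

# One placeholder per row matches tokens_per_image, so the scatter of image
# embeddings into the text sequence lines up.
assert (input_ids == image_token_id).sum(dim=-1).tolist() == tokens_per_image.tolist()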