Commit 025ca25

wip: try to mimic modeling test for Qwen2VL
1 parent fc3de6d commit 025ca25

File tree

1 file changed: +53 -25 lines changed

tests/models/colqwen2/test_modeling_colqwen2.py

Lines changed: 53 additions & 25 deletions
@@ -52,50 +52,70 @@ def __init__(
         self,
         parent,
         ignore_index=-100,
-        image_token_index=0,
+        pad_token_id=2,
         projector_hidden_act="gelu",
-        seq_length=25,
+        seq_length=20,
         vision_feature_select_strategy="default",
         vision_feature_layer=-1,
         projection_dim=32,
         is_training=False,
         use_cache=False,
         vlm_config={
             "_name_or_path": "Qwen/Qwen2-VL-2B-Instruct",
-            "bos_token_id": 151643,
-            "eos_token_id": 151645,
+            "bos_token_id": 0,
+            "eos_token_id": 1,
+            "vision_start_token_id": 3,
+            "image_token_id": 4,
+            "video_token_id": 5,
             "hidden_size": 1536,
-            "image_token_id": 151655,
             "intermediate_size": 2,
             "max_window_layers": 2,
             "model_type": "qwen2_vl",
             "num_attention_heads": 2,
             "num_hidden_layers": 2,
             "num_key_value_heads": 2,
             "rms_norm_eps": 1e-06,
-            "rope_scaling": {"mrope_section": [16, 24, 24], "rope_type": "default", "type": "default"},
+            "rope_scaling": {"mrope_section": [1, 1, 1], "rope_type": "default", "type": "default"},
             "sliding_window": 32768,
             "tie_word_embeddings": True,
             "torch_dtype": "bfloat16",
-            "video_token_id": 151656,
-            "vision_config": {"hidden_size": 2, "in_chans": 3, "spatial_patch_size": 14},
+            "vision_config": {
+                "depth": 2,
+                "embed_dim": 32,
+                "hidden_act": "quick_gelu",
+                "hidden_size": 32,
+                "mlp_ratio": 4,
+                "num_heads": 4,
+                "patch_size": 14,
+                "in_chans": 3,
+                "spatial_merge_size": 1,
+                "temporal_patch_size": 1,
+            },
             "vision_end_token_id": 151653,
-            "vision_start_token_id": 151652,
             "vision_token_id": 151654,
             "vocab_size": 99,
         },
-        embedding_dim=128,
+        embedding_dim=32,
     ):
         self.parent = parent
         self.ignore_index = ignore_index
+        self.pad_token_id = pad_token_id
+
         # `image_token_index` is set to 0 to pass "resize_embeddings" test, do not modify
-        self.image_token_index = image_token_index
+        self.image_token_index = 0
+
+        self.image_token_id = vlm_config["image_token_id"]
+        self.video_token_id = vlm_config["video_token_id"]
+        self.pad_token_id = vlm_config["eos_token_id"]
         self.projector_hidden_act = projector_hidden_act
         self.vision_feature_select_strategy = vision_feature_select_strategy
         self.vision_feature_layer = vision_feature_layer
-        self.seq_length = seq_length
+
+        self.image_size = 14
+        self.num_image_tokens = 32
+
+        self.seq_length = seq_length + self.num_image_tokens
         self.projection_dim = projection_dim
-        self.pad_token_id = vlm_config["eos_token_id"]

         self.num_hidden_layers = vlm_config["num_hidden_layers"]
         self.vocab_size = vlm_config["vocab_size"]
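
Note on the shrunken config: every special token id in the new vlm_config (bos 0, eos 1, vision_start 3, image 4, video 5) now sits inside the tiny vocab_size of 99, and the tester's effective sequence length becomes the requested text length plus the positions reserved for image placeholders. A minimal sanity-check sketch of these constants (variable names are illustrative, not part of the test file):

# Illustrative sanity check of the constants introduced in this diff (not in the test file).
vocab_size = 99
special_token_ids = {"bos": 0, "eos": 1, "vision_start": 3, "image": 4, "video": 5}
assert all(token_id < vocab_size for token_id in special_token_ids.values())

seq_length = 20           # text positions requested via the constructor default
num_image_tokens = 32     # extra positions reserved for image placeholders
effective_seq_length = seq_length + num_image_tokens
assert effective_seq_length == 52  # what self.seq_length ends up being
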
@@ -105,7 +125,7 @@ def __init__(

         self.batch_size = 3
         self.num_channels = vlm_config["vision_config"]["in_chans"]
-        self.image_size = 56
+
         self.encoder_seq_length = seq_length
         self.use_cache = use_cache

@@ -119,15 +139,23 @@ def get_config(self):
         )

     def prepare_config_and_inputs(self):
+        # pixel_values = floats_tensor(
+        #     [
+        #         self.batch_size,
+        #         self.num_channels,
+        #         self.image_size,
+        #         self.image_size,
+        #     ]
+        # )
+        config = self.get_config()
+        patch_size = config.vlm_config.vision_config.patch_size
+        temporal_patch_size = config.vlm_config.vision_config.temporal_patch_size
         pixel_values = floats_tensor(
             [
-                self.batch_size,
-                self.num_channels,
-                self.image_size,
-                self.image_size,
+                self.batch_size * (self.image_size**2) // (patch_size**2),
+                self.num_channels * (patch_size**2) * temporal_patch_size,
             ]
         )
-        config = self.get_config()

         return config, pixel_values
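
The rebuilt pixel_values follows Qwen2-VL's flattened patch layout rather than a (batch, channels, height, width) image tensor: each row is one spatio-temporal patch, each column one patch feature. A worked example with the values used in this diff (batch_size 3, in_chans 3, image_size 14, patch_size 14, temporal_patch_size 1); the variable names are illustrative:

# Illustrative shape arithmetic for the flattened patch tensor sampled above.
batch_size, num_channels, image_size = 3, 3, 14
patch_size, temporal_patch_size = 14, 1

num_patches = batch_size * (image_size**2) // (patch_size**2)      # 3 * 196 // 196 = 3
patch_dim = num_channels * (patch_size**2) * temporal_patch_size   # 3 * 196 * 1 = 588
# floats_tensor is therefore asked for shape (3, 588): one 14x14 patch per image in the batch.
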

@@ -137,16 +165,16 @@ def prepare_config_and_inputs_for_common(self):
         input_ids = (
             ids_tensor(shape=[self.batch_size, self.seq_length], vocab_size=config.vlm_config.vocab_size - 1) + 1
         )
-        attention_mask = input_ids.ne(1).to(torch_device)
+        attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=torch_device)

-        # (image_size // image_patch // spatial_merge_size) ** 2 = (56 // 14 // 2) ** 2 = 4
-        # Set the 4 first tokens to be image, and ensure that no other tokens are image tokens.
         # Do not change this unless you modified image size or patch size.
-        input_ids[input_ids == self.image_token_index] = self.pad_token_id
-        input_ids[:, :4] = self.image_token_index
+        input_ids[:, -1] = self.pad_token_id
+        input_ids[input_ids == self.video_token_id] = self.pad_token_id
+        input_ids[input_ids == self.image_token_id] = self.pad_token_id
+        input_ids[:, self.num_image_tokens] = self.image_token_id

         # Hardcoded image grid size: do not change unless you modified image size or patch size!
-        image_grid_thw = torch.tensor([1, 4, 4]).repeat(self.batch_size, 1)
+        image_grid_thw = torch.tensor([1, 1, 1]).repeat(self.batch_size, 1)
         inputs_dict = {
             "input_ids": input_ids,
             "pixel_values": pixel_values,
