@@ -61,12 +61,19 @@ class AltCLIPTextConfig(PretrainedConfig):
         max_position_embeddings (`int`, *optional*, defaults to 514):
             The maximum sequence length that this model might ever be used with. Typically set this to something large
             just in case (e.g., 512 or 1024 or 2048).
-        type_vocab_size (`int`, *optional*, defaults to 2 ):
+        type_vocab_size (`int`, *optional*, defaults to 1):
             The vocabulary size of the `token_type_ids` passed when calling [`AltCLIPTextModel`]
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+        initializer_factor (`float`, *optional*, defaults to 0.02):
+            A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
+            testing).
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
+        pad_token_id (`int`, *optional*, defaults to 1): The id of the *padding* token.
+        bos_token_id (`int`, *optional*, defaults to 0): The id of the *beginning-of-sequence* token.
+        eos_token_id (`Union[int, List[int]]`, *optional*, defaults to 2):
+            The id of the *end-of-sequence* token. Optionally, use a list to set multiple *end-of-sequence* tokens.
         position_embedding_type (`str`, *optional*, defaults to `"absolute"`):
             Type of position embedding. Choose one of `"absolute"`, `"relative_key"`, `"relative_key_query"`. For
             positional embeddings use `"absolute"`. For more information on `"relative_key"`, please refer to
@@ -154,24 +161,28 @@ class AltCLIPVisionConfig(PretrainedConfig):
             Dimensionality of the encoder layers and the pooler layer.
         intermediate_size (`int`, *optional*, defaults to 3072):
             Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the text and vision projection layers.
         num_hidden_layers (`int`, *optional*, defaults to 12):
             Number of hidden layers in the Transformer encoder.
         num_attention_heads (`int`, *optional*, defaults to 12):
             Number of attention heads for each attention layer in the Transformer encoder.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
         image_size (`int`, *optional*, defaults to 224):
             The size (resolution) of each image.
         patch_size (`int`, *optional*, defaults to 32):
             The size (resolution) of each patch.
         hidden_act (`str` or `function`, *optional*, defaults to `"quick_gelu"`):
             The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
             `"relu"`, `"selu"`, `"gelu_new"` and `"quick_gelu"` are supported.
-        layer_norm_eps (`float`, *optional*, defaults to 1e-5 ):
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
             The epsilon used by the layer normalization layers.
         attention_dropout (`float`, *optional*, defaults to 0.0):
             The dropout ratio for the attention probabilities.
         initializer_range (`float`, *optional*, defaults to 0.02):
             The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
-        initializer_factor (`float`` , *optional*, defaults to 1):
+        initializer_factor (`float`, *optional*, defaults to 1.0):
             A factor for initializing all weight matrices (should be kept to 1, used internally for initialization
             testing).

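For quick verification, here is a minimal sketch (assuming a `transformers` release that ships AltCLIP, i.e. v4.26 or later) that instantiates both config classes and prints the fields touched by this docstring fix; the inline comments repeat the default values the updated docstrings claim.

```python
# Minimal sketch: check the documented defaults against the actual config objects.
# Assumes transformers >= 4.26 (the first release that includes AltCLIP).
from transformers import AltCLIPTextConfig, AltCLIPVisionConfig

text_config = AltCLIPTextConfig()
# Text-config fields touched by the first hunk.
print(text_config.type_vocab_size)       # 1
print(text_config.initializer_factor)    # 0.02
print(text_config.layer_norm_eps)        # 1e-05
print(text_config.pad_token_id, text_config.bos_token_id, text_config.eos_token_id)  # 1 0 2

vision_config = AltCLIPVisionConfig()
# Vision-config fields touched by the second hunk.
print(vision_config.projection_dim)      # 512
print(vision_config.num_channels)        # 3
print(vision_config.layer_norm_eps)      # 1e-05
print(vision_config.initializer_factor)  # 1.0
```

If a locally installed version prints different values, the config objects, not the comments above, are authoritative.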