diff --git a/src/transformers/models/mt5/configuration_mt5.py b/src/transformers/models/mt5/configuration_mt5.py
index aab93711dfc6..9464979a2b8e 100644
--- a/src/transformers/models/mt5/configuration_mt5.py
+++ b/src/transformers/models/mt5/configuration_mt5.py
@@ -40,8 +40,8 @@ class MT5Config(PretrainedConfig):
         d_model (`int`, *optional*, defaults to 512):
             Size of the encoder layers and the pooler layer.
         d_kv (`int`, *optional*, defaults to 64):
-            Size of the key, query, value projections per attention head. `d_kv` has to be equal to `d_model //
-            num_heads`.
+            Size of the key, query, value projections per attention head. In the conventional context, it is typically expected that `d_kv` equals `d_model // num_heads`.
+            However, in the architecture of mt5-small, `d_kv` is not equal to `d_model // num_heads`. The `inner_dim` of the projection layer will be defined as `num_heads * d_kv`.
         d_ff (`int`, *optional*, defaults to 1024):
             Size of the intermediate feed forward layer in each `T5Block`.
         num_layers (`int`, *optional*, defaults to 8):
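
A minimal sketch of what the revised docstring describes, assuming a standard `transformers` install. `MT5Config`'s defaults mirror mt5-small (`d_model=512`, `num_heads=6`, `d_kv=64`), so the decoupling of `d_kv` from `d_model // num_heads` is visible without loading any checkpoint:

```python
from transformers import MT5Config

# Defaults mirror mt5-small: d_model=512, num_heads=6, d_kv=64
config = MT5Config()

# Conventional expectation: per-head size = d_model // num_heads
print(config.d_model // config.num_heads)  # 85

# mt5-small breaks that convention: d_kv stays at 64
print(config.d_kv)  # 64

# The attention projections therefore use inner_dim = num_heads * d_kv,
# not d_model, as the docstring now states
print(config.num_heads * config.d_kv)  # 384
```

Since `85 != 64`, the old docstring's claim that `d_kv` "has to be" `d_model // num_heads` was misleading for anyone inspecting an mt5-small checkpoint, which is what this change corrects.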