
Commit d2cec09

joaocmd, Rocketknight1, and amyeroberts authored
Add TF swiftformer (#23342)
* Duplicate swiftformer
* Convert SwiftFormerPatchEmbedding
* Convert SwiftFormerEmbeddings
* Convert TFSwiftFormerMlp
* Convert TFSwiftFormerConvEncoder
* Convert TFSwiftFormerLocalRepresentation
* convert TFSwiftFormerEncoderBlock
* Convert SwiftFormerStage
* Convert SwiftFormerEncoder
* Add TFSWiftFormerPreTrainedModel
* Convert SwiftFormerForImageClassification
* Add kwargs and start drop path
* Fix syntax
* Change Model class name
* Add TFSwiftFormer to __init__
* Duplicate test_modeling_swiftformer
* First test conversions
* Change require_torch to require_tf
* Add exports to swiftformer __init__
* Add TFSwiftFormerModel wrapper
* Fix __init__ and run black
* Remove docstring from MainLayer, fix padding
* Use keras.layers.Activation on keras.Sequential
* Fix swiftformer exports
* Fix activation layer from config
* Remove post_inits
* Use tf.keras.layers.ZeroPadding2D
* Convert torch normalize
* Change tf test input shape
* Fix softmax and reduce_sum
* Convert expand_dims and repeat
* Add missing reshape and tranpose
* Simplify TFSwiftFormerEncoderBlock.call
* Fix mismatch in patch embeddings
* Fix expected output shape to match channels last
* Fix swiftformer typo
* Disable test_onnx
* Fix TFSwiftFormerForImageClassification call
* Add unpack inputs
* Convert flatten(2).mean(-1)
* Change vision dummy inputs (to be reviewed)
* Change test_forward_signature to use .call
* Fix @unpack_inputs
* Set return_tensors="tf" and rename class
* Rename wrongly named patch_embeddings layer
* Add serving_output and change dummy_input shape
* Make dimensions BCHW and transpose inside embedding layer
* Change SwiftFormerEncoderBlock
* Fix ruff problems
* Add image size to swiftformer config
* Change tranpose to MainLayer and use -1 for reshape
* Remove serving_outputs and dummy_inputs
* Remove test_initialization test from tf model
* Make Sequential component a separate layer
* Fix layers' names
* Tranpose encoder outputs
* Fix tests and check if hidden states is not None
* Fix TFSwiftFormerForImageClassification
* Run make fixup
* Run make fix-copies
* Update modeling_tf_auto
* Update docs
* Fix modeling auto mapping
* Update modelint_tf_swiftformer docs
* Fill image_size doc and type
* Add reduction=None to loss computation
* Update docs
* make style
* Debug: Delete the tip to see if that changes anything
* Re-add tip
* Remove add_code_sample_docstrings
* Remove unused import
* Get the debug to actually tell us the problem it has with the docs
* Try a substitution to match the PyTorch file?
* Add swiftformer to ignore list
* Add build() methods
* Update copyright year
  Co-authored-by: amyeroberts <[email protected]>
* Remove FIXME comment
* Remove from_pt
* Update copyright year
  Co-authored-by: amyeroberts <[email protected]>
* Rename one-letter variables
* Remove FIXMEs related to momentum
* Remove old TODO comment
* Remove outstanding FIXME comments
* Get dropout rate from config
* Add specific dropout config for MLP
* Add convencoder dropout to config
* Pass config to SwiftFormerDropPath layer
* Fix drop_path variable name and add Adapted from comment
* Run ruff
* Removed copied from comment
* Run fix copies
* Change drop_path to identity to match pt
* Cleanup build() methods and move to new keras imports
* Update docs/source/en/model_doc/swiftformer.md
  Co-authored-by: Matt <[email protected]>
* Raise error if drop_path_rate > 0.0
* Apply suggestions from code review
  Replace (self.dim), with self.dim,
  Co-authored-by: Matt <[email protected]>
* Remove drop_path function
* Add training to TFSwiftFormerEncoder
* Set self.built = True last
  Co-authored-by: amyeroberts <[email protected]>
* Should have been added to previous commit
  Co-authored-by: amyeroberts <[email protected]>
* Apply suggestions from code review
  Co-authored-by: amyeroberts <[email protected]>
* Change default_feature_extractor to default_image_processor
  Co-authored-by: amyeroberts <[email protected]>
* Import Keras from modeling_tf_utils
* Remove relative import
* Run ruff --fix
* Move import keras to tf_available
* Add copied from comment to test_forward_signature
* Reduce batch size and num_labels
* Extract loss logic to hf_compute_loss
* Run ruff format

---------

Co-authored-by: Matt <[email protected]>
Co-authored-by: amyeroberts <[email protected]>
Co-authored-by: Matt <[email protected]>
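Several of the bullets above describe translating PyTorch tensor idioms into TensorFlow ones, for example "Convert flatten(2).mean(-1)" for the final spatial pooling and "Fix expected output shape to match channels last". A minimal sketch of that particular equivalence, using illustrative shapes rather than real model activations:

import numpy as np
import tensorflow as tf
import torch

# Illustrative activations: PyTorch uses channels-first, the TF port channels-last.
x_pt = torch.rand(2, 220, 7, 7)               # (batch, channels, height, width)
pooled_pt = x_pt.flatten(2).mean(-1)          # -> (batch, channels)

x_tf = tf.transpose(tf.constant(x_pt.numpy()), perm=(0, 2, 3, 1))  # -> (batch, height, width, channels)
pooled_tf = tf.reduce_mean(x_tf, axis=(1, 2))                       # -> (batch, channels)

print(np.allclose(pooled_pt.numpy(), pooled_tf.numpy(), atol=1e-6))  # True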
1 parent 21c912e commit d2cec09

File tree

11 files changed (+1244, -20 lines)


docs/source/en/index.md

Lines changed: 1 addition & 1 deletion
@@ -275,7 +275,7 @@ Flax), PyTorch, and/or TensorFlow.
 | [StableLm](model_doc/stablelm) | ✅ | ❌ | ❌ |
 | [Starcoder2](model_doc/starcoder2) | ✅ | ❌ | ❌ |
 | [SuperPoint](model_doc/superpoint) | ✅ | ❌ | ❌ |
-| [SwiftFormer](model_doc/swiftformer) | ✅ | ❌ | ❌ |
+| [SwiftFormer](model_doc/swiftformer) | ✅ | ✅ | ❌ |
 | [Swin Transformer](model_doc/swin) | ✅ | ✅ | ❌ |
 | [Swin Transformer V2](model_doc/swinv2) | ✅ | ❌ | ❌ |
 | [Swin2SR](model_doc/swin2sr) | ✅ | ❌ | ❌ |

docs/source/en/model_doc/swiftformer.md

Lines changed: 11 additions & 1 deletion
@@ -26,7 +26,7 @@ The abstract from the paper is the following:
 
 *Self-attention has become a defacto choice for capturing global context in various vision applications. However, its quadratic computational complexity with respect to image resolution limits its use in real-time applications, especially for deployment on resource-constrained mobile devices. Although hybrid approaches have been proposed to combine the advantages of convolutions and self-attention for a better speed-accuracy trade-off, the expensive matrix multiplication operations in self-attention remain a bottleneck. In this work, we introduce a novel efficient additive attention mechanism that effectively replaces the quadratic matrix multiplication operations with linear element-wise multiplications. Our design shows that the key-value interaction can be replaced with a linear layer without sacrificing any accuracy. Unlike previous state-of-the-art methods, our efficient formulation of self-attention enables its usage at all stages of the network. Using our proposed efficient additive attention, we build a series of models called "SwiftFormer" which achieves state-of-the-art performance in terms of both accuracy and mobile inference speed. Our small variant achieves 78.5% top-1 ImageNet-1K accuracy with only 0.8 ms latency on iPhone 14, which is more accurate and 2x faster compared to MobileViT-v2.*
 
-This model was contributed by [shehan97](https://huggingface.co/shehan97).
+This model was contributed by [shehan97](https://huggingface.co/shehan97). The TensorFlow version was contributed by [joaocmd](https://huggingface.co/joaocmd).
 The original code can be found [here](https://github.com/Amshaker/SwiftFormer).
 
 ## SwiftFormerConfig
@@ -42,3 +42,13 @@ The original code can be found [here](https://github.com/Amshaker/SwiftFormer).
 
 [[autodoc]] SwiftFormerForImageClassification
     - forward
+
+## TFSwiftFormerModel
+
+[[autodoc]] TFSwiftFormerModel
+    - call
+
+## TFSwiftFormerForImageClassification
+
+[[autodoc]] TFSwiftFormerForImageClassification
+    - call
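With the two TF classes documented above, the model page implies the usual TensorFlow image-classification workflow. A hedged usage sketch; the checkpoint id "MBZUAI/swiftformer-xs" and the from_pt fallback are assumptions, not something stated in this diff:

import requests
import tensorflow as tf
from PIL import Image
from transformers import AutoImageProcessor, TFSwiftFormerForImageClassification

image = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)

# Checkpoint id is assumed; pass from_pt=True if only PyTorch weights exist on the Hub.
processor = AutoImageProcessor.from_pretrained("MBZUAI/swiftformer-xs")
model = TFSwiftFormerForImageClassification.from_pretrained("MBZUAI/swiftformer-xs")

inputs = processor(images=image, return_tensors="tf")
logits = model(**inputs).logits
print(model.config.id2label[int(tf.math.argmax(logits, axis=-1)[0])])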

src/transformers/__init__.py

Lines changed: 14 additions & 0 deletions
@@ -4517,6 +4517,14 @@
             "TFSpeech2TextPreTrainedModel",
         ]
     )
+    _import_structure["models.swiftformer"].extend(
+        [
+            "TF_SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+            "TFSwiftFormerForImageClassification",
+            "TFSwiftFormerModel",
+            "TFSwiftFormerPreTrainedModel",
+        ]
+    )
     _import_structure["models.swin"].extend(
         [
             "TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -8901,6 +8909,12 @@
         TFSpeech2TextModel,
         TFSpeech2TextPreTrainedModel,
     )
+    from .models.swiftformer import (
+        TF_SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+        TFSwiftFormerForImageClassification,
+        TFSwiftFormerModel,
+        TFSwiftFormerPreTrainedModel,
+    )
     from .models.swin import (
         TF_SWIN_PRETRAINED_MODEL_ARCHIVE_LIST,
         TFSwinForImageClassification,
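Once these export entries exist, the TF classes are importable from the package root whenever TensorFlow is installed. A small sanity check, illustrative only:

from transformers import is_tf_available

if is_tf_available():
    # These names are only exported when TensorFlow is installed.
    from transformers import TFSwiftFormerForImageClassification, TFSwiftFormerModel

    print(TFSwiftFormerModel.__module__)  # transformers.models.swiftformer.modeling_tf_swiftformer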

src/transformers/models/auto/modeling_tf_auto.py

Lines changed: 2 additions & 0 deletions
@@ -81,6 +81,7 @@
         ("sam", "TFSamModel"),
         ("segformer", "TFSegformerModel"),
         ("speech_to_text", "TFSpeech2TextModel"),
+        ("swiftformer", "TFSwiftFormerModel"),
         ("swin", "TFSwinModel"),
         ("t5", "TFT5Model"),
         ("tapas", "TFTapasModel"),
@@ -213,6 +214,7 @@
         ("regnet", "TFRegNetForImageClassification"),
         ("resnet", "TFResNetForImageClassification"),
         ("segformer", "TFSegformerForImageClassification"),
+        ("swiftformer", "TFSwiftFormerForImageClassification"),
         ("swin", "TFSwinForImageClassification"),
         ("vit", "TFViTForImageClassification"),
     ]
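Adding the model to these mappings also makes it reachable through the TF auto classes. A hedged example; the checkpoint id and the from_pt flag are assumptions about how the weights are hosted, not facts from this diff:

from transformers import TFAutoModelForImageClassification

# from_pt=True converts PyTorch weights on the fly if no TF weights are published.
model = TFAutoModelForImageClassification.from_pretrained("MBZUAI/swiftformer-xs", from_pt=True)
print(type(model).__name__)  # TFSwiftFormerForImageClassification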

src/transformers/models/swiftformer/__init__.py

Lines changed: 26 additions & 0 deletions
@@ -16,6 +16,7 @@
 from ...utils import (
     OptionalDependencyNotAvailable,
     _LazyModule,
+    is_tf_available,
     is_torch_available,
 )
 
@@ -41,6 +42,19 @@
         "SwiftFormerPreTrainedModel",
     ]
 
+try:
+    if not is_tf_available():
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    pass
+else:
+    _import_structure["modeling_tf_swiftformer"] = [
+        "TF_SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST",
+        "TFSwiftFormerForImageClassification",
+        "TFSwiftFormerModel",
+        "TFSwiftFormerPreTrainedModel",
+    ]
+
 if TYPE_CHECKING:
     from .configuration_swiftformer import (
         SWIFTFORMER_PRETRAINED_CONFIG_ARCHIVE_MAP,
@@ -60,6 +74,18 @@
             SwiftFormerModel,
             SwiftFormerPreTrainedModel,
         )
+    try:
+        if not is_tf_available():
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        pass
+    else:
+        from .modeling_tf_swiftformer import (
+            TF_SWIFTFORMER_PRETRAINED_MODEL_ARCHIVE_LIST,
+            TFSwiftFormerForImageClassification,
+            TFSwiftFormerModel,
+            TFSwiftFormerPreTrainedModel,
+        )
 
 else:
     import sys
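The guard added here follows the library's optional-dependency pattern: TF symbols are only advertised when the backend imports cleanly, and otherwise they are simply left out of the lazy module. A standalone sketch of the same idea, using a hypothetical is_tf_available helper rather than the transformers one:

import importlib.util

def is_tf_available() -> bool:
    # Hypothetical stand-in for transformers.utils.is_tf_available.
    return importlib.util.find_spec("tensorflow") is not None

_import_structure = {"modeling_swiftformer": ["SwiftFormerModel"]}

if is_tf_available():
    # Only expose the TF symbols when the backend can actually be imported.
    _import_structure["modeling_tf_swiftformer"] = ["TFSwiftFormerModel"]

print(sorted(_import_structure))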

src/transformers/models/swiftformer/configuration_swiftformer.py

Lines changed: 12 additions & 0 deletions
@@ -42,6 +42,8 @@ class SwiftFormerConfig(PretrainedConfig):
 
 
     Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image
         num_channels (`int`, *optional*, defaults to 3):
            The number of input channels
         depths (`List[int]`, *optional*, defaults to `[3, 3, 6, 4]`):
@@ -62,6 +64,10 @@ class SwiftFormerConfig(PretrainedConfig):
             Padding in downsampling layers.
         drop_path_rate (`float`, *optional*, defaults to 0.0):
             Rate at which to increase dropout probability in DropPath.
+        drop_mlp_rate (`float`, *optional*, defaults to 0.0):
+            Dropout rate for the MLP component of SwiftFormer.
+        drop_conv_encoder_rate (`float`, *optional*, defaults to 0.0):
+            Dropout rate for the ConvEncoder component of SwiftFormer.
         use_layer_scale (`bool`, *optional*, defaults to `True`):
             Whether to scale outputs from token mixers.
         layer_scale_init_value (`float`, *optional*, defaults to 1e-05):
@@ -89,6 +95,7 @@ class SwiftFormerConfig(PretrainedConfig):
 
     def __init__(
         self,
+        image_size=224,
         num_channels=3,
         depths=[3, 3, 6, 4],
         embed_dims=[48, 56, 112, 220],
@@ -99,12 +106,15 @@ def __init__(
         down_stride=2,
         down_pad=1,
         drop_path_rate=0.0,
+        drop_mlp_rate=0.0,
+        drop_conv_encoder_rate=0.0,
         use_layer_scale=True,
         layer_scale_init_value=1e-5,
         batch_norm_eps=1e-5,
         **kwargs,
     ):
         super().__init__(**kwargs)
+        self.image_size = image_size
         self.num_channels = num_channels
         self.depths = depths
         self.embed_dims = embed_dims
@@ -115,6 +125,8 @@ def __init__(
         self.down_stride = down_stride
         self.down_pad = down_pad
         self.drop_path_rate = drop_path_rate
+        self.drop_mlp_rate = drop_mlp_rate
+        self.drop_conv_encoder_rate = drop_conv_encoder_rate
         self.use_layer_scale = use_layer_scale
         self.layer_scale_init_value = layer_scale_init_value
         self.batch_norm_eps = batch_norm_eps
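The new configuration arguments can be exercised directly when building a model from scratch. A brief sketch with illustrative values (the dropout rates below are arbitrary, not recommended settings):

from transformers import SwiftFormerConfig, TFSwiftFormerModel

config = SwiftFormerConfig(
    image_size=224,              # new: input resolution
    drop_conv_encoder_rate=0.1,  # new: dropout inside the ConvEncoder blocks
    drop_mlp_rate=0.1,           # new: dropout inside the MLP blocks
)
model = TFSwiftFormerModel(config)  # randomly initialized; requires TensorFlow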

src/transformers/models/swiftformer/modeling_swiftformer.py

Lines changed: 9 additions & 18 deletions
@@ -103,13 +103,12 @@ def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
     return output
 
 
-# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Swiftformer
 class SwiftFormerDropPath(nn.Module):
     """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
 
-    def __init__(self, drop_prob: Optional[float] = None) -> None:
+    def __init__(self, config: SwiftFormerConfig) -> None:
         super().__init__()
-        self.drop_prob = drop_prob
+        self.drop_prob = config.drop_path_rate
 
     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
         return drop_path(hidden_states, self.drop_prob, self.training)
@@ -169,7 +168,7 @@ def __init__(self, config: SwiftFormerConfig, dim: int):
         self.point_wise_conv1 = nn.Conv2d(dim, hidden_dim, kernel_size=1)
         self.act = nn.GELU()
         self.point_wise_conv2 = nn.Conv2d(hidden_dim, dim, kernel_size=1)
-        self.drop_path = nn.Identity()
+        self.drop_path = nn.Dropout(p=config.drop_conv_encoder_rate)
         self.layer_scale = nn.Parameter(torch.ones(dim).unsqueeze(-1).unsqueeze(-1), requires_grad=True)
 
     def forward(self, x):
@@ -200,7 +199,7 @@ def __init__(self, config: SwiftFormerConfig, in_features: int):
         act_layer = ACT2CLS[config.hidden_act]
         self.act = act_layer()
         self.fc2 = nn.Conv2d(hidden_features, in_features, 1)
-        self.drop = nn.Dropout(p=0.0)
+        self.drop = nn.Dropout(p=config.drop_mlp_rate)
 
     def forward(self, x):
         x = self.norm1(x)
@@ -302,7 +301,7 @@ def __init__(self, config: SwiftFormerConfig, dim: int, drop_path: float = 0.0)
         self.local_representation = SwiftFormerLocalRepresentation(config, dim=dim)
         self.attn = SwiftFormerEfficientAdditiveAttention(config, dim=dim)
         self.linear = SwiftFormerMlp(config, in_features=dim)
-        self.drop_path = SwiftFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.drop_path = SwiftFormerDropPath(config) if drop_path > 0.0 else nn.Identity()
         self.use_layer_scale = use_layer_scale
         if use_layer_scale:
             self.layer_scale_1 = nn.Parameter(
@@ -315,21 +314,13 @@ def __init__(self, config: SwiftFormerConfig, dim: int, drop_path: float = 0.0)
     def forward(self, x):
         x = self.local_representation(x)
         batch_size, channels, height, width = x.shape
+        res = self.attn(x.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels))
+        res = res.reshape(batch_size, height, width, channels).permute(0, 3, 1, 2)
         if self.use_layer_scale:
-            x = x + self.drop_path(
-                self.layer_scale_1
-                * self.attn(x.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels))
-                .reshape(batch_size, height, width, channels)
-                .permute(0, 3, 1, 2)
-            )
+            x = x + self.drop_path(self.layer_scale_1 * res)
             x = x + self.drop_path(self.layer_scale_2 * self.linear(x))
-
         else:
-            x = x + self.drop_path(
-                self.attn(x.permute(0, 2, 3, 1).reshape(batch_size, height * width, channels))
-                .reshape(batch_size, height, width, channels)
-                .permute(0, 3, 1, 2)
-            )
+            x = x + self.drop_path(res)
             x = x + self.drop_path(self.linear(x))
         return x
 
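These PyTorch-side changes route the drop-path probability through the config and compute the attention residual once as res. For reference, stochastic depth as performed by the drop_path helper referenced above amounts to the following sketch (a paraphrase, not the exact library code):

import torch

def drop_path_sketch(x: torch.Tensor, drop_prob: float, training: bool) -> torch.Tensor:
    # Stochastic depth: randomly drop the whole residual branch per sample,
    # rescaling the survivors so the expected value is unchanged.
    if drop_prob == 0.0 or not training:
        return x
    keep_prob = 1.0 - drop_prob
    shape = (x.shape[0],) + (1,) * (x.ndim - 1)  # broadcast over all non-batch dims
    mask = torch.floor(keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device))
    return x / keep_prob * mask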
