
Commit a846302

Merge branch 'main' into add-seed-for-data-collator-for-language-modeling
2 parents: 5b378e5 + e9756cd

121 files changed: +238, -153 lines


.github/scripts/assign_reviewers.py

Lines changed: 16 additions & 5 deletions
@@ -17,10 +17,19 @@
 import github
 import json
 from github import Github
-from fnmatch import fnmatch
+import re
 from collections import Counter
 from pathlib import Path

+def pattern_to_regex(pattern):
+    start_anchor = pattern.startswith("/")
+    pattern = re.escape(pattern)
+    # Replace `*` with "any number of non-slash characters"
+    pattern = pattern.replace(r"\*", "[^/]*")
+    if start_anchor:
+        pattern = "^" + pattern
+    return pattern
+
 def get_file_owners(file_path, codeowners_lines):
     # Process lines in reverse (last matching pattern takes precedence)
     for line in reversed(codeowners_lines):
@@ -36,18 +45,20 @@ def get_file_owners(file_path, codeowners_lines):
         owners = [owner.removeprefix("@") for owner in parts[1:]]

         # Check if file matches pattern
-        if fnmatch(file_path, pattern):
+        file_regex = pattern_to_regex(pattern)
+        if re.search(file_regex, file_path) is not None:
             return owners  # Remember, can still be empty!
     return []  # Should never happen, but just in case

 def main():
+    script_dir = Path(__file__).parent.absolute()
+    with open(script_dir / "codeowners_for_review_action") as f:
+        codeowners_lines = f.readlines()
+
     g = Github(os.environ['GITHUB_TOKEN'])
     repo = g.get_repo("huggingface/transformers")
     with open(os.environ['GITHUB_EVENT_PATH']) as f:
         event = json.load(f)
-    script_dir = Path(__file__).parent.absolute()
-    with open(script_dir / "codeowners_for_review_action") as f:
-        codeowners_lines = f.readlines()

     # The PR number is available in the event payload
     pr_number = event['pull_request']['number']
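
For reference, here is a minimal, self-contained sketch of how the new regex-based matching resolves owners. The `resolve_owners` helper and the two sample CODEOWNERS lines are illustrative simplifications of `get_file_owners` above, not part of the script itself:

```python
import re

def pattern_to_regex(pattern):
    # Same logic as the helper added in this commit: `*` matches any run of
    # non-slash characters, and a leading `/` anchors the pattern to the
    # start of the path.
    start_anchor = pattern.startswith("/")
    pattern = re.escape(pattern)
    pattern = pattern.replace(r"\*", "[^/]*")
    if start_anchor:
        pattern = "^" + pattern
    return pattern

# Hypothetical CODEOWNERS lines; processing them in reverse means the last
# matching pattern takes precedence, mirroring get_file_owners.
codeowners_lines = [
    "tests/ @ydshieh",
    "tests/generation/ @gante",
]

def resolve_owners(file_path):
    for line in reversed(codeowners_lines):
        pattern, *owners = line.split()
        if re.search(pattern_to_regex(pattern), file_path) is not None:
            return [owner.removeprefix("@") for owner in owners]
    return []

print(resolve_owners("tests/generation/test_utils.py"))          # ['gante']
print(resolve_owners("tests/models/bert/test_modeling_bert.py"))  # ['ydshieh']
```
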

.github/scripts/codeowners_for_review_action

Lines changed: 3 additions & 3 deletions
@@ -11,14 +11,14 @@ docs/ @stevhliu
 /src/transformers/models/*/image_processing* @qubvel
 /src/transformers/models/*/image_processing_*_fast* @yonigozlan

-
 # Owners of subsections of the library
 /src/transformers/generation/ @gante
 /src/transformers/pipeline/ @Rocketknight1 @yonigozlan
 /src/transformers/integrations/ @SunMarc @MekkCyber @muellerzr
 /src/transformers/quantizers/ @SunMarc @MekkCyber
-/src/transformers/tests/ @ydshieh
-/src/transformers/tests/generation/ @gante
+tests/ @ydshieh
+tests/generation/ @gante
+
 /src/transformers/models/auto/ @ArthurZucker
 /src/transformers/utils/ @ArthurZucker @Rocketknight1
 /src/transformers/loss/ @ArthurZucker

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
@@ -74,6 +74,8 @@
     title: Optimizing inference
   - local: kv_cache
     title: KV cache strategies
+  - local: serving
+    title: Serving
   - local: cache_explanation
     title: Caching
   - local: llm_tutorial_optimization

docs/source/en/serving.md

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Serving

Transformer models can be served for inference with specialized libraries such as Text Generation Inference (TGI) and vLLM. These libraries are specifically designed to optimize performance with LLMs and include many unique optimization features that may not be included in Transformers.

## TGI

[TGI](https://huggingface.co/docs/text-generation-inference/index) can serve models that aren't [natively implemented](https://huggingface.co/docs/text-generation-inference/supported_models) by falling back on the Transformers implementation of the model. Some of TGI's high-performance features aren't available in the Transformers implementation, but other features like continuous batching and streaming are still supported.

> [!TIP]
> Refer to the [Non-core model serving](https://huggingface.co/docs/text-generation-inference/basic_tutorials/non_core_models) guide for more details.

Serve a Transformers implementation the same way you'd serve a TGI model.

```docker
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
```

Add `--trust-remote-code` to the command to serve a custom Transformers model.

```docker
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
```

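Once the container is up, you can sanity-check it with a request to TGI's `generate` endpoint. This is a minimal example; it assumes the `-p 8080:80` port mapping from the commands above.

```shell
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}}'
```
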
## vLLM

[vLLM](https://docs.vllm.ai/en/latest/index.html) can also serve a Transformers implementation of a model if it isn't [natively implemented](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-text-only-language-models) in vLLM.

Many features like quantization, LoRA adapters, and distributed inference and serving are supported for the Transformers implementation.

> [!TIP]
> Refer to the [Transformers fallback](https://docs.vllm.ai/en/latest/models/supported_models.html#transformers-fallback) section for more details.

By default, vLLM serves the native implementation, and if it doesn't exist, it falls back on the Transformers implementation. But you can also set `--model-impl transformers` to explicitly use the Transformers model implementation.

```shell
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
    --model-impl transformers
```

Add the `--trust-remote-code` flag to enable loading a remote code model.

```shell
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
    --model-impl transformers \
    --trust-remote-code
```
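
With the server running, the model can be queried through vLLM's OpenAI-compatible API. This is a minimal example; it assumes the default port 8000 and the model name from the command above.

```shell
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "prompt": "San Francisco is a",
        "max_tokens": 20
    }'
```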

src/transformers/cache_utils.py

Lines changed: 3 additions & 1 deletion
@@ -1602,7 +1602,9 @@ class HybridCache(Cache):
     ```
     """

-    is_compileable = True
+    # TODO (joao): dive deeper into gemma2 and paligemma -- there are reports of speed loss with compilation. Revert
+    # ALL changes from the PR that commented the line below when reactivating it.
+    # is_compileable = True

     # TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
     @deprecate_kwarg("layer_device_map", version="4.52.0")

src/transformers/models/aria/modeling_aria.py

Lines changed: 1 addition & 1 deletion
@@ -1094,7 +1094,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         dtype (`torch.dtype`):
             The dtype to use for the 4D attention mask.
         device (`torch.device`):
-            The device to plcae the 4D attention mask on.
+            The device to place the 4D attention mask on.
         cache_position (`torch.Tensor`):
             Indices depicting the position of the input sequence tokens in the sequence.
         batch_size (`torch.Tensor`):

src/transformers/models/bamba/modeling_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -1399,7 +1399,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         dtype (`torch.dtype`):
             The dtype to use for the 4D attention mask.
         device (`torch.device`):
-            The device to plcae the 4D attention mask on.
+            The device to place the 4D attention mask on.
         cache_position (`torch.Tensor`):
             Indices depicting the position of the input sequence tokens in the sequence.
         batch_size (`torch.Tensor`):

src/transformers/models/bamba/modular_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -1140,7 +1140,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         dtype (`torch.dtype`):
             The dtype to use for the 4D attention mask.
         device (`torch.device`):
-            The device to plcae the 4D attention mask on.
+            The device to place the 4D attention mask on.
         cache_position (`torch.Tensor`):
             Indices depicting the position of the input sequence tokens in the sequence.
         batch_size (`torch.Tensor`):

src/transformers/models/bark/modeling_bark.py

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
src/transformers/models/bart/modeling_bart.py

Lines changed: 1 addition & 1 deletion
@@ -298,7 +298,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()