
Commit a846302

Merge branch 'main' into add-seed-for-data-collator-for-language-modeling
2 parents: 5b378e5 + e9756cd

121 files changed: +238, -153 lines


.github/scripts/assign_reviewers.py

Lines changed: 16 additions & 5 deletions
@@ -17,10 +17,19 @@
 import github
 import json
 from github import Github
-from fnmatch import fnmatch
+import re
 from collections import Counter
 from pathlib import Path

+def pattern_to_regex(pattern):
+    start_anchor = pattern.startswith("/")
+    pattern = re.escape(pattern)
+    # Replace `*` with "any number of non-slash characters"
+    pattern = pattern.replace(r"\*", "[^/]*")
+    if start_anchor:
+        pattern = "^" + pattern
+    return pattern
+
 def get_file_owners(file_path, codeowners_lines):
     # Process lines in reverse (last matching pattern takes precedence)
     for line in reversed(codeowners_lines):
@@ -36,18 +45,20 @@ def get_file_owners(file_path, codeowners_lines):
         owners = [owner.removeprefix("@") for owner in parts[1:]]

         # Check if file matches pattern
-        if fnmatch(file_path, pattern):
+        file_regex = pattern_to_regex(pattern)
+        if re.search(file_regex, file_path) is not None:
             return owners  # Remember, can still be empty!
     return []  # Should never happen, but just in case

 def main():
+    script_dir = Path(__file__).parent.absolute()
+    with open(script_dir / "codeowners_for_review_action") as f:
+        codeowners_lines = f.readlines()
+
     g = Github(os.environ['GITHUB_TOKEN'])
     repo = g.get_repo("huggingface/transformers")
     with open(os.environ['GITHUB_EVENT_PATH']) as f:
         event = json.load(f)
-    script_dir = Path(__file__).parent.absolute()
-    with open(script_dir / "codeowners_for_review_action") as f:
-        codeowners_lines = f.readlines()

     # The PR number is available in the event payload
     pr_number = event['pull_request']['number']
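
For reference, here is a minimal, self-contained sketch of how the new regex-based matching resolves owners. The `resolve_owners` helper and the two sample CODEOWNERS lines are illustrative simplifications of `get_file_owners` above, not part of the script itself:

```python
import re

def pattern_to_regex(pattern):
    # Same logic as the helper added in this commit: `*` matches any run of
    # non-slash characters, and a leading `/` anchors the pattern to the
    # start of the path.
    start_anchor = pattern.startswith("/")
    pattern = re.escape(pattern)
    pattern = pattern.replace(r"\*", "[^/]*")
    if start_anchor:
        pattern = "^" + pattern
    return pattern

# Hypothetical CODEOWNERS lines; processing them in reverse means the last
# matching pattern takes precedence, mirroring get_file_owners.
codeowners_lines = [
    "tests/ @ydshieh",
    "tests/generation/ @gante",
]

def resolve_owners(file_path):
    for line in reversed(codeowners_lines):
        pattern, *owners = line.split()
        if re.search(pattern_to_regex(pattern), file_path) is not None:
            return [owner.removeprefix("@") for owner in owners]
    return []

print(resolve_owners("tests/generation/test_utils.py"))          # ['gante']
print(resolve_owners("tests/models/bert/test_modeling_bert.py"))  # ['ydshieh']
```
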

.github/scripts/codeowners_for_review_action

Lines changed: 3 additions & 3 deletions
@@ -11,14 +11,14 @@ docs/ @stevhliu
 /src/transformers/models/*/image_processing* @qubvel
 /src/transformers/models/*/image_processing_*_fast* @yonigozlan

-
 # Owners of subsections of the library
 /src/transformers/generation/ @gante
 /src/transformers/pipeline/ @Rocketknight1 @yonigozlan
 /src/transformers/integrations/ @SunMarc @MekkCyber @muellerzr
 /src/transformers/quantizers/ @SunMarc @MekkCyber
-/src/transformers/tests/ @ydshieh
-/src/transformers/tests/generation/ @gante
+tests/ @ydshieh
+tests/generation/ @gante
+
 /src/transformers/models/auto/ @ArthurZucker
 /src/transformers/utils/ @ArthurZucker @Rocketknight1
 /src/transformers/loss/ @ArthurZucker

docs/source/en/_toctree.yml

Lines changed: 2 additions & 0 deletions
@@ -74,6 +74,8 @@
     title: Optimizing inference
   - local: kv_cache
     title: KV cache strategies
+  - local: serving
+    title: Serving
   - local: cache_explanation
     title: Caching
   - local: llm_tutorial_optimization

docs/source/en/serving.md

Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
<!--Copyright 2025 The HuggingFace Team. All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the
specific language governing permissions and limitations under the License.

⚠️ Note that this file is in Markdown but contains specific syntax for our doc-builder (similar to MDX) that may not be
rendered properly in your Markdown viewer.

-->

# Serving

Transformer models can be served for inference with specialized libraries such as Text Generation Inference (TGI) and vLLM. These libraries are specifically designed to optimize performance with LLMs and include many unique optimization features that may not be included in Transformers.

## TGI

[TGI](https://huggingface.co/docs/text-generation-inference/index) can serve models that aren't [natively implemented](https://huggingface.co/docs/text-generation-inference/supported_models) by falling back on the Transformers implementation of the model. Some of TGI's high-performance features aren't available in the Transformers implementation, but other features like continuous batching and streaming are still supported.

> [!TIP]
> Refer to the [Non-core model serving](https://huggingface.co/docs/text-generation-inference/basic_tutorials/non_core_models) guide for more details.

Serve a Transformers implementation the same way you'd serve a TGI model.

```docker
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id gpt2
```

Add `--trust-remote-code` to the command to serve a custom Transformers model.

```docker
docker run --gpus all --shm-size 1g -p 8080:80 -v $volume:/data ghcr.io/huggingface/text-generation-inference:latest --model-id <CUSTOM_MODEL_ID> --trust-remote-code
```

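Once the container is up, you can sanity-check it with a request to TGI's `generate` endpoint. This is a minimal example; it assumes the `-p 8080:80` port mapping from the commands above.

```shell
curl 127.0.0.1:8080/generate \
    -X POST \
    -H 'Content-Type: application/json' \
    -d '{"inputs": "What is Deep Learning?", "parameters": {"max_new_tokens": 20}}'
```
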
## vLLM

[vLLM](https://docs.vllm.ai/en/latest/index.html) can also serve a Transformers implementation of a model if it isn't [natively implemented](https://docs.vllm.ai/en/latest/models/supported_models.html#list-of-text-only-language-models) in vLLM.

Many features like quantization, LoRA adapters, and distributed inference and serving are supported for the Transformers implementation.

> [!TIP]
> Refer to the [Transformers fallback](https://docs.vllm.ai/en/latest/models/supported_models.html#transformers-fallback) section for more details.

By default, vLLM serves the native implementation, and if it doesn't exist, it falls back on the Transformers implementation. But you can also set `--model-impl transformers` to explicitly use the Transformers model implementation.

```shell
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
    --model-impl transformers
```

Add the `--trust-remote-code` flag to enable loading a remote code model.

```shell
vllm serve Qwen/Qwen2.5-1.5B-Instruct \
    --task generate \
    --model-impl transformers \
    --trust-remote-code
```
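
With the server running, the model can be queried through vLLM's OpenAI-compatible API. This is a minimal example; it assumes the default port 8000 and the model name from the command above.

```shell
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "Qwen/Qwen2.5-1.5B-Instruct",
        "prompt": "San Francisco is a",
        "max_tokens": 20
    }'
```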

src/transformers/cache_utils.py

Lines changed: 3 additions & 1 deletion
@@ -1602,7 +1602,9 @@ class HybridCache(Cache):
     ```
     """

-    is_compileable = True
+    # TODO (joao): dive deeper into gemma2 and paligemma -- there are reports of speed loss with compilation. Revert
+    # ALL changes from the PR that commented the line below when reactivating it.
+    # is_compileable = True

     # TODO (joao): remove `=None` in non-optional arguments in v4.46. Remove from `OBJECTS_TO_IGNORE` as well.
     @deprecate_kwarg("layer_device_map", version="4.52.0")

src/transformers/models/aria/modeling_aria.py

Lines changed: 1 addition & 1 deletion
@@ -1094,7 +1094,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         dtype (`torch.dtype`):
             The dtype to use for the 4D attention mask.
         device (`torch.device`):
-            The device to plcae the 4D attention mask on.
+            The device to place the 4D attention mask on.
         cache_position (`torch.Tensor`):
             Indices depicting the position of the input sequence tokens in the sequence.
         batch_size (`torch.Tensor`):

src/transformers/models/bamba/modeling_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -1399,7 +1399,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         dtype (`torch.dtype`):
             The dtype to use for the 4D attention mask.
         device (`torch.device`):
-            The device to plcae the 4D attention mask on.
+            The device to place the 4D attention mask on.
         cache_position (`torch.Tensor`):
             Indices depicting the position of the input sequence tokens in the sequence.
         batch_size (`torch.Tensor`):

src/transformers/models/bamba/modular_bamba.py

Lines changed: 1 addition & 1 deletion
@@ -1140,7 +1140,7 @@ def _prepare_4d_causal_attention_mask_with_cache_position(
         dtype (`torch.dtype`):
             The dtype to use for the 4D attention mask.
         device (`torch.device`):
-            The device to plcae the 4D attention mask on.
+            The device to place the 4D attention mask on.
         cache_position (`torch.Tensor`):
             Indices depicting the position of the input sequence tokens in the sequence.
         batch_size (`torch.Tensor`):

src/transformers/models/bark/modeling_bark.py

Lines changed: 1 addition & 1 deletion
@@ -201,7 +201,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()
src/transformers/models/bart/modeling_bart.py

Lines changed: 1 addition & 1 deletion
@@ -298,7 +298,7 @@ def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)

         # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
-        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignement, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
         # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
         self._flash_attn_uses_top_left_mask = not is_flash_attn_greater_or_equal_2_10()