
Commit b6523a5

nrghosh authored and elliot-barn committed
[data.LLM] fix doc test for Working with LLMs guide (#55917)
Signed-off-by: Nikhil Ghosh <[email protected]>
Signed-off-by: elliot-barn <[email protected]>
1 parent 428aafe commit b6523a5

6 files changed (+726 −285 lines)


doc/BUILD.bazel

Lines changed: 24 additions & 2 deletions
@@ -349,6 +349,30 @@ py_test_run_all_subdirectory(
     ],
 )

+# --------------------------------------------------------------------
+# Test all doc/source/data/doc_code/working-with-llms code included in rst/md files.
+# --------------------------------------------------------------------
+
+filegroup(
+    name = "data_llm_examples",
+    srcs = glob(["source/data/doc_code/working-with-llms/**/*.py"]),
+    visibility = ["//doc:__subpackages__"],
+)
+
+# GPU Tests
+py_test_run_all_subdirectory(
+    size = "large",
+    include = ["source/data/doc_code/working-with-llms/**/*.py"],
+    exclude = [],
+    extra_srcs = [],
+    tags = [
+        "exclusive",
+        "gpu",
+        "team:data",
+        "team:llm"
+    ],
+)
+
 # --------------------------------------------------------------------
 # Test all doc/source/tune/doc_code code included in rst/md files.
 # --------------------------------------------------------------------

@@ -545,8 +569,6 @@ doctest_each(
         # These tests run on GPU (see below).
         "source/data/batch_inference.rst",
         "source/data/transforming-data.rst",
-        # These tests are currently failing.
-        "source/data/working-with-llms.rst",
         # These don't contain code snippets.
         "source/data/api/**/*.rst",
     ],
Lines changed: 200 additions & 0 deletions
@@ -0,0 +1,200 @@
"""
This file serves as a documentation example and CI test for basic LLM batch inference.

"""

# Dependency setup
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "ray[llm]"])
subprocess.check_call(
    [sys.executable, "-m", "pip", "install", "--upgrade", "transformers"]
)
subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4"])


# __basic_llm_example_start__
import ray
from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor

# __basic_config_example_start__
# Basic vLLM configuration
config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 4096,  # Reduce if CUDA OOM occurs
        "max_model_len": 16384,
    },
    concurrency=1,
    batch_size=64,
)
# __basic_config_example_end__

processor = build_llm_processor(
    config,
    preprocess=lambda row: dict(
        messages=[
            {"role": "system", "content": "You are a bot that responds with haikus."},
            {"role": "user", "content": row["item"]},
        ],
        sampling_params=dict(
            temperature=0.3,
            max_tokens=250,
        ),
    ),
    postprocess=lambda row: dict(
        answer=row["generated_text"],
        **row,  # This will return all the original columns in the dataset.
    ),
)

ds = ray.data.from_items(["Start of the haiku is: Complete this for me..."])

if __name__ == "__main__":
    try:
        import torch

        if torch.cuda.is_available():
            ds = processor(ds)
            ds.show(limit=1)
        else:
            print("Skipping basic LLM run (no GPU available)")
    except Exception as e:
        print(f"Skipping basic LLM run due to environment error: {e}")

# __hf_token_config_example_start__
# Configuration with Hugging Face token
config_with_token = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    runtime_env={"env_vars": {"HF_TOKEN": "your_huggingface_token"}},
    concurrency=1,
    batch_size=64,
)
# __hf_token_config_example_end__

# __parallel_config_example_start__
# Model parallelism configuration for larger models
# tensor_parallel_size=2: Split model across 2 GPUs for tensor parallelism
# pipeline_parallel_size=2: Use 2 pipeline stages (total 4 GPUs needed)
# Total GPUs required = tensor_parallel_size * pipeline_parallel_size = 4
config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "max_model_len": 16384,
        "tensor_parallel_size": 2,
        "pipeline_parallel_size": 2,
        "enable_chunked_prefill": True,
        "max_num_batched_tokens": 2048,
    },
    concurrency=1,
    batch_size=32,
    accelerator_type="L4",
)
# __parallel_config_example_end__

# __runai_config_example_start__
# RunAI streamer configuration for optimized model loading
# Note: Install vLLM with runai dependencies: pip install -U "vllm[runai]>=0.10.1"
config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "load_format": "runai_streamer",
        "max_model_len": 16384,
    },
    concurrency=1,
    batch_size=64,
)
# __runai_config_example_end__

# __lora_config_example_start__
# Multi-LoRA configuration
config = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "enable_lora": True,
        "max_lora_rank": 32,
        "max_loras": 1,
        "max_model_len": 16384,
    },
    concurrency=1,
    batch_size=32,
)
# __lora_config_example_end__

# __s3_config_example_start__
# S3 hosted model configuration
s3_config = vLLMEngineProcessorConfig(
    model_source="s3://your-bucket/your-model-path/",
    engine_kwargs={
        "load_format": "runai_streamer",
        "max_model_len": 16384,
    },
    concurrency=1,
    batch_size=64,
)
# __s3_config_example_end__

# __gpu_memory_config_example_start__
# GPU memory management configuration
# If you encounter CUDA out of memory errors, try these optimizations:
config_memory_optimized = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "max_model_len": 8192,
        "max_num_batched_tokens": 2048,
        "enable_chunked_prefill": True,
        "gpu_memory_utilization": 0.85,
        "block_size": 16,
    },
    concurrency=1,
    batch_size=16,
)

# For very large models or limited GPU memory:
config_minimal_memory = vLLMEngineProcessorConfig(
    model_source="unsloth/Llama-3.1-8B-Instruct",
    engine_kwargs={
        "max_model_len": 4096,
        "max_num_batched_tokens": 1024,
        "enable_chunked_prefill": True,
        "gpu_memory_utilization": 0.75,
    },
    concurrency=1,
    batch_size=8,
)
# __gpu_memory_config_example_end__

# __embedding_config_example_start__
# Embedding model configuration
embedding_config = vLLMEngineProcessorConfig(
    model_source="sentence-transformers/all-MiniLM-L6-v2",
    task_type="embed",
    engine_kwargs=dict(
        enable_prefix_caching=False,
        enable_chunked_prefill=False,
        max_model_len=256,
        enforce_eager=True,
    ),
    batch_size=32,
    concurrency=1,
    apply_chat_template=False,
    detokenize=False,
)

# Example usage for embeddings
def create_embedding_processor():
    return build_llm_processor(
        embedding_config,
        preprocess=lambda row: dict(prompt=row["text"]),
        postprocess=lambda row: {
            "text": row["prompt"],
            "embedding": row["embeddings"],
        },
    )


# __embedding_config_example_end__

# __basic_llm_example_end__
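For readers skimming the diff, the following CPU-only sketch (not part of the committed file) illustrates the row transformation that the preprocess and postprocess callables above perform. The "generated_text" value is a made-up placeholder standing in for the vLLM engine output; only the column names ("item", "generated_text", "answer") mirror the real pipeline.

# CPU-only sketch, assuming the column names used above; the engine output is faked.
sample_row = {"item": "Start of the haiku is: Complete this for me..."}

# What the preprocess lambda hands to the engine for this row.
preprocessed = dict(
    messages=[
        {"role": "system", "content": "You are a bot that responds with haikus."},
        {"role": "user", "content": sample_row["item"]},
    ],
    sampling_params=dict(temperature=0.3, max_tokens=250),
)

# Stand-in for the engine's output row (placeholder text, not a real generation).
engine_row = {**sample_row, "generated_text": "<haiku would appear here>"}

# What the postprocess lambda returns: the answer plus all original columns.
postprocessed = dict(answer=engine_row["generated_text"], **engine_row)
print(sorted(postprocessed))  # ['answer', 'generated_text', 'item']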
Lines changed: 63 additions & 0 deletions
@@ -0,0 +1,63 @@
"""
Documentation example and test for embedding model batch inference.

"""

import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "ray[llm]"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4"])


def run_embedding_example():
    # __embedding_example_start__
    import ray
    from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor

    embedding_config = vLLMEngineProcessorConfig(
        model_source="sentence-transformers/all-MiniLM-L6-v2",
        task_type="embed",
        engine_kwargs=dict(
            enable_prefix_caching=False,
            enable_chunked_prefill=False,
            max_model_len=256,
            enforce_eager=True,
        ),
        batch_size=32,
        concurrency=1,
        apply_chat_template=False,
        detokenize=False,
    )

    embedding_processor = build_llm_processor(
        embedding_config,
        preprocess=lambda row: dict(prompt=row["text"]),
        postprocess=lambda row: {
            "text": row["prompt"],
            "embedding": row["embeddings"],
        },
    )

    texts = [
        "Hello world",
        "This is a test sentence",
        "Embedding models convert text to vectors",
    ]
    ds = ray.data.from_items([{"text": text} for text in texts])

    embedded_ds = embedding_processor(ds)
    embedded_ds.show(limit=1)
    # __embedding_example_end__


if __name__ == "__main__":
    try:
        import torch

        if torch.cuda.is_available():
            run_embedding_example()
        else:
            print("Skipping embedding example (no GPU available)")
    except Exception as e:
        print(f"Skipping embedding example: {e}")
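As a hedged illustration of how the "embedding" column produced above might be consumed downstream, here is a standalone sketch with dummy vectors; in a real run the rows would come from something like embedded_ds.take_all() inside run_embedding_example(), which the committed file does not do.

# Standalone sketch with made-up vectors; real rows would carry model-produced embeddings.
import numpy as np

rows = [
    {"text": "Hello world", "embedding": [0.1, 0.3, 0.5]},
    {"text": "This is a test sentence", "embedding": [0.2, 0.1, 0.4]},
]
a = np.asarray(rows[0]["embedding"])
b = np.asarray(rows[1]["embedding"])
cosine = float(a @ b / (np.linalg.norm(a) * np.linalg.norm(b)))
print(f"cosine similarity between the first two texts: {cosine:.3f}")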
