Merged
29 commits
53de11a
[LLM] re-enable doc test for Working with LLMs guide #55796
nrghosh Aug 25, 2025
3920f45
remove llm example from exclusion list
nrghosh Aug 25, 2025
70db35f
Refactor LLM examples to use external files
nrghosh Aug 26, 2025
be7cd15
fix lint
nrghosh Aug 26, 2025
bf73dcb
more lint
nrghosh Aug 27, 2025
cb7fc7b
more lint
nrghosh Aug 27, 2025
090e764
wip
nrghosh Aug 28, 2025
f71704a
doc lint wip
nrghosh Aug 28, 2025
897ecc4
Replace explicit line refs with semantic tags
nrghosh Sep 2, 2025
2f8d2d1
fix code snippet separation and doc comments
nrghosh Sep 2, 2025
4e685ad
improve tag / code blocks and explanation
nrghosh Sep 3, 2025
240738c
Merge remote-tracking branch 'origin/master' into nrghosh/llms-doctest
nrghosh Sep 12, 2025
15d157a
wip
nrghosh Sep 13, 2025
6c8761d
wip - vlm working
nrghosh Sep 19, 2025
ae20fb1
wip - basic llm working
nrghosh Sep 19, 2025
ac120b9
wip - basic llm working
nrghosh Sep 19, 2025
cd1f5bc
wip - formatting - all 3 examples working
nrghosh Sep 19, 2025
8c01472
Merge branch 'master' into nrghosh/llms-doctest
nrghosh Sep 19, 2025
dbbf6bf
wip lint
nrghosh Sep 19, 2025
525177e
wip
nrghosh Sep 20, 2025
6d7b188
wip lint fix ci
nrghosh Sep 20, 2025
d499f5c
wip
nrghosh Sep 20, 2025
e39006a
wip lint
nrghosh Sep 20, 2025
9721cf9
wip lint
nrghosh Sep 20, 2025
b4c1198
wip lint ci
nrghosh Sep 20, 2025
5d11faf
wip imports lint
nrghosh Sep 21, 2025
a87af3e
wip - gpu
nrghosh Sep 22, 2025
36dd406
gpu in ci
nrghosh Sep 23, 2025
057127f
ci + refactor embedding example out
nrghosh Sep 23, 2025
26 changes: 24 additions & 2 deletions doc/BUILD.bazel
@@ -333,6 +333,30 @@ py_test_run_all_subdirectory(
],
)

# --------------------------------------------------------------------
# Test all doc/source/data/doc_code/working-with-llms code included in rst/md files.
# --------------------------------------------------------------------

filegroup(
name = "data_llm_examples",
srcs = glob(["source/data/doc_code/working-with-llms/**/*.py"]),
visibility = ["//doc:__subpackages__"],
)

# GPU Tests
py_test_run_all_subdirectory(
size = "large",
include = ["source/data/doc_code/working-with-llms/**/*.py"],
exclude = [],
extra_srcs = [],
tags = [
"exclusive",
"gpu",
"team:data",
"team:llm"
],
)

# --------------------------------------------------------------------
# Test all doc/source/tune/doc_code code included in rst/md files.
# --------------------------------------------------------------------
@@ -527,8 +551,6 @@ doctest_each(
# These tests run on GPU (see below).
"source/data/batch_inference.rst",
"source/data/transforming-data.rst",
# These tests are currently failing.
"source/data/working-with-llms.rst",
# These don't contain code snippets.
"source/data/api/**/*.rst",
],
200 changes: 200 additions & 0 deletions doc/source/data/doc_code/working-with-llms/basic_llm_example.py
@@ -0,0 +1,200 @@
"""
This file serves as a documentation example and CI test for basic LLM batch inference.

"""

# Dependency setup
import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "ray[llm]"])
subprocess.check_call(
[sys.executable, "-m", "pip", "install", "--upgrade", "transformers"]
)
subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4"])


# __basic_llm_example_start__
import ray
from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor

# __basic_config_example_start__
# Basic vLLM configuration
config = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
engine_kwargs={
"enable_chunked_prefill": True,
"max_num_batched_tokens": 4096, # Reduce if CUDA OOM occurs
"max_model_len": 16384,
},
concurrency=1,
batch_size=64,
)
# __basic_config_example_end__

processor = build_llm_processor(
config,
preprocess=lambda row: dict(
messages=[
{"role": "system", "content": "You are a bot that responds with haikus."},
{"role": "user", "content": row["item"]},
],
sampling_params=dict(
temperature=0.3,
max_tokens=250,
),
),
postprocess=lambda row: dict(
answer=row["generated_text"],
**row, # This will return all the original columns in the dataset.
),
)

ds = ray.data.from_items(["Start of the haiku is: Complete this for me..."])

if __name__ == "__main__":
try:
import torch

if torch.cuda.is_available():
ds = processor(ds)
ds.show(limit=1)
else:
print("Skipping basic LLM run (no GPU available)")
except Exception as e:
print(f"Skipping basic LLM run due to environment error: {e}")

# __hf_token_config_example_start__
# Configuration with Hugging Face token
config_with_token = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
runtime_env={"env_vars": {"HF_TOKEN": "your_huggingface_token"}},
concurrency=1,
batch_size=64,
)
# __hf_token_config_example_end__
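
# A minimal usage sketch (not part of the tagged doc snippets): assuming a real
# token replaces the placeholder above, the token-enabled config plugs into
# build_llm_processor exactly like the basic config. Wrapping the call in a
# function (name is illustrative) keeps it from executing at import time.
def build_processor_with_token():
    return build_llm_processor(
        config_with_token,
        preprocess=lambda row: dict(
            messages=[{"role": "user", "content": row["item"]}],
            sampling_params=dict(temperature=0.3, max_tokens=250),
        ),
        postprocess=lambda row: dict(answer=row["generated_text"]),
    )
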

# __parallel_config_example_start__
# Model parallelism configuration for larger models
# tensor_parallel_size=2: Split model across 2 GPUs for tensor parallelism
# pipeline_parallel_size=2: Use 2 pipeline stages (total 4 GPUs needed)
# Total GPUs required = tensor_parallel_size * pipeline_parallel_size = 4
config = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
engine_kwargs={
"max_model_len": 16384,
"tensor_parallel_size": 2,
"pipeline_parallel_size": 2,
"enable_chunked_prefill": True,
"max_num_batched_tokens": 2048,
},
concurrency=1,
batch_size=32,
accelerator_type="L4",
)
# __parallel_config_example_end__
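
# A tiny arithmetic sketch of the comments above (helper name is illustrative,
# not a Ray API): each replica spans tensor_parallel_size * pipeline_parallel_size
# GPUs, and running `concurrency` replicas multiplies that requirement.
def gpus_needed(tensor_parallel_size, pipeline_parallel_size, replicas=1):
    return replicas * tensor_parallel_size * pipeline_parallel_size


# For the configuration above: gpus_needed(2, 2, replicas=1) == 4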

# __runai_config_example_start__
# RunAI streamer configuration for optimized model loading
# Note: Install vLLM with runai dependencies: pip install -U "vllm[runai]>=0.10.1"
config = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
engine_kwargs={
"load_format": "runai_streamer",
"max_model_len": 16384,
},
concurrency=1,
batch_size=64,
)
# __runai_config_example_end__

# __lora_config_example_start__
# Multi-LoRA configuration
config = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
engine_kwargs={
"enable_lora": True,
"max_lora_rank": 32,
"max_loras": 1,
"max_model_len": 16384,
},
concurrency=1,
batch_size=32,
)
# __lora_config_example_end__

# __s3_config_example_start__
# S3 hosted model configuration
s3_config = vLLMEngineProcessorConfig(
model_source="s3://your-bucket/your-model-path/",
engine_kwargs={
"load_format": "runai_streamer",
"max_model_len": 16384,
},
concurrency=1,
batch_size=64,
)
# __s3_config_example_end__

# __gpu_memory_config_example_start__
# GPU memory management configuration
# If you encounter CUDA out of memory errors, try these optimizations:
config_memory_optimized = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
engine_kwargs={
"max_model_len": 8192,
"max_num_batched_tokens": 2048,
"enable_chunked_prefill": True,
"gpu_memory_utilization": 0.85,
"block_size": 16,
},
concurrency=1,
batch_size=16,
)

# For very large models or limited GPU memory:
config_minimal_memory = vLLMEngineProcessorConfig(
model_source="unsloth/Llama-3.1-8B-Instruct",
engine_kwargs={
"max_model_len": 4096,
"max_num_batched_tokens": 1024,
"enable_chunked_prefill": True,
"gpu_memory_utilization": 0.75,
},
concurrency=1,
batch_size=8,
)
# __gpu_memory_config_example_end__
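
# A small selection sketch (assumption: torch reports device memory correctly);
# it falls back to the more conservative config on smaller GPUs. The 24 GiB
# threshold is illustrative, not a vLLM or Ray recommendation.
def pick_memory_config():
    import torch

    if not torch.cuda.is_available():
        return config_minimal_memory
    total_gib = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    return config_memory_optimized if total_gib >= 24 else config_minimal_memory
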

# __embedding_config_example_start__
# Embedding model configuration
embedding_config = vLLMEngineProcessorConfig(
model_source="sentence-transformers/all-MiniLM-L6-v2",
task_type="embed",
engine_kwargs=dict(
enable_prefix_caching=False,
enable_chunked_prefill=False,
max_model_len=256,
enforce_eager=True,
),
batch_size=32,
concurrency=1,
apply_chat_template=False,
detokenize=False,
)

# Example usage for embeddings
def create_embedding_processor():
return build_llm_processor(
embedding_config,
preprocess=lambda row: dict(prompt=row["text"]),
postprocess=lambda row: {
"text": row["prompt"],
"embedding": row["embeddings"],
},
)


# __embedding_config_example_end__

# __basic_llm_example_end__
63 changes: 63 additions & 0 deletions doc/source/data/doc_code/working-with-llms/embedding_example.py
@@ -0,0 +1,63 @@
"""
Documentation example and test for embedding model batch inference.

"""

import subprocess
import sys

subprocess.check_call([sys.executable, "-m", "pip", "install", "--upgrade", "ray[llm]"])
subprocess.check_call([sys.executable, "-m", "pip", "install", "numpy==1.26.4"])


def run_embedding_example():
# __embedding_example_start__
import ray
from ray.data.llm import vLLMEngineProcessorConfig, build_llm_processor

embedding_config = vLLMEngineProcessorConfig(
model_source="sentence-transformers/all-MiniLM-L6-v2",
task_type="embed",
engine_kwargs=dict(
enable_prefix_caching=False,
enable_chunked_prefill=False,
max_model_len=256,
enforce_eager=True,
),
batch_size=32,
concurrency=1,
apply_chat_template=False,
detokenize=False,
)

embedding_processor = build_llm_processor(
embedding_config,
preprocess=lambda row: dict(prompt=row["text"]),
postprocess=lambda row: {
"text": row["prompt"],
"embedding": row["embeddings"],
},
)

texts = [
"Hello world",
"This is a test sentence",
"Embedding models convert text to vectors",
]
ds = ray.data.from_items([{"text": text} for text in texts])

embedded_ds = embedding_processor(ds)
embedded_ds.show(limit=1)
# __embedding_example_end__


if __name__ == "__main__":
try:
import torch

if torch.cuda.is_available():
run_embedding_example()
else:
print("Skipping embedding example (no GPU available)")
except Exception as e:
print(f"Skipping embedding example: {e}")