Commit a5b84f1

[Core] Shared memory based object store for Multimodal data caching and IPC (#20452)
Signed-off-by: donglu <[email protected]>
1 parent 9f04d9d commit a5b84f1

File tree

17 files changed: +1487 additions, −27 deletions

.buildkite/test-pipeline.yaml

Lines changed: 2 additions & 0 deletions

@@ -789,6 +789,8 @@ steps:
   commands:
   - pytest -v -s distributed/test_comm_ops.py
   - pytest -v -s distributed/test_shm_broadcast.py
+  - pytest -v -s distributed/test_shm_buffer.py
+  - pytest -v -s distributed/test_shm_storage.py

 - label: 2 Node Tests (4 GPUs in total) # 16min
   timeout_in_minutes: 30

docs/configuration/optimization.md

Lines changed: 26 additions & 5 deletions

@@ -230,6 +230,20 @@ Multi-modal IPC caching is automatically enabled when
 there is a one-to-one correspondence between API (`P0`) and engine core (`P1`) processes,
 to avoid repeatedly transferring the same multi-modal inputs between them.

+#### Key-Replicated Cache
+
+By default, IPC caching uses a **key-replicated cache**, where cache keys exist
+in both the API (`P0`) and engine core (`P1`) processes, but the actual cache
+data resides only in `P1`.
+
+#### Shared Memory Cache
+
+When multiple worker processes are involved (e.g., when TP > 1), a
+**shared-memory cache** is more efficient. This can be enabled by setting
+`mm_processor_cache_type="shm"`. In this mode, cache keys are stored
+on `P0`, while the cache data itself lives in shared memory accessible by all
+processes.
+
 ### Configuration

 You can adjust the size of the cache by setting the value of `mm_processor_cache_gb` (default 4 GiB).

@@ -244,6 +258,12 @@ Examples:
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
           mm_processor_cache_gb=8)

+# Use a shared-memory based IPC cache
+llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
+          tensor_parallel_size=2,
+          mm_processor_cache_type="shm",
+          mm_processor_cache_gb=8)
+
 # Disable the cache
 llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",
           mm_processor_cache_gb=0)

@@ -253,11 +273,12 @@ llm = LLM(model="Qwen/Qwen2.5-VL-3B-Instruct",

 Based on the configuration, the content of the multi-modal caches on `P0` and `P1` are as follows:

-| Processor Caching | IPC Caching | `P0` Cache | `P1` Cache | Max. Memory |
-|-------------------|-------------|------------|------------|-------------|
-| ✅ | ✅ | K | K + V | `mm_processor_cache_gb * data_parallel_size` |
-| ✅ | ❌ | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
-| ❌ | ❌ | N/A | N/A | `0` |
+| mm_processor_cache_type | Cache Type | `P0` Cache | `P1` Engine Cache | `P1` Worker Cache | Max. Memory |
+|-------------------------|------------|------------|-------------------|-------------------|-------------|
+| lru | Processor Caching | K + V | N/A | N/A | `mm_processor_cache_gb * data_parallel_size` |
+| lru | Key-Replicated Caching | K | K + V | N/A | `mm_processor_cache_gb * api_server_count` |
+| shm | Shared Memory Caching | K | N/A | V | `mm_processor_cache_gb * api_server_count` |
+| N/A | Disabled | N/A | N/A | N/A | `0` |

 K: Stores the hashes of multi-modal items
 V: Stores the processed tensor data of multi-modal items
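
For a concrete reading of the Max. Memory column, here is a minimal sketch of the worst-case accounting it implies; the helper and its argument names are hypothetical illustrations, not part of vLLM:

# Hypothetical helper mirroring the "Max. Memory" column of the table above.
def max_mm_cache_gb(cache_kind: str,
                    mm_processor_cache_gb: float = 4.0,
                    data_parallel_size: int = 1,
                    api_server_count: int = 1) -> float:
    if cache_kind == "processor":                # P0 holds both K and V
        return mm_processor_cache_gb * data_parallel_size
    if cache_kind in ("key-replicated", "shm"):  # keys on P0, values on P1/workers
        return mm_processor_cache_gb * api_server_count
    return 0.0                                   # caching disabled

# Example: shm cache with mm_processor_cache_gb=8 and two API servers -> 16 GiB.
assert max_mm_cache_gb("shm", 8.0, api_server_count=2) == 16.0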
Lines changed: 172 additions & 0 deletions

@@ -0,0 +1,172 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project

import traceback
import unittest

from vllm.distributed.device_communicators.shm_object_storage import (
    SingleWriterShmRingBuffer)


class TestSingleWriterShmRingBuffer(unittest.TestCase):
    """Test suite for the ring buffer implementation"""

    def setUp(self):
        """Set up test fixtures"""
        self.buffer_size = 4096
        self.ring_buffer = None

    def tearDown(self):
        """Clean up after tests"""
        if self.ring_buffer:
            del self.ring_buffer

    def test_buffer_opening(self):
        """Test opening an existing buffer"""
        # First create a buffer
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=self.buffer_size, create=True)

        # Then open it with another instance
        reader_buffer = SingleWriterShmRingBuffer(*self.ring_buffer.handle())
        self.assertFalse(reader_buffer.is_writer)
        self.assertEqual(reader_buffer.shared_memory.name,
                         self.ring_buffer.shared_memory.name)

    def test_buffer_access(self):
        """Test accessing allocated buffers"""
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=self.buffer_size, create=True)

        size = 100
        address, monotonic_id = self.ring_buffer.allocate_buf(size)

        # Write some test data
        test_data = b"Hello, World!" * 7  # 91 bytes
        with self.ring_buffer.access_buf(address) as (data_buf, metadata):
            data_buf[0:len(test_data)] = test_data

        # Read it back
        with self.ring_buffer.access_buf(address) as (data_buf2, metadata2):
            read_data = bytes(data_buf2[0:len(test_data)])
            read_id = metadata2[0]

        self.assertEqual(read_data, test_data)
        self.assertEqual(read_id, monotonic_id)

    def test_memory_error_on_full_buffer(self):
        """Test that MemoryError is raised when buffer is full"""
        small_buffer_size = 200
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=small_buffer_size, create=True)

        # Fill up the buffer
        self.ring_buffer.allocate_buf(100)
        self.ring_buffer.allocate_buf(80)  # Total: 196 bytes used

        # This should fail
        with self.assertRaises(MemoryError):
            self.ring_buffer.allocate_buf(1)  # Would exceed buffer capacity

    def test_allocation_and_free(self):
        """Test allocation and freeing of buffers"""
        small_buffer_size = 200
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=small_buffer_size, create=True)

        size = 80
        # Write some data
        test_data = b"Repeated test data"
        for i in range(5):
            address, monotonic_id = self.ring_buffer.allocate_buf(size)
            with self.ring_buffer.access_buf(address) as (data_buf, metadata):
                data_buf[0:4] = (0).to_bytes(4, "little")  # 0 for not in-use
                data_buf[4:len(test_data) + 4] = test_data
            print(self.ring_buffer.metadata)
            freed_ids = self.ring_buffer.free_buf(lambda *args: True)
            print(f" Freed IDs: {freed_ids}")
            self.assertEqual(freed_ids[0], i)

    def test_clear_buffer(self):
        """Test clearing the buffer"""
        self.ring_buffer = SingleWriterShmRingBuffer(
            data_buffer_size=self.buffer_size, create=True)

        # Allocate some buffers
        for _ in range(3):
            self.ring_buffer.allocate_buf(100)

        # Clear the buffer
        self.ring_buffer.clear()

        # Check that metadata is empty and IDs reset
        self.assertEqual(len(self.ring_buffer.metadata), 0)
        self.assertEqual(self.ring_buffer.monotonic_id_start, 0)
        self.assertEqual(self.ring_buffer.monotonic_id_end, 0)
        self.assertEqual(self.ring_buffer.data_buffer_start, 0)
        self.assertEqual(self.ring_buffer.data_buffer_end, 0)


def main():
    """Main function demonstrating usage and running tests"""
    print("=== SingleWriterShmRingBuffer Test Suite ===\n")

    # Run unit tests
    print("Running unit tests...")
    unittest.main(argv=[""], exit=False, verbosity=2)

    print("\n" + "=" * 50)
    print("=== Manual Demo ===\n")

    # Manual demonstration
    try:
        print("Creating ring buffer...")
        writer_buffer = SingleWriterShmRingBuffer(data_buffer_size=2048,
                                                  create=True)
        reader_buffer = SingleWriterShmRingBuffer(*writer_buffer.handle())

        print(f"Buffer created with name: {writer_buffer.shared_memory.name}")

        # Allocate some buffers
        print("\nAllocating buffers...")
        address_array = []
        for i in range(3):
            size = 100 + i * 50
            try:
                writer_buffer.free_buf(lambda *args: True)
                address, monotonic_id = writer_buffer.allocate_buf(size)
                address_array.append((address, size, monotonic_id))

                # Write some test data
                with writer_buffer.access_buf(address) as (data_buf, metadata):
                    test_message = f"Test message {i}".encode()
                    data_buf[0:len(test_message)] = test_message

            except MemoryError as e:
                print(f" Failed to allocate {size} bytes: {e}")

        print("\nBuffer state:")
        print(f" Data buffer start: {writer_buffer.data_buffer_start}")
        print(f" Data buffer end: {writer_buffer.data_buffer_end}")
        print(f" Monotonic ID start: {writer_buffer.monotonic_id_start}")
        print(f" Monotonic ID end: {writer_buffer.monotonic_id_end}")
        print(f" Metadata entries: {len(writer_buffer.metadata)}")

        # Try to read back the data
        print("\nReading back data...")
        for address, size, monotonic_id in address_array:
            with reader_buffer.access_buf(address) as (data_buf, metadata):
                # Find null terminator or read first 50 chars
                data_bytes = bytes(data_buf[0:size])
                message = data_bytes.decode()
                print(f" ID {monotonic_id}: '{message}'")

    except Exception as e:
        print(f"Demo error: {e}")
        traceback.print_exc()

    print("\n=== Demo Complete ===")


if __name__ == "__main__":
    main()
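
The manual demo above keeps the writer and the reader in the same process. As a cross-process illustration of the same API (a sketch only, not part of this commit, and assuming the tuple returned by handle() can be pickled and sent to a child process):

# Illustrative cross-process sketch built on the API exercised by the tests above.
import multiprocessing as mp

from vllm.distributed.device_communicators.shm_object_storage import (
    SingleWriterShmRingBuffer)


def reader_proc(handle, address, length):
    # Re-attach to the writer's shared memory segment from another process.
    reader = SingleWriterShmRingBuffer(*handle)
    with reader.access_buf(address) as (data_buf, metadata):
        print("child read:", bytes(data_buf[:length]))


if __name__ == "__main__":
    writer = SingleWriterShmRingBuffer(data_buffer_size=4096, create=True)
    payload = b"hello from the writer"
    address, monotonic_id = writer.allocate_buf(len(payload))
    with writer.access_buf(address) as (data_buf, metadata):
        data_buf[:len(payload)] = payload

    child = mp.Process(target=reader_proc,
                       args=(writer.handle(), address, len(payload)))
    child.start()
    child.join()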
