11 changes: 9 additions & 2 deletions vllm/envs.py
@@ -55,6 +55,7 @@
VLLM_IMAGE_FETCH_TIMEOUT: int = 5
VLLM_VIDEO_FETCH_TIMEOUT: int = 30
VLLM_AUDIO_FETCH_TIMEOUT: int = 10
+VLLM_MM_INPUT_CACHE_SIZE: int = 256
VLLM_TARGET_DEVICE: str = "cuda"
MAX_JOBS: Optional[str] = None
NVCC_THREADS: Optional[str] = None
@@ -401,15 +402,21 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
lambda: int(os.getenv("VLLM_IMAGE_FETCH_TIMEOUT", "5")),

# Timeout for fetching videos when serving multimodal models
-# Default is 15 seconds
+# Default is 30 seconds
"VLLM_VIDEO_FETCH_TIMEOUT":
-lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "15")),
+lambda: int(os.getenv("VLLM_VIDEO_FETCH_TIMEOUT", "30")),
Comment on lines -404 to +407 (Member, Author):
This was previously addressed in #10495 but not entirely updated, so I fixed it in this PR too.


# Timeout for fetching audio when serving multimodal models
# Default is 10 seconds
"VLLM_AUDIO_FETCH_TIMEOUT":
lambda: int(os.getenv("VLLM_AUDIO_FETCH_TIMEOUT", "10")),

+# Cache size for the multimodal feature/input cache of multimodal models,
+# in units of multimodal data items (e.g. image, video, audio).
+# Default is 256 multimodal data items.
+"VLLM_MM_INPUT_CACHE_SIZE":
+lambda: int(os.getenv("VLLM_MM_INPUT_CACHE_SIZE", "256")),

# Path to the XLA persistent cache directory.
# Only used for XLA devices such as TPUs.
"VLLM_XLA_CACHE_PATH":
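With this change the cache bound is tunable without editing source. Below is a minimal usage sketch (not part of this PR); it assumes the value is read when the relevant vLLM modules are imported, so the variable must be set before importing vLLM, and the model name is only a placeholder.

import os

# Raise the multimodal input cache bound from the default 256 to 512 items.
# This must run before any vLLM import, since the value is read at import time.
os.environ["VLLM_MM_INPUT_CACHE_SIZE"] = "512"

from vllm import LLM  # noqa: E402  (import intentionally placed after the env tweak)

llm = LLM(model="llava-hf/llava-1.5-7b-hf")  # placeholder multimodal model

Setting the variable in the shell environment before launching the server would have the same effect.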
6 changes: 2 additions & 4 deletions vllm/multimodal/registry.py
@@ -8,6 +8,7 @@

import torch.nn as nn

+from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
from vllm.inputs import InputProcessingContext
from vllm.logger import init_logger
from vllm.transformers_utils.tokenizer import AnyTokenizer
@@ -28,9 +29,6 @@

logger = init_logger(__name__)

-# TODO: Tune the MM cache size
-MM_CACHE_SIZE = 256

N = TypeVar("N", bound=Type[nn.Module])
_I = TypeVar("_I", bound=BaseProcessingInfo)
_I_co = TypeVar("_I_co", bound=BaseProcessingInfo, covariant=True)
@@ -121,7 +119,7 @@ def __init__(

self._limits_by_model = _MultiModalLimits()

-self._processing_cache = ProcessingCache(MM_CACHE_SIZE)
+self._processing_cache = ProcessingCache(VLLM_MM_INPUT_CACHE_SIZE)

def register_plugin(self, plugin: MultiModalPlugin) -> None:
"""
12 changes: 7 additions & 5 deletions vllm/v1/engine/mm_input_cache.py
@@ -3,6 +3,7 @@
from typing import Any, Dict, List, Optional

from vllm.config import ModelConfig
+from vllm.envs import VLLM_MM_INPUT_CACHE_SIZE
from vllm.logger import init_logger
from vllm.multimodal import (MULTIMODAL_REGISTRY, MultiModalDataDict,
MultiModalKwargs, MultiModalRegistry)
@@ -28,9 +29,8 @@
# client (=P0) and server (=P1) processes.

# Both Client and Server must use the same cache size
-# (to perform mirrored caching)
-# TODO: Tune the MM cache size
-MM_CACHE_SIZE = 256
+# (to perform mirrored caching). This cache size is set by the environment
+# variable VLLM_MM_INPUT_CACHE_SIZE.


# TODO(ywang96): Deprecate this class once all multimodal models migrate to use
@@ -50,7 +50,8 @@ def __init__(

# Init cache
self.use_cache = not model_config.disable_mm_preprocessor_cache
-self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
+self.mm_cache = LRUCache[str,
+                         MultiModalKwargs](VLLM_MM_INPUT_CACHE_SIZE)

# DEBUG: Set to None to disable
self.mm_debug_cache_hit_ratio_steps = None
@@ -127,7 +128,8 @@ class MMInputCacheServer:

def __init__(self, model_config):
self.use_cache = not model_config.disable_mm_preprocessor_cache
-self.mm_cache = LRUCache[str, MultiModalKwargs](MM_CACHE_SIZE)
+self.mm_cache = LRUCache[str,
+                         MultiModalKwargs](VLLM_MM_INPUT_CACHE_SIZE)

def get_and_update(
self,
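To make the mirrored-caching comment in vllm/v1/engine/mm_input_cache.py concrete, here is a hypothetical, minimal sketch. The class and method names are invented and this is not vLLM's actual client/server API: it only illustrates why both processes must use the same cache size, namely that identical capacities and identical get/put sequences keep the two LRU caches in lockstep, so a client-side hit guarantees the server still holds the item and only the key needs to be sent.

# Hypothetical sketch of mirrored LRU caching between the frontend client
# (P0) and the engine-side server (P1); names invented for illustration.
# Both sides start empty and apply the same get/put sequence to caches of
# the same capacity, so their eviction order stays identical.
from collections import OrderedDict
from typing import Any, Optional

MM_INPUT_CACHE_SIZE = 256  # both sides must agree on this bound


class MirroredLRU:
    def __init__(self, capacity: int) -> None:
        self.capacity = capacity
        self._data: "OrderedDict[str, Any]" = OrderedDict()

    def get(self, key: str) -> Optional[Any]:
        if key not in self._data:
            return None
        self._data.move_to_end(key)  # mark as most recently used
        return self._data[key]

    def put(self, key: str, value: Any) -> None:
        self._data[key] = value
        self._data.move_to_end(key)
        if len(self._data) > self.capacity:
            self._data.popitem(last=False)  # evict least recently used


class ServerSide:
    def __init__(self) -> None:
        self.cache = MirroredLRU(MM_INPUT_CACHE_SIZE)

    def receive(self, key: str, item: Optional[Any]) -> Any:
        if item is not None:           # client miss: full payload was sent
            self.cache.put(key, item)
            return item
        return self.cache.get(key)     # client hit: key only, mirror has it


class ClientSide:
    def __init__(self, server: ServerSide) -> None:
        self.cache = MirroredLRU(MM_INPUT_CACHE_SIZE)
        self.server = server

    def send(self, key: str, processed_item: Any) -> Any:
        if self.cache.get(key) is None:
            self.cache.put(key, processed_item)
            return self.server.receive(key, processed_item)
        return self.server.receive(key, None)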