2 changes: 2 additions & 0 deletions requirements/docs.txt
@@ -13,6 +13,8 @@ ruff
 # Required for argparse hook only
 -f https://download.pytorch.org/whl/cpu
+cachetools
+cloudpickle
Member: Strange that organising the utils would cause this to be required; is the mocking not working properly for this one?

Member Author: The model executor (imported by the docs) now imports serial_utils, which in turn imports cloudpickle.

Member: OK, these are relatively cheap to install, so it's not a big problem. It'd be nice to figure out how to remove them in future.

 py-cpuinfo
 msgspec
 pydantic
 torch
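
The "mocking" discussed in the thread above usually means replacing heavy imports with stand-ins at docs-build time. A minimal sketch of that technique, assuming a MagicMock-in-sys.modules approach; the module names below are illustrative, not vLLM's actual mock list:

```python
# Sketch only: stub heavy dependencies before importing documented modules,
# so the docs build does not need the real packages installed.
import sys
from unittest.mock import MagicMock

MOCKED_MODULES = ["torch", "msgspec"]  # illustrative names, not vLLM's list

for name in MOCKED_MODULES:
    sys.modules[name] = MagicMock()

# From here on, `import torch` resolves to the stub instead of raising
# ModuleNotFoundError. Anything the mock list misses (cloudpickle here)
# still has to be installed for real, which is what this change does.
import torch  # noqa: E402
```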
1 change: 0 additions & 1 deletion tools/pre_commit/check_pickle_imports.py
@@ -39,7 +39,6 @@
     "vllm/v1/executor/multiproc_executor.py",
     "vllm/v1/executor/ray_executor.py",
     "vllm/entrypoints/llm.py",
-    "vllm/utils/__init__.py",
     "tests/utils.py",
     # pickle and cloudpickle
     "vllm/v1/serial_utils.py",
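
Dropping vllm/utils/__init__.py from this allowlist asserts that the module no longer imports pickle at all. For context, a hypothetical sketch of what an allowlist-based hook like this does; the real check_pickle_imports.py may differ in detail:

```python
# Hypothetical sketch of an allowlist-based pickle-import check; the real
# tools/pre_commit/check_pickle_imports.py may differ.
import pathlib
import re
import sys

ALLOWED = {"vllm/v1/serial_utils.py"}  # illustrative subset of the allowlist
PICKLE_IMPORT = re.compile(
    r"^\s*(?:import|from)\s+(?:pickle|cloudpickle)\b", re.MULTILINE
)

def check(paths: list[str]) -> int:
    """Return 1 if any non-allowlisted file imports pickle/cloudpickle."""
    bad = [
        p for p in paths
        if p not in ALLOWED and PICKLE_IMPORT.search(pathlib.Path(p).read_text())
    ]
    for p in bad:
        print(f"{p}: pickle import outside the allowlist")
    return 1 if bad else 0

if __name__ == "__main__":
    sys.exit(check(sys.argv[1:]))
```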
23 changes: 23 additions & 0 deletions vllm/config/model.py
@@ -1618,6 +1618,29 @@ def is_encoder_decoder(self) -> bool:
         """Extract the HF encoder/decoder model flag."""
         return is_encoder_decoder(self.hf_config)

+    @property
+    def uses_alibi(self) -> bool:
+        cfg = self.hf_text_config
+
+        return (
+            getattr(cfg, "alibi", False)  # Falcon
+            or "BloomForCausalLM" in self.architectures  # Bloom
+            or getattr(cfg, "position_encoding_type", "") == "alibi"  # codellm_1b_alibi
+            or (
+                hasattr(cfg, "attn_config")  # MPT
+                and (
+                    (
+                        isinstance(cfg.attn_config, dict)
+                        and cfg.attn_config.get("alibi", False)
+                    )
+                    or (
+                        not isinstance(cfg.attn_config, dict)
+                        and getattr(cfg.attn_config, "alibi", False)
+                    )
+                )
+            )
+        )
+
     @property
     def uses_mrope(self) -> bool:
         return uses_mrope(self.hf_config)
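
The new property folds several model-specific ways of declaring ALiBi into one check. Below is a hedged, standalone restatement of that logic plus hypothetical stand-in configs for each branch; real configs are transformers PretrainedConfig objects, not SimpleNamespace:

```python
# Hypothetical stand-ins for the HF config shapes uses_alibi checks.
from types import SimpleNamespace

falcon_like = SimpleNamespace(alibi=True)                    # Falcon: boolean flag
codellm_like = SimpleNamespace(position_encoding_type="alibi")
mpt_dict_like = SimpleNamespace(attn_config={"alibi": True})  # MPT, dict form
mpt_obj_like = SimpleNamespace(attn_config=SimpleNamespace(alibi=True))  # MPT, object form

def uses_alibi_sketch(cfg, architectures=()) -> bool:
    """Standalone restatement of the property's logic, for illustration."""
    attn = getattr(cfg, "attn_config", None)
    if isinstance(attn, dict):
        mpt_alibi = attn.get("alibi", False)
    else:
        mpt_alibi = getattr(attn, "alibi", False)
    return (
        getattr(cfg, "alibi", False)
        or "BloomForCausalLM" in architectures
        or getattr(cfg, "position_encoding_type", "") == "alibi"
        or bool(mpt_alibi)
    )

assert all(map(uses_alibi_sketch, [falcon_like, codellm_like, mpt_dict_like, mpt_obj_like]))
```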
28 changes: 27 additions & 1 deletion vllm/config/vllm.py
@@ -2,12 +2,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import copy
+import getpass
 import hashlib
 import json
 import os
+import tempfile
+import threading
 import time
 from contextlib import contextmanager
 from dataclasses import replace
+from datetime import datetime
 from functools import lru_cache
 from pathlib import Path
 from typing import TYPE_CHECKING, Any, TypeVar
@@ -17,7 +21,7 @@
 from pydantic.dataclasses import dataclass

 import vllm.envs as envs
-from vllm.logger import init_logger
+from vllm.logger import enable_trace_function_call, init_logger
 from vllm.transformers_utils.runai_utils import is_runai_obj_uri
 from vllm.utils import random_uuid

@@ -206,6 +210,28 @@ def pad_for_cudagraph(self, batch_size: int) -> int:
         # i.e., batch_size <= self.compilation_config.max_cudagraph_capture_size
         return self.compilation_config.bs_to_padded_graph_size[batch_size]

+    def enable_trace_function_call_for_thread(self) -> None:
+        """
+        Set up function tracing for the current thread,
+        if enabled via the `VLLM_TRACE_FUNCTION` environment variable.
+        """
+        if envs.VLLM_TRACE_FUNCTION:
+            tmp_dir = tempfile.gettempdir()
+            # add username to tmp_dir to avoid permission issues
+            tmp_dir = os.path.join(tmp_dir, getpass.getuser())
+            filename = (
+                f"VLLM_TRACE_FUNCTION_for_process_{os.getpid()}"
+                f"_thread_{threading.get_ident()}_at_{datetime.now()}.log"
+            ).replace(" ", "_")
+            log_path = os.path.join(
+                tmp_dir,
+                "vllm",
+                f"vllm-instance-{self.instance_id}",
+                filename,
+            )
+            os.makedirs(os.path.dirname(log_path), exist_ok=True)
+            enable_trace_function_call(log_path)
+
     @staticmethod
     def _get_quantization_config(
         model_config: ModelConfig, load_config: LoadConfig
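
The path handling above is the new part; the actual tracing lives in vllm.logger.enable_trace_function_call. As a hedged sketch of what such a tracer can look like (the real implementation differs in detail): sys.settrace installs a hook that fires on every Python function call, and it only affects the calling thread, which is why the setup is per-thread:

```python
# Minimal sketch of a sys.settrace-based call tracer; vLLM's actual
# enable_trace_function_call is more elaborate.
import sys
from datetime import datetime

def enable_trace_function_call_sketch(log_path: str) -> None:
    log_file = open(log_path, "a")  # kept open for the lifetime of the tracer

    def tracer(frame, event, arg):
        if event == "call":
            code = frame.f_code
            log_file.write(
                f"{datetime.now()} call {code.co_name} "
                f"({code.co_filename}:{frame.f_lineno})\n"
            )
        return tracer  # keep tracing nested calls

    # sys.settrace only affects the calling thread, hence the per-thread setup.
    sys.settrace(tracer)
```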
3 changes: 1 addition & 2 deletions vllm/engine/arg_utils.py
@@ -73,7 +73,7 @@
 from vllm.logger import init_logger
 from vllm.platforms import CpuArchEnum, current_platform
 from vllm.plugins import load_general_plugins
-from vllm.ray.lazy_utils import is_ray_initialized
+from vllm.ray.lazy_utils import is_in_ray_actor, is_ray_initialized
 from vllm.reasoning import ReasoningParserManager
 from vllm.test_utils import MODEL_WEIGHTS_S3_BUCKET, MODELS_ON_S3
 from vllm.transformers_utils.config import (
@@ -82,7 +82,6 @@
     maybe_override_with_speculators,
 )
 from vllm.transformers_utils.utils import check_gguf_file
-from vllm.utils import is_in_ray_actor
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.utils.network_utils import get_ip
2 changes: 1 addition & 1 deletion vllm/entrypoints/anthropic/api_server.py
@@ -51,9 +51,9 @@
     with_cancellation,
 )
 from vllm.logger import init_logger
-from vllm.utils import set_ulimit
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import is_valid_ipv6_address
+from vllm.utils.system_utils import set_ulimit
 from vllm.version import __version__ as VLLM_VERSION

 prometheus_multiproc_dir: tempfile.TemporaryDirectory
3 changes: 2 additions & 1 deletion vllm/entrypoints/api_server.py
@@ -26,8 +26,9 @@
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import random_uuid, set_ulimit
+from vllm.utils import random_uuid
 from vllm.utils.argparse_utils import FlexibleArgumentParser
+from vllm.utils.system_utils import set_ulimit
 from vllm.version import __version__ as VLLM_VERSION

 logger = init_logger("vllm.entrypoints.api_server")
4 changes: 2 additions & 2 deletions vllm/entrypoints/openai/api_server.py
@@ -108,10 +108,10 @@
 from vllm.logger import init_logger
 from vllm.reasoning import ReasoningParserManager
 from vllm.usage.usage_lib import UsageContext
-from vllm.utils import Device, set_ulimit
+from vllm.utils import Device
 from vllm.utils.argparse_utils import FlexibleArgumentParser
 from vllm.utils.network_utils import is_valid_ipv6_address
-from vllm.utils.system_utils import decorate_logs
+from vllm.utils.system_utils import decorate_logs, set_ulimit
 from vllm.v1.engine.exceptions import EngineDeadError
 from vllm.v1.metrics.prometheus import get_prometheus_registry
 from vllm.version import __version__ as VLLM_VERSION
2 changes: 1 addition & 1 deletion vllm/platforms/__init__.py
@@ -60,7 +60,7 @@ def cuda_platform_plugin() -> str | None:
     is_cuda = False
     logger.debug("Checking if CUDA platform is available.")
     try:
-        from vllm.utils import import_pynvml
+        from vllm.utils.import_utils import import_pynvml

         pynvml = import_pynvml()
         pynvml.nvmlInit()
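
Only the import path changes here. For background, a wrapper like import_pynvml is useful because the PyPI package named pynvml is unofficial and can shadow NVIDIA's nvidia-ml-py. A hedged sketch of the idea; the bundled-module path below is an assumption, not necessarily vLLM's real layout:

```python
# Hedged sketch of the import_pynvml idea (assumption: vLLM's real helper in
# vllm.utils.import_utils differs in detail): return NVML bindings known to
# expose the official API rather than whatever `pynvml` happens to be installed.
def import_pynvml_sketch():
    try:
        import vllm.third_party.pynvml as pynvml  # assumed bundled copy
    except ImportError:
        import pynvml  # fall back to the installed bindings
    return pynvml

# Usage mirrors the probe in the diff above.
def cuda_available() -> bool:
    try:
        pynvml = import_pynvml_sketch()
        pynvml.nvmlInit()
        try:
            return pynvml.nvmlDeviceGetCount() > 0
        finally:
            pynvml.nvmlShutdown()
    except Exception:
        return False
```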
2 changes: 1 addition & 1 deletion vllm/platforms/cuda.py
@@ -16,7 +16,7 @@
 import vllm._C  # noqa
 import vllm.envs as envs
 from vllm.logger import init_logger
-from vllm.utils import import_pynvml
+from vllm.utils.import_utils import import_pynvml
 from vllm.utils.torch_utils import cuda_device_count_stateless

 from .interface import DeviceCapability, Platform, PlatformEnum