diff --git a/vllm/model_executor/layers/fused_moe/fused_moe.py b/vllm/model_executor/layers/fused_moe/fused_moe.py
index f44328418f1b..df208eae2e71 100644
--- a/vllm/model_executor/layers/fused_moe/fused_moe.py
+++ b/vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -872,8 +872,10 @@ def get_moe_configs(
     for config_file_path in config_file_paths:
         if os.path.exists(config_file_path):
             with open(config_file_path) as f:
-                logger.info(
-                    "Using configuration from %s for MoE layer.", config_file_path
+                logger.info_once(
+                    "Using configuration from %s for MoE layer.",
+                    config_file_path,
+                    scope="global",
                 )
                 # If a configuration has been found, return it
                 tuned_config = json.load(f)
diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py
index 92fbdd709348..91bd45bf879c 100644
--- a/vllm/model_executor/layers/quantization/fp8.py
+++ b/vllm/model_executor/layers/quantization/fp8.py
@@ -162,9 +162,11 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
     # deepGEMM on supported platforms with block-quantized weights
     if envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM and block_quant:
         if not has_deep_gemm():
-            logger.warning_once("DeepGEMM backend requested but not available.")
+            logger.warning_once(
+                "DeepGEMM backend requested but not available.", scope="local"
+            )
         elif is_deep_gemm_supported():
-            logger.info_once("Using DeepGEMM backend for FP8 MoE")
+            logger.info_once("Using DeepGEMM backend for FP8 MoE", scope="local")
             return Fp8MoeBackend.DEEPGEMM
 
     # CUTLASS BlockScaled GroupedGemm on SM100 with block-quantized weights
@@ -173,7 +175,9 @@ def get_fp8_moe_backend(block_quant: bool) -> Fp8MoeBackend:
         and current_platform.is_device_capability(100)
         and block_quant
     ):
-        logger.info_once("Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE")
+        logger.info_once(
+            "Using Cutlass BlockScaled GroupedGemm backend for FP8 MoE", scope="local"
+        )
         return Fp8MoeBackend.CUTLASS_BLOCK_SCALED_GROUPED_GEMM
 
     # default to Triton
diff --git a/vllm/profiler/gpu_profiler.py b/vllm/profiler/gpu_profiler.py
index 2155b67a3db4..3e2cbe7296e9 100644
--- a/vllm/profiler/gpu_profiler.py
+++ b/vllm/profiler/gpu_profiler.py
@@ -139,18 +139,19 @@ def __init__(self, worker_name: str, local_rank: int) -> None:
         self.local_rank = local_rank
 
         torch_profiler_trace_dir = envs.VLLM_TORCH_PROFILER_DIR
-        logger.info(
-            "Torch profiling enabled. Traces will be saved to: %s",
-            torch_profiler_trace_dir,
-        )
-        logger.debug(
-            "Profiler config: record_shapes=%s,"
-            "profile_memory=%s,with_stack=%s,with_flops=%s",
-            envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
-            envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
-            envs.VLLM_TORCH_PROFILER_WITH_STACK,
-            envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
-        )
+        if local_rank in (None, 0):
+            logger.info(
+                "Torch profiling enabled. Traces will be saved to: %s",
+                torch_profiler_trace_dir,
+            )
+            logger.debug(
+                "Profiler config: record_shapes=%s,"
+                "profile_memory=%s,with_stack=%s,with_flops=%s",
+                envs.VLLM_TORCH_PROFILER_RECORD_SHAPES,
+                envs.VLLM_TORCH_PROFILER_WITH_PROFILE_MEMORY,
+                envs.VLLM_TORCH_PROFILER_WITH_STACK,
+                envs.VLLM_TORCH_PROFILER_WITH_FLOPS,
+            )
         self.profiler = torch.profiler.profile(
             activities=[
                 torch.profiler.ProfilerActivity.CPU,
diff --git a/vllm/v1/core/kv_cache_utils.py b/vllm/v1/core/kv_cache_utils.py
index 01ecd881115d..b18ba8e8b2c7 100644
--- a/vllm/v1/core/kv_cache_utils.py
+++ b/vllm/v1/core/kv_cache_utils.py
@@ -1236,10 +1236,11 @@ def _report_kv_cache_config(
     max_concurrency = get_max_concurrency_for_kv_cache_config(
         vllm_config, kv_cache_config
    )
-    logger.info(
+    logger.info_once(
         "Maximum concurrency for %s tokens per request: %.2fx",
         max_model_len_str,
         max_concurrency,
+        scope="local",
     )
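For context, the hunks above replace one-shot log statements with vLLM's deduplicating `logger.info_once`/`logger.warning_once` helpers and thread through a `scope` argument. Below is a minimal standalone sketch of such a once-per-scope helper. It is an illustration only, not vLLM's implementation (which lives in vllm/logger.py); the assumed semantics ("local" deduplicates within the current process, "global" additionally emits only from rank 0, read from the RANK environment variable) are inferred from the call sites above, not confirmed.

# Hypothetical sketch of a log-once helper in the spirit of the calls above.
# Assumptions (not vLLM's actual code): "local" scope logs a given message
# once per process; "global" scope logs it once and only from rank 0.
import logging
import os

_seen: set[tuple[str, str]] = set()


def info_once(log: logging.Logger, msg: str, *args, scope: str = "local") -> None:
    # Skip messages this process has already emitted under the same scope.
    if (scope, msg) in _seen:
        return
    _seen.add((scope, msg))
    # Under the assumed "global" scope, only rank 0 emits the message.
    if scope == "global" and int(os.environ.get("RANK", "0")) != 0:
        return
    log.info(msg, *args)

Under these assumptions, the choice of scope at each call site above makes sense: the MoE config message uses "global" because every rank would load the same tuned config, while the backend-selection and concurrency messages use "local" so that each process still reports its own decision once.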