
Commit 82057db

ilmarkov committed

Addressing comments
Signed-off-by: ilmarkov <[email protected]>
1 parent 703b9db commit 82057db

6 files changed, +103 -85 lines changed

vllm/compilation/collective_fusion.py

Lines changed: 23 additions & 5 deletions
@@ -19,8 +19,7 @@
 from vllm.model_executor.layers.quantization.utils.quant_utils import (
     GroupShape)
 from vllm.platforms import current_platform
-from vllm.utils import (_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES,
-                        direct_register_custom_op, flashinfer_max_size)
+from vllm.utils import direct_register_custom_op

 from .inductor_pass import enable_fake_mode
 from .vllm_inductor_pass import VllmInductorPass
@@ -398,6 +397,22 @@ def __call__(self, graph: fx.Graph):
 if flashinfer_comm is not None:
     _FI_WORKSPACE_TENSOR = None

+    MiB = 1024 * 1024
+    # Max size of the input tensor per world size per device capability
+    # to use flashinfer one shot fused allreduce
+    _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES = {
+        "9.0": {
+            2: 32 * MiB,  # 32MB
+            4: 2 * MiB,  # 2MB
+            8: 1 * MiB,  # 1MB
+        },
+        "10.0": {
+            2: 32 * MiB,  # 32MB
+            4: 4 * MiB,  # 4MB
+            8: 1 * MiB,  # 1MB
+        },
+    }
+
     def call_trtllm_fused_allreduce_norm(
         allreduce_in: torch.Tensor,
         residual: torch.Tensor,
@@ -425,9 +440,11 @@ def call_trtllm_fused_allreduce_norm(
             f"element size {element_size}"
         device_capability = current_platform.get_device_capability(
         ).as_version_str()
-        max_sizes = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES.get(device_capability, {})
         # Get one shot input size limit for the current world size
-        max_one_shot_size = max_sizes.get(world_size, None)
+        # for the current device capability
+        max_one_shot_size = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES. \
+            get(device_capability, {}). \
+            get(world_size, None)
         # Use one shot if no max size is specified
         use_oneshot = max_one_shot_size is None or \
             current_tensor_size <= max_one_shot_size
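For reference, a minimal standalone sketch of the one-shot decision implemented above (the table values are copied from this diff; the capability string, world size, and tensor size are illustrative inputs, not values computed here):

MiB = 1024 * 1024

_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES = {
    "9.0": {2: 32 * MiB, 4: 2 * MiB, 8: 1 * MiB},
    "10.0": {2: 32 * MiB, 4: 4 * MiB, 8: 1 * MiB},
}

def use_oneshot(device_capability: str, world_size: int,
                current_tensor_size: int) -> bool:
    # Two-level lookup: device capability first, then world size.
    max_one_shot_size = _FI_ALLREDUCE_ONE_SHOT_MAX_SIZES.get(
        device_capability, {}).get(world_size, None)
    # One shot is used when no limit applies or the input fits under it.
    return (max_one_shot_size is None
            or current_tensor_size <= max_one_shot_size)

# On capability "9.0" with world size 4, the cutoff is 2 MiB:
assert use_oneshot("9.0", 4, 1 * MiB)
assert not use_oneshot("9.0", 4, 4 * MiB)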
@@ -1449,7 +1466,8 @@ def __init__(self, config: VllmConfig):
                 "Flashinfer is not installed or comm module not found, "
                 "skipping allreduce fusion pass")
             return
-        max_size = flashinfer_max_size(self.tp_size, config)
+        max_size = config.compilation_config.\
+            pass_config.flashinfer_max_size(self.tp_size)
         if max_size is None:
             # Flashinfer doesn't support current world size
             logger.warning(

vllm/compilation/compiler_interface.py

Lines changed: 2 additions & 0 deletions
@@ -555,6 +555,8 @@ def set_inductor_config(config, compile_range):
     if isinstance(compile_range, tuple):
         # for a specific range of batchsizes, tuning triton kernel parameters
         # can be beneficial
+        # TODO(luka): max autotune is only present with -O3,
+        # and this should live in config: https://github.com/vllm-project/vllm/issues/20283
         config["max_autotune"] = True
         config["coordinate_descent_tuning"] = True
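A self-contained sketch of what this branch does to the inductor options dict (the function body mirrors the two assignments above; the range value is made up for the example):

def set_inductor_config(config: dict, compile_range) -> None:
    if isinstance(compile_range, tuple):
        # For a specific range of batch sizes, tuning triton kernel
        # parameters can be beneficial.
        config["max_autotune"] = True
        config["coordinate_descent_tuning"] = True

opts: dict = {}
set_inductor_config(opts, (1, 512))  # an illustrative compile range
assert opts["max_autotune"] and opts["coordinate_descent_tuning"]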

vllm/compilation/cuda_piecewise_backend.py

Lines changed: 1 addition & 16 deletions
@@ -50,22 +50,7 @@ def __init__(self, graph: fx.GraphModule, vllm_config: VllmConfig,

         self.is_full_graph = total_piecewise_compiles == 1

-        self.compile_sizes: set[int] = set(
-            self.compilation_config.compile_sizes)
-        self.compile_ranges_split_points: list[
-            int] = self.compilation_config.compile_ranges_split_points
-        self.compile_ranges = []
-        split_points = sorted(
-            set(self.compile_sizes).union(set(
-                self.compile_ranges_split_points)))
-        for i, s in enumerate(split_points):
-            if i == 0:
-                self.compile_ranges.append((1, s))
-            else:
-                self.compile_ranges.append((split_points[i - 1], s))
-            if s in self.compile_sizes:
-                self.compile_ranges.append((s, s))
-        self.compile_ranges = sorted(self.compile_ranges)
+        self.compile_ranges = self.compilation_config.get_compile_ranges()
         log_string = f"PiecewiseBackend: compile_ranges: {self.compile_ranges}"
         logger.debug_once(log_string)

vllm/config/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -50,7 +50,7 @@
 from vllm.transformers_utils.s3_utils import S3Model
 from vllm.transformers_utils.utils import is_s3, maybe_model_redirect
 from vllm.utils import (LayerBlockType, LazyLoader, common_broadcastable_dtype,
-                        flashinfer_max_size, random_uuid)
+                        random_uuid)

 if TYPE_CHECKING:
     from _typeshed import DataclassInstance
@@ -3877,7 +3877,8 @@ def _set_compile_ranges(self):
         # Add the compile ranges for flashinfer
         if compilation_config.pass_config.enable_fi_allreduce_fusion:
             tp_size = self.parallel_config.tensor_parallel_size
-            max_size = flashinfer_max_size(tp_size, self)
+            max_size = compilation_config.pass_config.flashinfer_max_size(
+                tp_size)
             if max_size is not None:
                 max_token_num = max_size // (
                     self.model_config.get_hidden_size() *
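To make the byte-to-token conversion concrete, a small worked sketch (the hidden size and element size are illustrative values, not taken from this diff):

MiB = 1024 * 1024
max_size = 2 * MiB      # e.g. the TP=4 threshold on capability "9.0"
hidden_size = 4096      # hypothetical model hidden size
element_size = 2        # bytes per element for fp16/bf16
max_token_num = max_size // (hidden_size * element_size)
assert max_token_num == 256  # 2,097,152 B // 8,192 B per token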

vllm/config/compilation.py

Lines changed: 74 additions & 4 deletions
@@ -94,10 +94,59 @@ class PassConfig:
     dictionary mapping each world size to the threshold in MB
     { <world size>: <max size in mb> }
     Unspecified world sizes will fallback to
-    { 2: 32, 4: 32, 8: 2 }"""
+    _FI_ALLREDUCE_MAX_INPUT_SIZES = {
+        "9.0": {
+            2: 64 * MiB,  # 64MB
+            4: 2 * MiB,  # 2MB
+            8: 1 * MiB,  # 1MB
+        },
+        "10.0": {
+            2: 64 * MiB,  # 64MB
+            4: 32 * MiB,  # 32MB
+            8: 1 * MiB,  # 1MB
+        },
+    }, where key is the device capability"""

     # TODO(luka) better pass enabling system.

+    def flashinfer_max_size(self, world_size: int) -> Optional[int]:
+        """
+        Returns the max communication size in bytes for flashinfer
+        allreduce fusion for the given world size. Falls back to
+        conservative defaults if the world size is not specified in config.
+        """
+
+        # import here to avoid circular dependencies
+        from vllm.platforms import current_platform
+        MiB = 1024 * 1024
+
+        # Max size of the input tensor per world size per device capability
+        # to use flashinfer fused allreduce
+        _FI_ALLREDUCE_MAX_INPUT_SIZES = {
+            "9.0": {
+                2: 64 * MiB,  # 64MB
+                4: 2 * MiB,  # 2MB
+                8: 1 * MiB,  # 1MB
+            },
+            "10.0": {
+                2: 64 * MiB,  # 64MB
+                4: 32 * MiB,  # 32MB
+                8: 1 * MiB,  # 1MB
+            },
+        }
+
+        device_capability = current_platform.get_device_capability(
+        ).as_version_str()
+        max_sizes = _FI_ALLREDUCE_MAX_INPUT_SIZES.get(device_capability, {})
+        max_sizes.update({
+            k: int(v * MiB)
+            for k, v in self.fi_allreduce_fusion_max_size_mb.items()
+        })
+        if world_size not in max_sizes:
+            # FlashInfer doesn't support other world sizes
+            return None
+        return max_sizes[world_size]
+
     def uuid(self):
         """
         Produces a hash unique to the pass configuration.
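A standalone sketch of the merge semantics in the new method: per-capability defaults, overridden per world size by fi_allreduce_fusion_max_size_mb (this helper and its arguments are illustrative stand-ins, not vLLM's API):

from typing import Optional

MiB = 1024 * 1024

_DEFAULTS = {
    "9.0": {2: 64 * MiB, 4: 2 * MiB, 8: 1 * MiB},
    "10.0": {2: 64 * MiB, 4: 32 * MiB, 8: 1 * MiB},
}

def flashinfer_max_size_sketch(
        world_size: int, device_capability: str,
        overrides_mb: dict[int, float]) -> Optional[int]:
    # Start from the per-capability defaults, then apply the MB overrides.
    max_sizes = dict(_DEFAULTS.get(device_capability, {}))
    max_sizes.update({k: int(v * MiB) for k, v in overrides_mb.items()})
    # Unsupported world sizes map to None.
    return max_sizes.get(world_size)

# Override the TP=4 threshold to 16 MB on capability "9.0":
assert flashinfer_max_size_sketch(4, "9.0", {4: 16}) == 16 * MiB
assert flashinfer_max_size_sketch(6, "9.0", {}) is None  # unsupported size

Returning None for an unlisted world size is what lets the caller in collective_fusion.py skip the fusion pass entirely rather than guess a threshold.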
@@ -223,9 +272,11 @@ class CompilationConfig:
     compile_ranges_split_points: Optional[list[int]] = None
     """Split points that represent compile ranges for inductor.
     The compile ranges are
-    [1, split_points[0]],
-    [split_points[0], split_points[1]], ...,
-    [split_points[-1], max_num_batched_tokens].
+    [1, split_points[0]),
+    [split_points[0], split_points[1]), ...,
+    [split_points[-1], max_num_batched_tokens + 1).
+    Compile sizes are also used as single-element ranges:
+    [compile_sizes[i], compile_sizes[i] + 1).
     """

     inductor_compile_config: dict = field(default_factory=dict)
@@ -579,3 +630,22 @@ def set_splitting_ops_for_v1(self):
     def splitting_ops_contain_attention(self) -> bool:
         return self.splitting_ops is not None and all(
             op in self.splitting_ops for op in self._attention_ops)
+
+    def get_compile_ranges(self) -> list[tuple[int, int]]:
+        """Get the compile ranges for the compilation config."""
+        compile_ranges_split_points = self.compile_ranges_split_points
+        compile_ranges: list[tuple[int, int]] = []
+        # max_num_batched_tokens + 1
+        max_split_point = max(compile_ranges_split_points)
+        split_points = sorted(
+            set(self.compile_sizes).union(set(compile_ranges_split_points)))
+        split_points = [x for x in split_points if x <= max_split_point]
+        for i, s in enumerate(split_points):
+            if i == 0:
+                compile_ranges.append((1, s))
+            else:
+                compile_ranges.append((split_points[i - 1], s))
+            if s in self.compile_sizes and s != 1:
+                compile_ranges.append((s, s))
+        return sorted(compile_ranges)
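A self-contained sketch of the range construction in get_compile_ranges, with made-up inputs (one compile size plus two split points; the values are illustrative only):

def compile_ranges_sketch(compile_sizes: set[int],
                          split_points_cfg: list[int]) -> list[tuple[int, int]]:
    # Mirrors the method above, with the config fields passed in directly.
    ranges: list[tuple[int, int]] = []
    max_split_point = max(split_points_cfg)  # max_num_batched_tokens + 1
    split_points = sorted(set(compile_sizes) | set(split_points_cfg))
    split_points = [x for x in split_points if x <= max_split_point]
    for i, s in enumerate(split_points):
        if i == 0:
            ranges.append((1, s))
        else:
            ranges.append((split_points[i - 1], s))
        if s in compile_sizes and s != 1:
            ranges.append((s, s))  # single-element range [s, s + 1)
    return sorted(ranges)

# With compile size 8 and split points [64, 8193]:
assert compile_ranges_sketch({8}, [64, 8193]) == [
    (1, 8), (8, 8), (8, 64), (64, 8193)]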

vllm/utils/__init__.py

Lines changed: 0 additions & 58 deletions
@@ -87,64 +87,6 @@
 POOLING_MODEL_MAX_NUM_BATCHED_TOKENS = 32768
 MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 5120

-# Max communication size for flashinfer fused allreduce
-MiB = 1024 * 1024
-
-# Max size of the input tensor per world size per device capability
-# to use flashinfer fused allreduce
-_FI_ALLREDUCE_MAX_INPUT_SIZES = {
-    "9.0": {
-        2: 64 * MiB,  # 64MB
-        4: 2 * MiB,  # 2MB
-        8: 1 * MiB,  # 1MB
-    },
-    "10.0": {
-        2: 64 * MiB,  # 64MB
-        4: 32 * MiB,  # 32MB
-        8: 1 * MiB,  # 1MB
-    },
-}
-
-# Max size of the input tensor per world size per device capability
-# to use flashinfer one shot fused allreduce
-_FI_ALLREDUCE_ONE_SHOT_MAX_SIZES = {
-    "9.0": {
-        2: 32 * MiB,  # 32MB
-        4: 2 * MiB,  # 2MB
-        8: 1 * MiB,  # 1MB
-    },
-    "10.0": {
-        2: 32 * MiB,  # 32MB
-        4: 4 * MiB,  # 4MB
-        8: 1 * MiB,  # 1MB
-    },
-}
-
-
-def flashinfer_max_size(world_size: int, config: VllmConfig) -> Optional[int]:
-    """
-    Returns the max communication size in bytes for flashinfer
-    allreduce fusion for the given world size. Falls back to
-    conservative defaults if the world size is not specified in config.
-    """
-
-    # import here to avoid circular dependencies
-    from vllm.platforms import current_platform
-
-    device_capability = current_platform.get_device_capability(
-    ).as_version_str()
-    max_sizes = _FI_ALLREDUCE_MAX_INPUT_SIZES.get(device_capability, {})
-    max_sizes.update({
-        k: int(v * MiB)
-        for k, v in config.compilation_config.pass_config.
-        fi_allreduce_fusion_max_size_mb.items()
-    })
-    if world_size not in max_sizes:
-        # FlashInfer doesn't support other world sizes
-        return None
-    return max_sizes[world_size]
-
-
 # Exception strings for non-implemented encoder/decoder scenarios

 # Reminder: Please update docs/features/compatibility_matrix.md
