23 commits
c0d3784
Added the extra use_irope parameter in
tjtanaa Apr 7, 2025
2c2966c
Fix ROCm V1 Engine Fused MoE Bug
tjtanaa Apr 7, 2025
4d71ebe
Add warning message that V0 does not support irope
tjtanaa Apr 7, 2025
c4fc335
Expose torch.Tag for tensor stride handling
kliuae Apr 8, 2025
f8e76ec
fix linting issue
tjtanaa Apr 8, 2025
f68214d
Merge remote-tracking branch 'origin/main' into llama4-fp8
tjtanaa Apr 8, 2025
1632b77
add initial code path of CompressedTensorsW8A8Fp8MoEAiterMethod
tjtanaa Apr 8, 2025
60e1e19
enable aiter tkw1 kernel on compressed tensors moe
vllmellm Apr 9, 2025
e2104d7
separate out the tkw1 kernel from asm moe
vllmellm Apr 9, 2025
ec9332b
fix spelling typo
vllmellm Apr 9, 2025
645a87b
remove unused code
vllmellm Apr 9, 2025
f5ede3f
add support for aiter moe ops to be registered in v1 graph mode
vllmellm Apr 10, 2025
6ebe26a
aiter asm_moe_tkw1 mem access fault bug fix (WIP)
vllmellm Apr 10, 2025
e9ce59f
linting and update fake tensor function
tjtanaa Apr 10, 2025
3ce36d3
fix the V1 cuda graph mode
tjtanaa Apr 11, 2025
f12b056
clean up
tjtanaa Apr 11, 2025
cdddd61
merge with main
tjtanaa Apr 11, 2025
36c671b
remove kwargs from rocm_aiter wrapper
tjtanaa Apr 13, 2025
1bcf1d7
Merge remote-tracking branch 'origin/main' into llama4-fp8
vllmellm Apr 14, 2025
b98b224
update aiter compressed tensor moe method
vllmellm Apr 14, 2025
3fa80b6
restrict apply_router_weight_on_input
tjtanaa Apr 15, 2025
449bdaf
fix some function signature; fix tkw1 bug
tjtanaa Apr 16, 2025
88977c2
enable tkw1 and ck_moe for V1 Llama4 with torch compile
tjtanaa Apr 17, 2025
9 changes: 9 additions & 0 deletions vllm/envs.py
@@ -78,6 +78,7 @@
VLLM_ROCM_USE_AITER_LINEAR: bool = True
VLLM_ROCM_USE_AITER_MOE: bool = True
VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE: bool = False
VLLM_ROCM_USE_AITER_FP8_TKW1_MOE: bool = False
VLLM_ROCM_USE_AITER_RMSNORM: bool = True
VLLM_ROCM_FP8_PADDING: bool = True
VLLM_ROCM_MOE_PADDING: bool = True
@@ -553,6 +554,14 @@ def maybe_convert_int(value: Optional[str]) -> Optional[int]:
(os.getenv("VLLM_ROCM_USE_AITER_FP8_BLOCK_SCALED_MOE", "false").lower() in
("true", "1")),

# TODO: change this back to False
# Whether to use aiter custom topk weight multiplication first
# channel scaled moe kernel. (This is for Llama-4)
# By default this is disabled.
"VLLM_ROCM_USE_AITER_FP8_TKW1_MOE":
lambda: (os.getenv("VLLM_ROCM_USE_AITER_FP8_TKW1_MOE", "false").lower() in
("true", "1")),

# use aiter rms norm op if aiter ops are enabled.
"VLLM_ROCM_USE_AITER_RMSNORM":
lambda: (os.getenv("VLLM_ROCM_USE_AITER_RMSNORM", "True").lower() in
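For reviewers who want to exercise the new flag locally, here is a minimal sketch of how it is resolved, assuming `vllm.envs` keeps its lazy, lambda-per-variable lookup; the variable names are taken from the hunk above, and the snippet is illustrative rather than part of this diff.

```python
# Minimal sketch (not part of this diff): opt in to the AITER tkw1 MoE path.
# Assumes vllm.envs resolves each variable lazily via its lambda on access.
import os

# The new flag defaults to "false"; "1" or "true" enables it.
os.environ["VLLM_ROCM_USE_AITER_FP8_TKW1_MOE"] = "1"

import vllm.envs as envs

# Evaluates the lambda shown in the hunk above against the current environment.
print(envs.VLLM_ROCM_USE_AITER_FP8_TKW1_MOE)  # True once the variable is set
```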
4 changes: 2 additions & 2 deletions vllm/model_executor/layers/fused_moe/fused_moe.py
@@ -25,7 +25,7 @@

from .rocm_aiter_fused_moe import (is_rocm_aiter_moe_enabled,
rocm_aiter_fused_experts,
rocm_aiter_topk_softmax)
rocm_aiter_topk_softmax_wrapper)

logger = init_logger(__name__)

@@ -842,7 +842,7 @@ def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,

def dispatch_topk_func() -> Callable[..., tuple[torch.Tensor, ...]]:
if is_rocm_aiter_moe_enabled():
return rocm_aiter_topk_softmax
return rocm_aiter_topk_softmax_wrapper
return vllm_topk_softmax


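For context, a condensed sketch of the dispatch pattern this hunk changes, assuming the enablement check and the two callables named in the diff; the function bodies are illustrative stand-ins, not the real kernels.

```python
# Illustrative stand-in for the dispatch pattern in fused_moe.py; only the
# names come from the diff, the bodies here are placeholders.
from typing import Callable

import torch


def is_rocm_aiter_moe_enabled() -> bool:
    # Stand-in for the real check, which gates on ROCm plus the
    # VLLM_ROCM_USE_AITER* environment flags shown in envs.py above.
    return False


def vllm_topk_softmax(topk_weights: torch.Tensor, topk_indices: torch.Tensor,
                      *args) -> tuple[torch.Tensor, ...]:
    # Default top-k softmax op (placeholder body).
    return topk_weights, topk_indices


def rocm_aiter_topk_softmax_wrapper(topk_weights: torch.Tensor,
                                    topk_indices: torch.Tensor,
                                    *args) -> tuple[torch.Tensor, ...]:
    # AITER-backed wrapper; the rename in this PR points the dispatcher here.
    return topk_weights, topk_indices


def dispatch_topk_func() -> Callable[..., tuple[torch.Tensor, ...]]:
    # Select the AITER wrapper only when AITER MoE is enabled;
    # otherwise fall back to the default op.
    if is_rocm_aiter_moe_enabled():
        return rocm_aiter_topk_softmax_wrapper
    return vllm_topk_softmax
```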