From 0571d047e46dd596f5b996bbf787279193cb80a6 Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Thu, 13 Feb 2025 03:58:03 +0000 Subject: [PATCH 01/10] DeepSeek performance improvements Signed-off-by: Randall Smith --- benchmarks/kernels/benchmark_moe.py | 16 +- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 ++++++++++++++ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ++++++++++++++++++ .../layers/quantization/utils/fp8_utils.py | 48 ++--- 51 files changed, 7169 insertions(+), 31 deletions(-) create mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index a4a45c9cbff2..6b5d5454ee15 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -208,12 +208,12 @@ def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, - search_space, is_fp16): + search_space, is_fp16, topk): N1, K1 = shard_intermediate_size, hidden_size N2, K2 = hidden_size, shard_intermediate_size // 2 - pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space, + pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1, search_space, is_fp16) - pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space, + pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2, search_space, is_fp16) search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) return search_space @@ -380,7 +380,7 @@ def tune( search_space = prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, search_space, - is_fp16) + is_fp16, topk) with torch.cuda.device(self.device_id): for config in tqdm(search_space): @@ -437,7 +437,7 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, topk: int, dtype: torch.dtype, use_fp8_w8a8: bool, - use_int8_w8a16: bool) -> None: + use_int8_w8a16: bool, block_quant_shape: List[int]) -> None: dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) @@ -445,7 +445,7 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. filename = get_config_file_name(num_experts, shard_intermediate_size // 2, - dtype_str) + dtype_str, block_quant_shape) print(f"Writing best config to {filename}...") with open(filename, "w") as f: @@ -458,6 +458,7 @@ def main(args: argparse.Namespace): config = AutoConfig.from_pretrained( args.model, trust_remote_code=args.trust_remote_code) + block_quant_shape = None if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k @@ -473,6 +474,7 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size + block_quant_shape = [128, 128] else: # Default: Mixtral. E = config.num_local_experts @@ -523,7 +525,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: for M, config in zip(batch_sizes, configs) } save_configs(best_configs, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16) + topk, dtype, use_fp8_w8a8, use_int8_w8a16, block_quant_shape) end = time.time() print(f"Tuning took {end - start:.2f} seconds") else: diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..2b1167fc71e2 --- /dev/null +++ b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 32, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "2": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 16, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "4": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "8": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "16": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "24": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 128, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "32": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "48": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "64": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "96": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "128": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 256, + "GROUP_SIZE_M": 1, + "num_warps": 2, + "num_stages": 2, + "waves_per_eu": 0 + }, + "256": { + "BLOCK_SIZE_M": 16, + "BLOCK_SIZE_N": 64, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 4, + "num_warps": 4, + "num_stages": 2, + "waves_per_eu": 0 + }, + "512": { + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + }, + "1024": { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 256, + "BLOCK_SIZE_K": 128, + "GROUP_SIZE_M": 8, + "num_warps": 8, + "num_stages": 2, + "waves_per_eu": 0 + } +} diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..d1d2f6cfea80 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..e9a50e1d651f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..44fe3be6e468 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..b3bf9ea26bee --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..0532845fa352 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..defaacb32030 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..0aa55c156b30 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..987c8f600ea1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..e9a26c62493a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..b3ed43aafbd0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..57f8e00d75d8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..03e8235353c7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..901f7cef7483 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..1a457b92a0ba --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..eaa079feb3a5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..4415cc9d0bfa --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..2b60a1c67eda --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..7c039b409acb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..18573585c2a8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..5c604b9b6d9a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..298b704da6a6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..b4d25aef96ec --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..9e4d91bbbc17 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..634c1bfab62a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..f9a454e7fcec --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..19452dfe77b8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..00166252e4a3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..e6d910735f3a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..d4624bd8f28c --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..c298da80a937 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..7303c2166213 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..cb993c878fc9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..616536c9ac62 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..56d3e1feea23 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..b4be351e6295 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..9cdff134dba1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..006fd458e7e9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..6f9bd755cdad --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..850fac7f3f8f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..c7122d3b960b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..92892b065212 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..3cea21b4d722 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..b08fa8c54d5f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..a8141f535bcf --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..b7882b061d15 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..c9566d713260 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..b971426f3435 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json new file mode 100644 index 000000000000..e4716875871f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 9895537c219a..8a70b2d56243 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -446,14 +446,14 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, return None -def w8a8_block_fp8_matmul( - A: torch.Tensor, - B: torch.Tensor, - As: torch.Tensor, - Bs: torch.Tensor, - block_size: List[int], - output_dtype: torch.dtype = torch.float16, -) -> torch.Tensor: +def w8a8_block_fp8_matmul(A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: List[int], + output_dtype: torch.dtype = torch.float16, + tune_config=None, + use_default_config=False) -> torch.Tensor: """This function performs matrix multiplication with block-wise quantization. It takes two input tensors `A` and `B` with scales `As` and `Bs`. @@ -485,22 +485,22 @@ def w8a8_block_fp8_matmul( C_shape = A.shape[:-1] + (N, ) C = A.new_empty(C_shape, dtype=output_dtype) - configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1]) - if configs: - # Get the optimal config if there is one - config = configs[min(configs.keys(), key=lambda x: abs(x - M))] - else: - # Default config - # Block-wise quant: BLOCK_SIZE_N must be divisible by block_size[0] - # BLOCK_SIZE_K must be divisible by block_size[1] - config = { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": block_size[0], - "BLOCK_SIZE_K": block_size[1], - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2, - } + default_config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_size[0], + "BLOCK_SIZE_K": block_size[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + } + + config = default_config if use_default_config else tune_config + if config is None: + configs = get_w8a8_block_fp8_configs(N, K, block_size[0], + block_size[1]) + config = configs[min( + configs.keys(), + key=lambda x: abs(x - M))] if configs else default_config def grid(META): return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * From 974e422c56f260d7ab0057c1574ecde6c46d7e16 Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Thu, 13 Feb 2025 04:11:36 +0000 Subject: [PATCH 02/10] revert fp8_utils.py Signed-off-by: Randall Smith --- .../layers/quantization/utils/fp8_utils.py | 48 +++++++++---------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 8a70b2d56243..9895537c219a 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -446,14 +446,14 @@ def get_w8a8_block_fp8_configs(N: int, K: int, block_n: int, return None -def w8a8_block_fp8_matmul(A: torch.Tensor, - B: torch.Tensor, - As: torch.Tensor, - Bs: torch.Tensor, - block_size: List[int], - output_dtype: torch.dtype = torch.float16, - tune_config=None, - use_default_config=False) -> torch.Tensor: +def w8a8_block_fp8_matmul( + A: torch.Tensor, + B: torch.Tensor, + As: torch.Tensor, + Bs: torch.Tensor, + block_size: List[int], + output_dtype: torch.dtype = torch.float16, +) -> torch.Tensor: """This function performs matrix multiplication with block-wise quantization. It takes two input tensors `A` and `B` with scales `As` and `Bs`. @@ -485,22 +485,22 @@ def w8a8_block_fp8_matmul(A: torch.Tensor, C_shape = A.shape[:-1] + (N, ) C = A.new_empty(C_shape, dtype=output_dtype) - default_config = { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": block_size[0], - "BLOCK_SIZE_K": block_size[1], - "GROUP_SIZE_M": 32, - "num_warps": 4, - "num_stages": 2, - } - - config = default_config if use_default_config else tune_config - if config is None: - configs = get_w8a8_block_fp8_configs(N, K, block_size[0], - block_size[1]) - config = configs[min( - configs.keys(), - key=lambda x: abs(x - M))] if configs else default_config + configs = get_w8a8_block_fp8_configs(N, K, block_size[0], block_size[1]) + if configs: + # Get the optimal config if there is one + config = configs[min(configs.keys(), key=lambda x: abs(x - M))] + else: + # Default config + # Block-wise quant: BLOCK_SIZE_N must be divisible by block_size[0] + # BLOCK_SIZE_K must be divisible by block_size[1] + config = { + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": block_size[0], + "BLOCK_SIZE_K": block_size[1], + "GROUP_SIZE_M": 32, + "num_warps": 4, + "num_stages": 2, + } def grid(META): return (triton.cdiv(M, META["BLOCK_SIZE_M"]) * From defd28a0bf171c8083023a9aa1e3dd4867c67671 Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Thu, 13 Feb 2025 04:23:09 +0000 Subject: [PATCH 03/10] yapf Signed-off-by: Randall Smith --- benchmarks/kernels/benchmark_moe.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index 6b5d5454ee15..b75d8dc67ff6 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -211,10 +211,10 @@ def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, search_space, is_fp16, topk): N1, K1 = shard_intermediate_size, hidden_size N2, K2 = hidden_size, shard_intermediate_size // 2 - pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1, search_space, - is_fp16) - pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2, search_space, - is_fp16) + pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1, + search_space, is_fp16) + pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2, + search_space, is_fp16) search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) return search_space @@ -436,8 +436,8 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, topk: int, - dtype: torch.dtype, use_fp8_w8a8: bool, - use_int8_w8a16: bool, block_quant_shape: List[int]) -> None: + dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, + block_quant_shape: List[int]) -> None: dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) @@ -525,7 +525,8 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: for M, config in zip(batch_sizes, configs) } save_configs(best_configs, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16, block_quant_shape) + topk, dtype, use_fp8_w8a8, use_int8_w8a16, + block_quant_shape) end = time.time() print(f"Tuning took {end - start:.2f} seconds") else: From 4944224de5fed8d66f34504f0c2f44edbec7a37c Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Thu, 13 Feb 2025 17:32:48 +0000 Subject: [PATCH 04/10] remove space from file names Signed-off-by: Randall Smith --- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ ...dtype=fp8_w8a8,block_shape=[128, 128].json | 128 -------------- ...dtype=fp8_w8a8,block_shape=[128, 128].json | 164 ------------------ 48 files changed, 7008 deletions(-) delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index d1d2f6cfea80..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index e9a50e1d651f..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 44fe3be6e468..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index b3bf9ea26bee..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 0532845fa352..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index defaacb32030..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 0aa55c156b30..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 987c8f600ea1..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index e9a26c62493a..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index b3ed43aafbd0..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 57f8e00d75d8..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 03e8235353c7..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 901f7cef7483..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 1a457b92a0ba..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index eaa079feb3a5..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 4415cc9d0bfa..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 2b60a1c67eda..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 7c039b409acb..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 18573585c2a8..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 5c604b9b6d9a..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 298b704da6a6..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index b4d25aef96ec..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 9e4d91bbbc17..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 634c1bfab62a..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index f9a454e7fcec..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 19452dfe77b8..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 00166252e4a3..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index e6d910735f3a..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index d4624bd8f28c..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index c298da80a937..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 7303c2166213..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index cb993c878fc9..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 616536c9ac62..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 56d3e1feea23..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 128, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index b4be351e6295..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 9cdff134dba1..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 006fd458e7e9..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 6f9bd755cdad..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 850fac7f3f8f..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index c7122d3b960b..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 92892b065212..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index 3cea21b4d722..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index b08fa8c54d5f..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index a8141f535bcf..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 256, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index b7882b061d15..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index c9566d713260..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index b971426f3435..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json deleted file mode 100644 index e4716875871f..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json +++ /dev/null @@ -1,164 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "kpack": 1, - "matrix_instr_nonkdim": 16, - "num_warps": 4 - } -} \ No newline at end of file From 34e6b83537f3f486f56985a9ffa329111e7a51b7 Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Thu, 13 Feb 2025 17:33:45 +0000 Subject: [PATCH 05/10] remove white space from file names Signed-off-by: Randall Smith --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ++++++++++++++ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 164 ++++++++++++++++++ 48 files changed, 7008 insertions(+) create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json create mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..d1d2f6cfea80 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..e9a50e1d651f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..44fe3be6e468 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..b3bf9ea26bee --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..0532845fa352 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..defaacb32030 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..0aa55c156b30 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..987c8f600ea1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..e9a26c62493a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..b3ed43aafbd0 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..57f8e00d75d8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..03e8235353c7 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..901f7cef7483 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..1a457b92a0ba --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..eaa079feb3a5 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..4415cc9d0bfa --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..2b60a1c67eda --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..7c039b409acb --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..18573585c2a8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..5c604b9b6d9a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..298b704da6a6 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..b4d25aef96ec --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..9e4d91bbbc17 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..634c1bfab62a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..f9a454e7fcec --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..19452dfe77b8 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..00166252e4a3 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..e6d910735f3a --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..d4624bd8f28c --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..c298da80a937 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..7303c2166213 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..cb993c878fc9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..616536c9ac62 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..56d3e1feea23 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 128, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..b4be351e6295 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..9cdff134dba1 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..006fd458e7e9 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..6f9bd755cdad --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..850fac7f3f8f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..c7122d3b960b --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..92892b065212 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..3cea21b4d722 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..b08fa8c54d5f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 64, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 256, + "GROUP_SIZE_M": 1, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..a8141f535bcf --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 32, + "BLOCK_SIZE_M": 256, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..b7882b061d15 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..c9566d713260 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..b971426f3435 --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,128 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 256, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 1, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 128, + "GROUP_SIZE_M": 32, + "num_warps": 4 + } +} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json new file mode 100644 index 000000000000..e4716875871f --- /dev/null +++ b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json @@ -0,0 +1,164 @@ +{ + "1": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "8": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "16": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "24": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "32": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 16, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "48": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 8, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "64": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 32, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "96": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 32, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "128": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 32, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "256": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "512": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1024": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "1536": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "2048": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "3072": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + }, + "4096": { + "BLOCK_SIZE_K": 128, + "BLOCK_SIZE_M": 64, + "BLOCK_SIZE_N": 64, + "GROUP_SIZE_M": 1, + "kpack": 1, + "matrix_instr_nonkdim": 16, + "num_warps": 4 + } +} \ No newline at end of file From c7abb8f0f6bcd2486ec98650b70921a2895f6cbe Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Thu, 13 Feb 2025 19:24:57 +0000 Subject: [PATCH 06/10] remove XHF tunings Signed-off-by: Randall Smith --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ 24 files changed, 3072 deletions(-) delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json delete mode 100644 vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index d1d2f6cfea80..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 44fe3be6e468..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 0532845fa352..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 0aa55c156b30..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index e9a26c62493a..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 57f8e00d75d8..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 901f7cef7483..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index eaa079feb3a5..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 2b60a1c67eda..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 18573585c2a8..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 298b704da6a6..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 9e4d91bbbc17..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index f9a454e7fcec..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 00166252e4a3..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index d4624bd8f28c..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 7303c2166213..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 616536c9ac62..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index b4be351e6295..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 006fd458e7e9..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 850fac7f3f8f..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 92892b065212..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index b08fa8c54d5f..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 32, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 64, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 64, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index b7882b061d15..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index b971426f3435..000000000000 --- a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "2": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "4": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "8": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "16": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "24": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "32": { - "BLOCK_SIZE_K": 256, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "48": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 8, - "num_warps": 4 - }, - "64": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 32, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "96": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 16, - "num_warps": 4 - }, - "128": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "256": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4 - }, - "512": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1024": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "1536": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "2048": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "3072": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - }, - "4096": { - "BLOCK_SIZE_K": 128, - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 128, - "GROUP_SIZE_M": 32, - "num_warps": 4 - } -} \ No newline at end of file From 1b5690684de927341186fb68787b5f991f643db8 Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Thu, 13 Feb 2025 20:13:00 +0000 Subject: [PATCH 07/10] Remove space from config Signed-off-by: Randall Smith --- ...stinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename vllm/model_executor/layers/fused_moe/configs/{E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json => E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128, 128].json rename to vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json From af0a44d33ebdba40c1d0e17413646b90959b210c Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Thu, 13 Feb 2025 20:33:00 +0000 Subject: [PATCH 08/10] Remove changes to benchmark_moe.py Signed-off-by: Randall Smith --- benchmarks/kernels/benchmark_moe.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/benchmarks/kernels/benchmark_moe.py b/benchmarks/kernels/benchmark_moe.py index b75d8dc67ff6..a4a45c9cbff2 100644 --- a/benchmarks/kernels/benchmark_moe.py +++ b/benchmarks/kernels/benchmark_moe.py @@ -208,13 +208,13 @@ def get_configs_compute_bound(use_fp16) -> List[Dict[str, int]]: def prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, - search_space, is_fp16, topk): + search_space, is_fp16): N1, K1 = shard_intermediate_size, hidden_size N2, K2 = hidden_size, shard_intermediate_size // 2 - pruned_space_1 = prune_rocm_configs(num_tokens * topk, N1, K1, - search_space, is_fp16) - pruned_space_2 = prune_rocm_configs(num_tokens * topk, N2, K2, - search_space, is_fp16) + pruned_space_1 = prune_rocm_configs(num_tokens * 2, N1, K1, search_space, + is_fp16) + pruned_space_2 = prune_rocm_configs(num_tokens * 2, N2, K2, search_space, + is_fp16) search_space = merge_unique_dicts(pruned_space_1, pruned_space_2) return search_space @@ -380,7 +380,7 @@ def tune( search_space = prune_rocm_search_space(num_tokens, shard_intermediate_size, hidden_size, search_space, - is_fp16, topk) + is_fp16) with torch.cuda.device(self.device_id): for config in tqdm(search_space): @@ -436,8 +436,8 @@ def sort_config(config: BenchmarkConfig) -> BenchmarkConfig: def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, shard_intermediate_size: int, hidden_size: int, topk: int, - dtype: torch.dtype, use_fp8_w8a8: bool, use_int8_w8a16: bool, - block_quant_shape: List[int]) -> None: + dtype: torch.dtype, use_fp8_w8a8: bool, + use_int8_w8a16: bool) -> None: dtype_str = get_config_dtype_str(dtype, use_int8_w8a16=use_int8_w8a16, use_fp8_w8a8=use_fp8_w8a8) @@ -445,7 +445,7 @@ def save_configs(configs: Dict[int, BenchmarkConfig], num_experts: int, # NOTE(woosuk): The current naming convention uses w2.shape[2], which # is the intermediate size after silu_and_mul. filename = get_config_file_name(num_experts, shard_intermediate_size // 2, - dtype_str, block_quant_shape) + dtype_str) print(f"Writing best config to {filename}...") with open(filename, "w") as f: @@ -458,7 +458,6 @@ def main(args: argparse.Namespace): config = AutoConfig.from_pretrained( args.model, trust_remote_code=args.trust_remote_code) - block_quant_shape = None if config.architectures[0] == "DbrxForCausalLM": E = config.ffn_config.moe_num_experts topk = config.ffn_config.moe_top_k @@ -474,7 +473,6 @@ def main(args: argparse.Namespace): topk = config.num_experts_per_tok intermediate_size = config.moe_intermediate_size shard_intermediate_size = 2 * intermediate_size // args.tp_size - block_quant_shape = [128, 128] else: # Default: Mixtral. E = config.num_local_experts @@ -525,8 +523,7 @@ def _distribute(method: str, inputs: List[Any]) -> List[Any]: for M, config in zip(batch_sizes, configs) } save_configs(best_configs, E, shard_intermediate_size, hidden_size, - topk, dtype, use_fp8_w8a8, use_int8_w8a16, - block_quant_shape) + topk, dtype, use_fp8_w8a8, use_int8_w8a16) end = time.time() print(f"Tuning took {end - start:.2f} seconds") else: From 80510047e9cda2ac551de500e72e3599fb2e1b50 Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Fri, 14 Feb 2025 21:23:02 +0000 Subject: [PATCH 09/10] remove moe tuning Signed-off-by: Randall Smith --- ...,dtype=fp8_w8a8,block_shape=[128,128].json | 128 ------------------ 1 file changed, 128 deletions(-) delete mode 100644 vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json deleted file mode 100644 index 2b1167fc71e2..000000000000 --- a/vllm/model_executor/layers/fused_moe/configs/E=256,N=256,device_name=AMD_Instinct_MI300XHF_OAM,dtype=fp8_w8a8,block_shape=[128,128].json +++ /dev/null @@ -1,128 +0,0 @@ -{ - "1": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 32, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "2": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 16, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 0 - }, - "4": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "8": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "16": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 0 - }, - "24": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 128, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "32": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 0 - }, - "48": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 0 - }, - "64": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 0 - }, - "96": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "128": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 256, - "GROUP_SIZE_M": 1, - "num_warps": 2, - "num_stages": 2, - "waves_per_eu": 0 - }, - "256": { - "BLOCK_SIZE_M": 16, - "BLOCK_SIZE_N": 64, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 4, - "num_warps": 4, - "num_stages": 2, - "waves_per_eu": 0 - }, - "512": { - "BLOCK_SIZE_M": 32, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 0 - }, - "1024": { - "BLOCK_SIZE_M": 64, - "BLOCK_SIZE_N": 256, - "BLOCK_SIZE_K": 128, - "GROUP_SIZE_M": 8, - "num_warps": 8, - "num_stages": 2, - "waves_per_eu": 0 - } -} From 4fe44788fd8c2636eb7a741ebe5038fd2b46731e Mon Sep 17 00:00:00 2001 From: Randall Smith Date: Fri, 14 Feb 2025 21:52:55 -0600 Subject: [PATCH 10/10] move tunings Signed-off-by: Randall Smith --- ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 ...AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} | 0 24 files changed, 0 insertions(+), 0 deletions(-) rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) rename vllm/model_executor/layers/quantization/utils/configs/{N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json => N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json} (100%) diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=1536,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2048,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=2304,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=24576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=256,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=3072,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=32768,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=36864,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4096,K=512,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=4608,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=512,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=576,K=7168,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1024,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=1152,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=128,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=16384,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=18432,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2048,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=2304,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=256,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=7168,K=8192,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json diff --git a/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json b/vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json similarity index 100% rename from vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X_OAM,dtype=fp8_w8a8,block_shape=[128,128].json rename to vllm/model_executor/layers/quantization/utils/configs/N=8192,K=1536,device_name=AMD_Instinct_MI300X,dtype=fp8_w8a8,block_shape=[128,128].json